### ToDo
- split into train/validation/test according to paper
- calculate fingerprints & descriptors
- concatenate 
- normalize between 0 and 1

### Imports

In [1]:
import pandas as pd
import numpy as np
#np.set_printoptions(threshold=np.inf)

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator

from statsmodels.distributions.empirical_distribution import ECDF

# `display` and `HTML` belong to the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook's `.container` element to 80 % width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# ToxCast dataset.
# In the recorded run the frame has 8,615 rows (one per molecule) and 618
# columns: an unnamed index column, the SMILES string, and the assay columns.
toxcast = pd.read_csv("datasets/toxcast_data.csv")

print('Shape: ', toxcast.shape)

# Count every cell whose string form is not 'nan'.
# NOTE: the sweep runs over *all* columns, so the index and SMILES columns
# contribute to the reported number as well as the assay measurements.
vals = toxcast.values.flatten()
n_measurements = sum(1 for v in vals if str(v) != 'nan')
print('# meas.: ', n_measurements)

toxcast.head()

Shape:  (8615, 618)
# meas.:  1547010


Unnamed: 0,smiles,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellLoss_24h_dn,APR_HepG2_CellLoss_72h_dn,APR_HepG2_MicrotubuleCSK_24h_dn,APR_HepG2_MicrotubuleCSK_24h_up,...,Tanguay_ZF_120hpf_OTIC_up,Tanguay_ZF_120hpf_PE_up,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up
0,[O-][N+](=O)C1=CC=C(Cl)C=C1,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C,,,,,,,,,,...,,,,,,,,,,
2,CN1CCN(CC1)C(=O)C1CCCCC1,,,,,,,,,,...,,,,,,,,,,
3,NC1=CC=C(C=C1)[N+]([O-])=O,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OC1=CC=C(C=C1)[N+]([O-])=O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature computation

In [3]:
def smile_to_mol(smiles_list):
    """Convert SMILES strings into RDKit molecule objects.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings to parse. Entries that are not strings (for example
        NaN coming from a DataFrame column) are treated as unparsable.

    Returns
    -------
    list
        The objects returned by ``Chem.MolFromSmiles`` for every entry that
        could be parsed, in input order. Unparsable entries are dropped, so
        the result can be shorter than the input and its positions then no
        longer line up with the input rows.

    Warns
    -----
    UserWarning
        When at least one entry was dropped. The message states how many
        entries were skipped and shows the first few of them.
    """
    import warnings  # local import so the function does not rely on extra file-level imports

    molecules = []
    skipped = []

    for smiles in smiles_list:
        # Only strings are handed to the parser; anything else counts as a
        # failed parse instead of raising inside RDKit.
        molecule = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if molecule is None:
            skipped.append(smiles)
            continue
        molecules.append(molecule)

    if skipped:
        total = len(molecules) + len(skipped)
        warnings.warn(
            f"smile_to_mol: skipped {len(skipped)} of {total} entries that could "
            f"not be parsed (first few: {skipped[:5]!r})",
            stacklevel=2,
        )

    return molecules

In [7]:
def generate_ecfps(molecules):
    """Compute a Morgan count fingerprint for every molecule.

    Parameters
    ----------
    molecules : iterable
        Molecule objects accepted by ``rdFingerprintGenerator.GetCountFPs``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-molecule fingerprint arrays, in input
        order.
    """

    def _fingerprint(mol):
        # Fingerprint a single molecule and take the only entry of the result.
        sparse_counts = rdFingerprintGenerator.GetCountFPs(
            [mol], fpType=rdFingerprintGenerator.MorganFP
        )[0]
        # ConvertToNumpyArray fills the array it is given, so hand it an
        # empty int8 buffer as the target.
        dense_counts = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(sparse_counts, dense_counts)
        return dense_counts

    return np.array([_fingerprint(mol) for mol in molecules])
#ecfps = generate_ecfps(molecules)

In [6]:
def calc_descriptors(molecules, descriptor_ids):
    """Compute a selection of RDKit descriptors for every molecule.

    Every ``(name, function)`` pair in ``Descriptors._descList`` is evaluated
    for each molecule; the values at the positions given by
    ``descriptor_ids`` are kept.

    Parameters
    ----------
    molecules : iterable
        Molecule objects accepted by the RDKit descriptor functions.
    descriptor_ids : sequence of int (or any valid numpy index)
        Positions in ``Descriptors._descList`` of the descriptors to keep.

    Returns
    -------
    numpy.ndarray
        One row per molecule and one column per selected descriptor. A
        descriptor whose calculation raised is stored as ``NaN``.

    Warns
    -----
    UserWarning
        When at least one descriptor calculation failed; the message lists
        the names of the affected descriptors.
    """
    import warnings  # local import so the function does not rely on extra file-level imports

    # Materialise the descriptor table once instead of once per molecule.
    descriptor_table = list(Descriptors._descList)
    failed_names = set()
    rdkit_descriptors = []

    for molecule in molecules:
        descrs = np.empty(len(descriptor_table), dtype=float)
        for position, (name, descr_calc_fn) in enumerate(descriptor_table):
            try:
                descrs[position] = descr_calc_fn(molecule)
            except Exception:
                # Keep the slot (as NaN) so every later descriptor stays at
                # its `_descList` position; skipping it would shift the
                # vector and make `descriptor_ids` pick the wrong values.
                descrs[position] = np.nan
                failed_names.add(name)

        rdkit_descriptors.append(descrs[descriptor_ids])

    if failed_names:
        warnings.warn(
            "calc_descriptors: failed to compute "
            f"{sorted(failed_names)!r} for at least one molecule; "
            "the affected values are NaN",
            stacklevel=2,
        )

    return np.array(rdkit_descriptors)
#rdkit_descriptors = calc_descriptors(molecules, real_200_descr)

In [8]:
def calc_descriptors_quantils(rdkit_descriptors, reference_descriptors=None):
    """Replace every descriptor value by its empirical-CDF quantile.

    For each column an ``ECDF`` is fitted on the reference values and then
    evaluated at the values of ``rdkit_descriptors``.

    Parameters
    ----------
    rdkit_descriptors : array-like, 2-D
        Descriptor matrix to transform (rows = molecules, columns =
        descriptors).
    reference_descriptors : array-like, 2-D, optional
        Descriptor matrix the ECDFs are fitted on, e.g. the training set.
        Must have the same number of columns as ``rdkit_descriptors``.
        Defaults to ``rdkit_descriptors`` itself.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``rdkit_descriptors`` holding the ECDF
        values.

    Raises
    ------
    ValueError
        If the two matrices have a different number of columns.
    """
    values = np.asarray(rdkit_descriptors)
    reference = values if reference_descriptors is None else np.asarray(reference_descriptors)

    if reference.shape[1] != values.shape[1]:
        raise ValueError(
            "reference_descriptors has %d columns but rdkit_descriptors has %d"
            % (reference.shape[1], values.shape[1])
        )

    # Always allocate a float result: `zeros_like` would inherit an integer
    # dtype from the input and truncate the quantiles on assignment.
    rdkit_descriptors_quantils = np.zeros(values.shape, dtype=float)

    for column in range(values.shape[1]):
        ecdf = ECDF(reference[:, column])
        rdkit_descriptors_quantils[:, column] = ecdf(values[:, column])

    return rdkit_descriptors_quantils
#rdkit_descriptors_quantils = calc_descriptors_quantils(rdkit_descriptors)

In [9]:
# Three example molecules (SMILES) used to try out the feature pipeline.
example_molecules = [
    'CC1CC2C3CCC4=CC(=O)C=CC4([C@]3(C(CC2([C@]1(C(=O)CCl)O)C)O)F)C',
    'CCCCCCOC(=O)N=C(C1=CC=C(C=C1)NCC2=NC3=C(N2C)C=CC(=C3)C(=O)N(CCC(=O)OCC)C4=CC=CC=N4)N',
    'CSCCC(C(=O)NCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CCSC)C(=O)NC(CC(=O)O)C(=O)NC(CC3=CC=CC=C3)C(=O)N)NC(=O)C(CC4=CC=C(C=C4)OS(=O)(=O)O)NC(=O)C(CC(=O)O)N',
]

# Positions in `Descriptors._descList` of the 200 descriptors to keep:
# 0-16 and 25-207 (positions 17-24 are left out).
real_200_descr = [*range(0, 17), *range(25, 208)]

# To run on the whole dataset instead of the examples:
#example_molecules = list(toxcast["smiles"])

# SMILES -> molecules -> fingerprints / descriptors -> descriptor quantiles
molecules = smile_to_mol(example_molecules)
ecfps = generate_ecfps(molecules)
rdkit_descriptors = calc_descriptors(molecules, real_200_descr)
rdkit_descriptors_quantils = calc_descriptors_quantils(rdkit_descriptors)

In [10]:
# Preview the first five descriptor columns: the raw values, the matching
# quantiles underneath, then an empty separator line.
np.set_printoptions(threshold=10)
for column in range(5):
    raw_column = rdkit_descriptors[:, column]
    quantile_column = rdkit_descriptors_quantils[:, column]
    print(raw_column, quantile_column, "", sep="\n")

[16.8823829  13.62121553 14.3419037 ]
[1.         0.33333333 0.66666667]

[-1.95095014 -0.69072078 -4.87391061]
[0.66666667 1.         0.33333333]

[16.8823829  13.62121553 14.3419037 ]
[1.         0.33333333 0.66666667]

[0.01508732 0.04554354 0.01318461]
[0.66666667 1.         0.33333333]

[0.68564656 0.07414882 0.02458486]
[1.         0.66666667 0.33333333]



In [None]:
# report quantiles for the RDKit descriptors
# descriptors_raw_forECDF = RDKit descriptors of the training set

In [None]:
# standardize features