In [1]:
# install package
!pip install padelpy



In [17]:
# import package
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from padelpy import from_smiles

# 1 Load data

In [2]:
df = pd.read_excel("CNSData(940).xlsx")
df

Unnamed: 0,ID,name,smiles,output
0,1,ABACAVIRSULFATE,Nc1nc(NC2CC2)c3ncn(C4CC(CO)C=C4)c3n1,0
1,2,ACARBOSE,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...,0
2,3,ACEBUTOLOL,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C,0
3,4,ACECAINIDE,CCN(CC)CCNC(=O)c1ccc(NC(=O)C)cc1,0
4,5,ACECLOFENAC,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl,0
...,...,...,...,...
935,936,ZONISAMIDE,c1ccc2c(c1)c(no2)CS(=O)(=O)N,1
936,937,ZOPICLONE,CN1CCN(CC1)C(=O)OC2c3c(nccn3)C(=O)N2c4ccc(cn4)Cl,1
937,938,ZOTEPINE,CN(C)CCOC1=Cc2ccccc2Sc3c1cc(cc3)Cl,1
938,939,ZOTEPINE,CN(C)CCOC1=Cc2cc(ccc2Sc3c1cccc3)Cl,1


# 2 SMILES to Descriptors/Fingerprints

In [3]:
pd.DataFrame(from_smiles("CCC", fingerprints=True, threads=1), index=[1])

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
1,0,-0.1853999999999998,0.0343731599999999,14.709,10.614344,0,0,11,3,8,...,0,0,0,0,0,0,0,0,0,0


In [4]:
desc_list = []
for idx in tqdm(df.index):
    try:
        smiles = df.loc[idx, 'smiles']
        desc = from_smiles(smiles, fingerprints=True, threads=16, timeout=120)
    except KeyboardInterrupt:
        break
    except:
        desc = dict()
    desc_list.append(desc)

df_desc = pd.DataFrame(desc_list)
df_desc

  0%|          | 0/940 [00:00<?, ?it/s]

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,-1.8652999999999995,3.4793440899999983,47.889100000000006,44.04427399999998,9,10,39,21,18,...,0,0,0,0,0,0,0,0,0,0
1,0,-7.226299999999998,52.21941168999998,137.768,88.20809899999992,0,0,87,44,43,...,0,0,0,0,0,0,0,0,0,0
2,0,-2.1004,4.41168016,67.07270000000001,55.758203999999964,6,6,52,24,28,...,0,0,0,0,0,0,0,0,0,0
3,0,-0.5178999999999999,0.2682204099999999,52.610800000000005,46.64023899999998,6,6,43,20,23,...,0,0,0,0,0,0,0,0,0,0
4,1,0.5702999999999996,0.32524208999999954,37.1534,45.49630899999999,12,12,36,23,13,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,0,-1.079,1.1642409999999999,17.4729,26.920343999999986,9,10,22,14,8,...,0,0,0,0,0,0,0,0,0,0
936,0,0.3195999999999998,0.10214415999999986,52.47090000000001,52.44148099999998,12,12,44,27,17,...,0,0,0,0,0,0,0,0,0,0
937,0,1.4427999999999992,2.0816718399999976,48.39030000000001,50.66427399999998,12,12,40,22,18,...,0,0,0,0,0,0,0,0,0,0
938,0,1.4427999999999992,2.0816718399999976,48.39030000000001,50.66427399999998,12,12,40,22,18,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_merge = df.join(df_desc)
df_merge

Unnamed: 0,ID,name,smiles,output,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,ABACAVIRSULFATE,Nc1nc(NC2CC2)c3ncn(C4CC(CO)C=C4)c3n1,0,0,-1.8652999999999995,3.4793440899999983,47.889100000000006,44.04427399999998,9,...,0,0,0,0,0,0,0,0,0,0
1,2,ACARBOSE,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...,0,0,-7.226299999999998,52.21941168999998,137.768,88.20809899999992,0,...,0,0,0,0,0,0,0,0,0,0
2,3,ACEBUTOLOL,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C,0,0,-2.1004,4.41168016,67.07270000000001,55.758203999999964,6,...,0,0,0,0,0,0,0,0,0,0
3,4,ACECAINIDE,CCN(CC)CCNC(=O)c1ccc(NC(=O)C)cc1,0,0,-0.5178999999999999,0.2682204099999999,52.610800000000005,46.64023899999998,6,...,0,0,0,0,0,0,0,0,0,0
4,5,ACECLOFENAC,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl,0,1,0.5702999999999996,0.32524208999999954,37.1534,45.49630899999999,12,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,936,ZONISAMIDE,c1ccc2c(c1)c(no2)CS(=O)(=O)N,1,0,-1.079,1.1642409999999999,17.4729,26.920343999999986,9,...,0,0,0,0,0,0,0,0,0,0
936,937,ZOPICLONE,CN1CCN(CC1)C(=O)OC2c3c(nccn3)C(=O)N2c4ccc(cn4)Cl,1,0,0.3195999999999998,0.10214415999999986,52.47090000000001,52.44148099999998,12,...,0,0,0,0,0,0,0,0,0,0
937,938,ZOTEPINE,CN(C)CCOC1=Cc2ccccc2Sc3c1cc(cc3)Cl,1,0,1.4427999999999992,2.0816718399999976,48.39030000000001,50.66427399999998,12,...,0,0,0,0,0,0,0,0,0,0
938,939,ZOTEPINE,CN(C)CCOC1=Cc2cc(ccc2Sc3c1cccc3)Cl,1,0,1.4427999999999992,2.0816718399999976,48.39030000000001,50.66427399999998,12,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_merge.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 2760 columns):
 #     Column             Non-Null Count  Dtype 
---    ------             --------------  ----- 
 0     ID                 940 non-null    int64 
 1     name               940 non-null    object
 2     smiles             940 non-null    object
 3     output             940 non-null    int64 
 4     nAcid              940 non-null    object
 5     ALogP              940 non-null    object
 6     ALogp2             940 non-null    object
 7     AMR                940 non-null    object
 8     apol               940 non-null    object
 9     naAromAtom         940 non-null    object
 10    nAromBond          940 non-null    object
 11    nAtom              940 non-null    object
 12    nHeavyAtom         940 non-null    object
 13    nH                 940 non-null    object
 14    nB                 940 non-null    object
 15    nC                 940 non-null    object
 16    nN  

In [6]:
df_merge.to_csv('CNSData(940)_descriptors.csv', index=False)

# missing values

In [58]:
# df_clean = df_merge.loc[:, 'nAcid':].applymap(lambda x: np.float16(x), na_action='ignore')
df_clean = df_merge.copy()
for col in df_merge.loc[:, 'nAcid':].columns:
    df_clean[col] = pd.to_numeric(df_merge[col], errors='coerce')
df_clean

Unnamed: 0,ID,name,smiles,output,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,ABACAVIRSULFATE,Nc1nc(NC2CC2)c3ncn(C4CC(CO)C=C4)c3n1,0,0,-1.8653,3.479344,47.8891,44.044274,9,...,0,0,0,0,0,0,0,0,0,0
1,2,ACARBOSE,CC1OC(OC2C(O)C(O)C(OC3C(O)C(O)C(O)OC3CO)OC2CO)...,0,0,-7.2263,52.219412,137.7680,88.208099,0,...,0,0,0,0,0,0,0,0,0,0
2,3,ACEBUTOLOL,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C,0,0,-2.1004,4.411680,67.0727,55.758204,6,...,0,0,0,0,0,0,0,0,0,0
3,4,ACECAINIDE,CCN(CC)CCNC(=O)c1ccc(NC(=O)C)cc1,0,0,-0.5179,0.268220,52.6108,46.640239,6,...,0,0,0,0,0,0,0,0,0,0
4,5,ACECLOFENAC,OC(=O)COC(=O)Cc1ccccc1Nc2c(Cl)cccc2Cl,0,1,0.5703,0.325242,37.1534,45.496309,12,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,936,ZONISAMIDE,c1ccc2c(c1)c(no2)CS(=O)(=O)N,1,0,-1.0790,1.164241,17.4729,26.920344,9,...,0,0,0,0,0,0,0,0,0,0
936,937,ZOPICLONE,CN1CCN(CC1)C(=O)OC2c3c(nccn3)C(=O)N2c4ccc(cn4)Cl,1,0,0.3196,0.102144,52.4709,52.441481,12,...,0,0,0,0,0,0,0,0,0,0
937,938,ZOTEPINE,CN(C)CCOC1=Cc2ccccc2Sc3c1cc(cc3)Cl,1,0,1.4428,2.081672,48.3903,50.664274,12,...,0,0,0,0,0,0,0,0,0,0
938,939,ZOTEPINE,CN(C)CCOC1=Cc2cc(ccc2Sc3c1cccc3)Cl,1,0,1.4428,2.081672,48.3903,50.664274,12,...,0,0,0,0,0,0,0,0,0,0


In [59]:
df_clean.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 2760 columns):
 #     Column             Non-Null Count  Dtype  
---    ------             --------------  -----  
 0     ID                 940 non-null    int64  
 1     name               940 non-null    object 
 2     smiles             940 non-null    object 
 3     output             940 non-null    int64  
 4     nAcid              940 non-null    int64  
 5     ALogP              939 non-null    float64
 6     ALogp2             939 non-null    float64
 7     AMR                939 non-null    float64
 8     apol               940 non-null    float64
 9     naAromAtom         940 non-null    int64  
 10    nAromBond          940 non-null    int64  
 11    nAtom              940 non-null    int64  
 12    nHeavyAtom         940 non-null    int64  
 13    nH                 940 non-null    int64  
 14    nB                 940 non-null    int64  
 15    nC                 940 non-null    i

In [60]:
# drop na columns
df_clean = df_clean.dropna(axis='columns')
df_clean.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 2225 columns):
 #     Column            Non-Null Count  Dtype  
---    ------            --------------  -----  
 0     ID                940 non-null    int64  
 1     name              940 non-null    object 
 2     smiles            940 non-null    object 
 3     output            940 non-null    int64  
 4     nAcid             940 non-null    int64  
 5     apol              940 non-null    float64
 6     naAromAtom        940 non-null    int64  
 7     nAromBond         940 non-null    int64  
 8     nAtom             940 non-null    int64  
 9     nHeavyAtom        940 non-null    int64  
 10    nH                940 non-null    int64  
 11    nB                940 non-null    int64  
 12    nC                940 non-null    int64  
 13    nN                940 non-null    int64  
 14    nO                940 non-null    int64  
 15    nS                940 non-null    int64  
 16    nP  

In [61]:
# drop std0 columns
feat_cols = df_clean.loc[:, 'nAcid':].columns
keep_cols = ['ID', 'name', 'smiles'] + feat_cols[df_clean.loc[:, feat_cols].std(axis=0) != 0].tolist()
df_clean.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 2225 columns):
 #     Column            Non-Null Count  Dtype  
---    ------            --------------  -----  
 0     ID                940 non-null    int64  
 1     name              940 non-null    object 
 2     smiles            940 non-null    object 
 3     output            940 non-null    int64  
 4     nAcid             940 non-null    int64  
 5     apol              940 non-null    float64
 6     naAromAtom        940 non-null    int64  
 7     nAromBond         940 non-null    int64  
 8     nAtom             940 non-null    int64  
 9     nHeavyAtom        940 non-null    int64  
 10    nH                940 non-null    int64  
 11    nB                940 non-null    int64  
 12    nC                940 non-null    int64  
 13    nN                940 non-null    int64  
 14    nO                940 non-null    int64  
 15    nS                940 non-null    int64  
 16    nP  

  sqr = _ensure_numeric((avg - values) ** 2)


In [64]:
clean_feats_columns = df_clean.loc[:, 'nAcid':].columns
print("clean_feats_columns =", clean_feats_columns.tolist())

clean_feats_columns = ['nAcid', 'apol', 'naAromAtom', 'nAromBond', 'nAtom', 'nHeavyAtom', 'nH', 'nB', 'nC', 'nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'nX', 'ATS0m', 'ATS1m', 'ATS2m', 'ATS3m', 'ATS4m', 'ATS5m', 'ATS6m', 'ATS7m', 'ATS8m', 'ATS0v', 'ATS1v', 'ATS2v', 'ATS3v', 'ATS4v', 'ATS5v', 'ATS6v', 'ATS7v', 'ATS8v', 'ATS0e', 'ATS1e', 'ATS2e', 'ATS3e', 'ATS4e', 'ATS5e', 'ATS6e', 'ATS7e', 'ATS8e', 'ATS0p', 'ATS1p', 'ATS2p', 'ATS3p', 'ATS4p', 'ATS5p', 'ATS6p', 'ATS7p', 'ATS8p', 'ATS0i', 'ATS1i', 'ATS2i', 'ATS3i', 'ATS4i', 'ATS5i', 'ATS6i', 'ATS7i', 'ATS8i', 'ATS0s', 'ATS1s', 'ATS2s', 'ATS3s', 'ATS4s', 'ATS5s', 'ATS6s', 'ATS7s', 'ATS8s', 'AATS0m', 'AATS1m', 'AATS2m', 'AATS3m', 'AATS4m', 'AATS5m', 'AATS6m', 'AATS7m', 'AATS8m', 'AATS0v', 'AATS1v', 'AATS2v', 'AATS3v', 'AATS4v', 'AATS5v', 'AATS6v', 'AATS7v', 'AATS8v', 'AATS0e', 'AATS1e', 'AATS2e', 'AATS3e', 'AATS4e', 'AATS5e', 'AATS6e', 'AATS7e', 'AATS8e', 'AATS0p', 'AATS1p', 'AATS2p', 'AATS3p', 'AATS4p', 'AATS5p', 'AATS6p', 'AATS7p', 

In [62]:
df_clean.to_csv('CNSData(940)_descriptors_clean.csv', index=False)