# Homework 1: Data Preparation and Feature Selection

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
import mordred
from mordred import Calculator, descriptors

In [2]:
def register(features):
    """Create a mordred calculator containing the requested descriptors.

    Parameters
    ----------
    features : iterable
        Descriptors to add; each one is registered individually, in
        iteration order.

    Returns
    -------
    The newly created ``mordred.Calculator`` with every entry of
    ``features`` registered on it.
    """
    calculator = mordred.Calculator()
    for descriptor in features:
        calculator.register(descriptor)
    return calculator

In [3]:
def smiles_to_mol(smiles, random_seed=42):
    """Turn a SMILES string into an RDKit molecule with hydrogens and 3D coordinates.

    The molecule is parsed, explicit hydrogens are added, a conformer is
    embedded and then relaxed with the MMFF force field.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    random_seed : int, optional
        Seed handed to the conformer embedding so repeated runs produce the
        same geometry. Defaults to 42.

    Returns
    -------
    The RDKit molecule (with hydrogens and one embedded conformer).

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed, or no conformer could be
        embedded even after retrying from random starting coordinates.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of raising.
    if mol is None:
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))

    mol = Chem.AddHs(mol)

    # EmbedMolecule returns -1 when it fails to produce a conformer; retry once
    # from random coordinates before giving up.
    conf_id = AllChem.EmbedMolecule(mol, randomSeed=random_seed)
    if conf_id == -1:
        conf_id = AllChem.EmbedMolecule(
            mol, randomSeed=random_seed, useRandomCoords=True
        )
    if conf_id == -1:
        raise ValueError(
            'Could not embed a 3D conformer for SMILES: {!r}'.format(smiles)
        )

    AllChem.MMFFOptimizeMolecule(mol)
    return mol

In [4]:
def featurize(df,features,smiles_to_mol):
    """Compute a descriptor table for the molecules listed in ``df['SMILES']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``'SMILES'`` column.
    features : iterable
        Descriptors passed to ``register`` to build the calculator.
    smiles_to_mol : callable
        Function applied to each SMILES value to obtain the molecule object
        handed to the calculator.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the calculator's ``pandas`` method, re-indexed
        to ``df.index`` so it aligns row-for-row with ``df``.
    """
    calc = register(features)
    mols = df['SMILES'].apply(smiles_to_mol)
    df_res = calc.pandas(mols)

    # The calculator only sees the molecules, not df's index. Re-attach the
    # index so index-aligned operations against df (e.g. corrwith on
    # df['Solubility']) pair up the right rows even for a non-default index.
    df_res.index = df.index

    return df_res

In [5]:
def correlates(fdf, target=None, n=10):
    """Print the features most positively and most negatively correlated with a target.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Feature table; each column is correlated against ``target``.
    target : pandas.Series, optional
        Values to correlate against (aligned with ``fdf`` by index). When
        omitted, the notebook-level ``df['Solubility']`` is used, which is the
        original behaviour of this function.
    n : int, optional
        How many entries to show at each end of the ranking. Defaults to 10.

    Returns
    -------
    None. The ``n`` largest correlations are printed first, then the ``n``
    smallest.
    """
    if target is None:
        # Backward-compatible fallback: relies on the global `df` loaded
        # elsewhere in the notebook.
        target = df['Solubility']

    corr = fdf.corrwith(target)
    print(corr.nlargest(n))
    print(corr.nsmallest(n))

In [6]:
def plot_feature(fdf,feature,target=None):
    """Scatter-plot one feature column against solubility on the current axes.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Feature table containing the column ``feature``.
    feature : str
        Name of the column to put on the x axis.
    target : array-like, optional
        Solubility values for the y axis, in the same row order as ``fdf``.
        When omitted, ``fdf['Solubility']`` is used.

    Raises
    ------
    KeyError
        If ``target`` is not given and ``fdf`` has no ``'Solubility'`` column.
    """
    if target is None:
        if 'Solubility' not in fdf:
            raise KeyError(
                "fdf has no 'Solubility' column; pass the solubility values "
                "via the `target` argument"
            )
        target = fdf['Solubility']

    # The original copied both columns into a scratch frame and called
    # sort_values without keeping the result; point order does not matter
    # for a scatter plot, so plot the columns directly.
    plt.scatter(fdf[feature], target)
    plt.xlabel(feature)
    plt.ylabel('Solubility')

In [7]:
def prepare_dataset(dataset,features,smiles_to_mol,data_dir='../../Data/Solubility'):
    """Featurize one solubility dataset and write its X / y tables to disk.

    Reads ``<data_dir>/dataset-<dataset>.csv``, computes the descriptor table
    with ``featurize`` and writes two CSV files (without the index) next to
    the input: ``<dataset>_y`` holding the ``'Solubility'`` column and
    ``<dataset>_X`` holding the descriptors.

    Parameters
    ----------
    dataset : str
        Dataset label used in the input and output file names (e.g. ``'E'``).
    features : iterable
        Descriptors forwarded to ``featurize``.
    smiles_to_mol : callable
        SMILES-to-molecule converter forwarded to ``featurize``.
    data_dir : str, optional
        Directory containing the input CSV and receiving the outputs.
        Defaults to the location the notebook originally hard-coded.
    """
    base = str(data_dir) + '/'

    # Named dataset_df (not df) so it cannot be confused with the
    # notebook-level `df` that other cells rely on.
    dataset_df = pd.read_csv(base + 'dataset-' + dataset + '.csv')
    descriptors_df = featurize(dataset_df,features,smiles_to_mol)

    dataset_df['Solubility'].to_csv(base + dataset + '_y',index=False)
    descriptors_df.to_csv(base + dataset + '_X',index=False)

In [8]:
# Load dataset E into the notebook-level `df`; correlates() reads
# df['Solubility'] from this global.
df = pd.read_csv('../../Data/Solubility/dataset-E.csv')

In [9]:
# Descriptors handed to register()/featurize(). Order is kept as-is because it
# determines the order in which they are registered on the calculator.
features = [
    # Atom counts
    mordred.AtomCount.AtomCount("X"),
    mordred.AtomCount.AtomCount("HeavyAtom"),
    mordred.Aromatic.AromaticAtomsCount,
    # Hydrogen-bond acceptors / donors and rotatable bonds
    mordred.HydrogenBond.HBondAcceptor,
    mordred.HydrogenBond.HBondDonor,
    mordred.RotatableBond.RotatableBondsCount,
    # Bond counts; the second argument (False) is presumably mordred's
    # `kekulize` flag -- confirm against the mordred documentation.
    mordred.BondCount.BondCount("any", False),
    mordred.Aromatic.AromaticBondsCount,
    *(
        mordred.BondCount.BondCount(bond_type, False)
        for bond_type in ("heavy", "single", "double", "triple")
    ),
]

In [10]:
# Featurize dataset E: convert df['SMILES'] to molecules with smiles_to_mol and
# compute the descriptors listed in `features`.
fdf = featurize(df,features,smiles_to_mol) 

100%|██████████| 1291/1291 [00:01<00:00, 872.39it/s] 


In [11]:
# Print the 10 largest and the 10 smallest correlations between each descriptor
# column and df['Solubility'].
correlates(fdf)

nHBDon       0.221907
nHBAcc       0.089930
nBondsD     -0.003289
nBondsT     -0.024682
nRot        -0.156493
nBondsS     -0.238614
nBonds      -0.434277
nX          -0.504571
nAromAtom   -0.518627
nAromBond   -0.521050
dtype: float64
nBondsO      -0.572487
nHeavyAtom   -0.565610
nAromBond    -0.521050
nAromAtom    -0.518627
nX           -0.504571
nBonds       -0.434277
nBondsS      -0.238614
nRot         -0.156493
nBondsT      -0.024682
nBondsD      -0.003289
dtype: float64


In [13]:
# Disabled: plot_feature reads fdf['Solubility'], but featurize() does not add that column to fdf.
#plot_feature(fdf,'nHeavyAtom')

In [14]:
# Re-read dataset E from its CSV, featurize it, and write the descriptor table
# ('E_X') and the solubility column ('E_y') to the data directory.
prepare_dataset('E',features,smiles_to_mol)

100%|██████████| 1291/1291 [00:01<00:00, 1071.18it/s]


In [15]:
# Same pipeline for dataset F: writes 'F_X' (descriptors) and 'F_y' (solubility).
prepare_dataset('F',features,smiles_to_mol)

100%|██████████| 1210/1210 [00:01<00:00, 938.68it/s] 
