# Homework 1: Data Preparation and Feature Engineering

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit import Chem 
from rdkit.Chem import AllChem
import mordred
from mordred import Calculator, descriptors

**Question 1:** implement the following function.<br>
*Input*: a list of mordred features.<br>
*Output*: a mordred calculator

In [2]:
def register(features):
    calc = mordred.Calculator() 
    for feature in features:
        calc.register(feature)
    return calc

The following example should work, but it won't do anything interesting.

In [3]:
#register([mordred.AtomCount.AtomCount('C')])

**Question 2:** implement the following function.<br>
*Input*: A SMILES string<br>
*Output*: An RdKit molecule 

In [4]:
def smiles_to_mol(smiles):
    m = Chem.MolFromSmiles(smiles)
    m = Chem.AddHs(m)
    AllChem.EmbedMolecule(m)
    AllChem.MMFFOptimizeMolecule(m)
    return m

The following example should work and plot CH4

In [5]:
#smiles_to_mol('C')

**Question 3:** implement the following function.<br>
*Input*: pandas dataframe, list of features, a smiles_to_mol function <br>
*Output* a new pandas dataframe with features: 

In [10]:
def featurize(df,features,smiles_to_mol):
        
    calc = register(features)
    df_res = df['SMILES'].apply(smiles_to_mol)
    df_res = calc.pandas(df_res)
    df_res['Solubility'] = df['Solubility']
    
    return df_res

The following example should work and display part of a table with some features. First, let's load some data.

In [11]:
test_data = pd.read_csv('../../Data/Solubility/dataset-E.csv')

In [12]:
test_features = featurize(test_data,[mordred.AtomCount.AtomCount('C')],smiles_to_mol)

100%|██████████| 1291/1291 [00:02<00:00, 514.76it/s]


In [13]:
test_features

Unnamed: 0,nC,Solubility
0,5,-3.18
1,5,-2.64
2,6,-3.84
3,6,-3.74
4,6,-3.55
...,...,...
1286,10,-3.37
1287,9,-5.49
1288,20,-2.47
1289,14,-7.15


**Question 4:** implement the following function.<br>
*Input*: pandas dataframe, number of features to display <br>
*Output:* list of n most correlated features, list of n most anticorrelated features: 

In [18]:
def correlates(fdf,n):
    corr = fdf.corrwith(fdf['Solubility'])
    print(corr.nlargest(n))
    print(corr.nsmallest(n))

The following example should work and return two lists of features with their correlation to Solubility.

In [19]:
correlates(test_features,3)

Solubility    1.000000
nC           -0.591717
dtype: float64
nC           -0.591717
Solubility    1.000000
dtype: float64


**Question 5:** implement the following function.<br>
*Input*: pandas dataframe, single feature<br>
*Output*: none, the function should plot the solubility as a function of the value of the feature

In [13]:
def plot_feature(fdf,feature):
    ldf = pd.DataFrame()
    ldf[feature] = fdf[feature]
    ldf['Solubility'] = fdf['Solubility']
    ldf.sort_values(feature)
    plt.scatter(ldf[feature],ldf['Solubility'])

In [14]:
#plot_feature(test_features,'nC')

**Question 6:** implement the following function.<br>
*Input*: path to dataset, list of features, smiles_to_mol function<br>
*Output*: none, the function should dump two csv files, one for the features, one for the solubility

In [15]:
def prepare_dataset(dataset,features,smiles_to_mol,path):
    df = pd.read_csv(dataset)
    df_f = featurize(df,features,smiles_to_mol)
    df['Solubility'].to_csv(path+"solubility.csv")
    df_f.to_csv(path+"features.csv")

The following example should work and create two new files on your directory.

In [16]:
#prepare_dataset('../../Data/Solubility/dataset-E.csv',[mordred.AtomCount.AtomCount('C')],smiles_to_mol)

In [17]:
features = [
    mordred.AtomCount.AtomCount("X"),
    mordred.AtomCount.AtomCount("HeavyAtom"),
    mordred.Aromatic.AromaticAtomsCount,
    
    mordred.HydrogenBond.HBondAcceptor,
    mordred.HydrogenBond.HBondDonor,
    mordred.RotatableBond.RotatableBondsCount, 
    mordred.BondCount.BondCount("any", False),
    mordred.Aromatic.AromaticBondsCount,
    mordred.BondCount.BondCount("heavy", False),      
    mordred.BondCount.BondCount("single", False),
    mordred.BondCount.BondCount("double", False),
    mordred.BondCount.BondCount("triple", False)    
]

In [None]:
train='./train/'
test='./test/'
prepare_dataset('../../Data/Solubility/dataset-F.csv',features,smiles_to_mol, train)
prepare_dataset('../../Data/Solubility/dataset-E.csv',features,smiles_to_mol, test)

## Towards the milestone project

The functions you have implemented will play a key role for the milestone project. First, you should use them to explore the data and select good features. Note that the following command will let you play with all of the available features, ignoring the 3D ones. 

In [148]:
Calculator(descriptors, ignore_3D=True)

<mordred._base.calculator.Calculator at 0x7fc8d1540dd0>

Second, once you have selected your features, you will run the prepare_dataset function to create the data that you will use for the upcoming homeworks. 