## **Importing Libraries**

In [None]:
%%capture
!pip install mordred
!pip install rdkit


In [None]:
# Importing Libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Allow up to 2000 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 2000)
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

In [None]:
# Use seaborn's 'whitegrid' style for the plots drawn below.
sns.set(style='whitegrid')

## **Generating Descriptors from SMILES - Mordred**

YouTube videos on generating molecular descriptors from SMILES:

> https://youtu.be/EeOCGvy2pmQ


> https://youtu.be/Sgh-qJVYI5Q





In [None]:
# Load the raw dataset; the cells below read its 'SMILES' column.
data = pd.read_csv('delaney.csv')
data.head()

In [None]:
# Build one RDKit molecule per SMILES string, with explicit hydrogens and an
# embedded 3D conformer, and store the molecules in a 'Mol' column.
mol_list = []

for smile in data['SMILES']:
  mol = Chem.MolFromSmiles(smile)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending string
    # instead of failing later inside AddHs.
    raise ValueError(f"Could not parse SMILES: {smile!r}")
  mol = Chem.AddHs(mol)
  # Fixed seed so the generated conformer is the same on every run.
  # EmbedMolecule returns -1 when no conformer could be generated; such
  # molecules are still kept, only without 3D coordinates.
  AllChem.EmbedMolecule(mol, randomSeed=42)
  mol_list.append(mol)

# assign() overwrites an existing 'Mol' column, so re-running this cell does not
# create a duplicate column.
data = data.assign(Mol=mol_list)

In [None]:
data.head()

In [None]:
# Pick the molecule stored under row label 54 as an example.
mol = data.loc[54, 'Mol']

In [None]:
# Render the selected molecule as an image; the bare `img` displays it.
img = Draw.MolToImage(mol)
img

In [None]:
# Creating a descriptor calculator with all descriptors
# (ignore_3D=False, so 3D descriptors are not skipped).
calc = Calculator(descriptors, ignore_3D=False)

# Run the calculator over the 'Mol' column; the result is later joined
# column-wise with the identifier columns, i.e. one row per molecule.
all_desc = calc.pandas(data['Mol'])

In [None]:
all_desc.head()

In [None]:
all_desc.shape

In [None]:
data.head()

In [None]:
# Identifier and target columns that are kept next to the descriptors.
df_index = data.loc[
    :,
    ['Compound ID', 'SMILES', 'measured log(solubility:mol/L)'],
]

In [None]:
# Place the identifier/target columns to the left of the descriptor table.
df = pd.concat(objs=[df_index, all_desc], axis='columns')
df.head()

In [None]:
# Save the combined table; the row index is not written to the file.
df.to_excel('delaney_mordred.xlsx', index=False)

## **Loading the Dataset**

In [None]:
# Reload the descriptor table saved as 'delaney_mordred.xlsx'.
df = pd.read_excel('delaney_mordred.xlsx')

## **Data Preprocessing**



1.   Remove columns with missing or non-numerical values
2.   Remove constant columns
3.   Remove highly correlated columns





In [None]:
df.head()

In [None]:
# Skip the first three columns and keep everything from the fourth column on.
# NOTE(review): presumably the three skipped columns are Compound ID, SMILES and
# the measured log solubility - confirm against the saved file.
data = df.iloc[:,3:]

In [None]:
data.head()

In [None]:
# Total number of missing cells over all columns of `data`.
data.isnull().sum().sum()

In [None]:
# Split the columns by dtype: 'bool' columns go to column_bool, 'object'
# columns are ignored, and every remaining column goes to column_num.
column_bool = []
column_num = []
for column, column_type in data.dtypes.items():
  if column_type == 'bool':
      column_bool.append(column)
  elif column_type != 'object':
      column_num.append(column)

In [None]:
len(column_num)

In [None]:
column_bool

In [None]:
# Distinct values present in the 'GhoseFilter' column.
data['GhoseFilter'].unique()

In [None]:
# Cast the GhoseFilter flags to integers and plot how often each value occurs.
gf = data['GhoseFilter'].astype(int)
gf.value_counts().plot.bar()


In [None]:
# Keep the numeric columns followed by the boolean ones; 'object' columns are left out.
data = data.loc[:, column_num + column_bool]

In [None]:
data.shape

In [None]:
def remove_constant_values(data):
    """Return the names of the columns of `data` that carry no information.

    Despite its name, this function does not remove anything: it only lists
    the columns to drop, in their original order.

    A column is listed when it has at most one distinct non-missing value.
    This covers both constant columns and columns that are entirely missing,
    for which ``nunique()`` is 0 rather than 1.
    """
    return [e for e in data.columns if data[e].nunique() <= 1]

# List the constant columns, then keep every other column in its original order.
drop_col = remove_constant_values(data)
new_df_columns = [name for name in data.columns if name not in drop_col]
new_df = data.loc[:, new_df_columns]
new_df

In [None]:
len(drop_col)

In [None]:

# To calculate the correlation matrix and find highly correlated columns
def correlation(dataset, threshold):
    """Return the set of column names that are highly correlated with an earlier column.

    A column is included when the absolute value of its correlation (as
    computed by ``dataset.corr()``) with any column positioned before it in
    the correlation matrix is strictly greater than `threshold`. Undefined
    (NaN) correlations never exceed the threshold.
    """
    corr_matrix = dataset.corr()
    # we are interested in the absolute coefficient value
    exceeds = corr_matrix.abs().to_numpy() > threshold
    # Keep only the strict lower triangle, i.e. pairs (i, j) with j < i, so that
    # each column is compared only with the columns that precede it.
    exceeds = np.tril(exceeds, k=-1)
    return set(corr_matrix.columns[exceeds.any(axis=1)])

In [None]:
# Dropping highly correlated features (absolute correlation above 0.80)
corr_features = correlation(new_df, 0.80)
print("No. of features to drop : ",len(set(corr_features)))

# Reassign instead of dropping in place: new_df was obtained by selecting
# columns from another frame, and in-place edits on such a frame are a
# chained-assignment hazard.
new_df = new_df.drop(columns=list(corr_features))

In [None]:
new_df.shape

In [None]:
new_df.head()

In [None]:
# Add the Lipinski and GhoseFilter flags from df as integer columns. Any
# existing column of the same name is overwritten, so both flags are present
# even if the filtering above removed them. assign() returns a new frame
# rather than writing into one derived from another DataFrame.
new_df = new_df.assign(
    Lipinski=df["Lipinski"].astype(int),
    GhoseFilter=df["GhoseFilter"].astype(int),
)

In [None]:
new_df.head()

In [None]:
# Put the first three columns of df back in front of the filtered descriptors.
df_final = pd.concat(objs=[df.iloc[:, :3], new_df], axis='columns')
df_final.head()

In [None]:
# Save the reduced table; the row index is not written to the file.
df_final.to_csv('delaney_mordred_truncated.csv', index=False)

## **Data Analysis**

In [None]:
# Summary statistics of the measured log solubility column.
df_final['measured log(solubility:mol/L)'].describe()

In [None]:
# Histogram of the measured log solubility, drawn on an explicit Axes and
# labelled so that the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_final['measured log(solubility:mol/L)'])
ax.set_xlabel('measured log(solubility:mol/L)', weight='bold')
ax.set_ylabel('Number of compounds', weight='bold')

In [None]:
# Correlation matrix of the numeric (and boolean) columns only. df_final also
# holds the string columns 'Compound ID' and 'SMILES', and DataFrame.corr()
# raises on non-numeric columns from pandas 2.0 onwards.
corr = df_final.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Rank the columns by absolute correlation with the measured log solubility and
# keep positions 1-4 of the ranking (position 0 is skipped).
corr_sorted = (
    corr[['measured log(solubility:mol/L)']]
    .abs()
    .sort_values(by='measured log(solubility:mol/L)', ascending=False)
    .iloc[1:5, :]
    .rename(columns={'measured log(solubility:mol/L)': 'correlation_coef'})
)
corr_sorted

In [None]:
# Bar chart of the correlation coefficients stored in corr_sorted.
fig = plt.figure(num=1, figsize=(6, 6))
ax1 = fig.add_subplot(1, 1, 1)
ax1.bar(corr_sorted.index, corr_sorted['correlation_coef'], color='green')
ax1.set_xlabel('Top Correlated Descriptors', weight='bold')
ax1.set_ylabel('Correlation Coefficient', weight='bold')

In [None]:
# Scatter four descriptors against the measured log solubility on a 2x2 grid,
# one panel per descriptor.
# NOTE(review): the descriptor names are hard-coded - confirm they still match
# corr_sorted.index after the pipeline is re-run.
fig = plt.figure(4, figsize=(10,10))
for position, descriptor in enumerate(['FilterItLogS', 'PEOE_VSA6', 'RNCG', 'ABC'], start=1):
    ax = fig.add_subplot(2, 2, position)
    ax.scatter(x = df_final['measured log(solubility:mol/L)'], y = df_final[descriptor], color = 'green')
    ax.set_xlabel('Log Solubilities', weight='bold')
    ax.set_ylabel(descriptor, weight='bold')
plt.tight_layout()

In [None]:
df_final.head()