Try various regression models for predicting conformer entropy from RDKit fingerprints (ECFP4 and ECFP6)

Parts based on:
https://github.com/dkoes/qsar-tools/blob/master/trainlinearmodel.py

Some parts adapted from Dan Elton
http://moreisdifferent.com/2017/9/21/DIY-Drug-Discovery-using-molecular-fingerprints-and-machine-learning-for-solubility-prediction/
https://github.com/delton137

In [1]:
from __future__ import print_function

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")

import numpy as np
import pandas as pd
import sys
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import *



In [2]:
# Load the per-molecule table of entropies, descriptors and fingerprints.
df = pd.read_csv("total-entropy.csv")
# drop inf and nan (i.e. some molecules from COD don't have Gasteiger charges)
# DataFrame.replace returns a new frame, so the result has to be assigned back;
# otherwise the +/-inf entries are never turned into NaN and survive dropna().
df = df.replace([np.inf, -np.inf], np.nan)
df.dropna(inplace=True)
print(len(df.index))
# 115599 molecules left in the recorded run -- re-check this count now that
# infinite values are filtered out as well.
#  (technically we should check to make sure all the SMILES are unique!)

115599


In [3]:
# Show the available columns, then store the count-like descriptors as ints.
print(df.columns)
df = df.astype(dict.fromkeys(
    [
        "NumAtoms", "NumBonds", "NumRotors", "NumMethyl", "NumAmine",
        "NumHydroxyl", "HDonors", "HAcceptors", "RingCount",
        "NumAromaticRings",
    ],
    int,
))

Index(['Category', 'File', 'SMILES', 'ConfEntropy', 'VibEntropy', 'RotEntropy',
       'TransEntropy', 'NumAtoms', 'NumBonds', 'ExactMolWt', 'Volume',
       'NumRotorsStrict', 'NumRotors', 'NumMethyl', 'NumAmine', 'NumHydroxyl',
       'HDonors', 'HAcceptors', 'RingCount', 'NumAromaticRings',
       'MaxAbsPartialChg', 'MinAbsPartialChg', 'MaxPartialChg',
       'MinPartialChg', 'TPSA', 'LabuteASA', 'MolMR', 'MolLogP', 'EState_VSA1',
       'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5',
       'HallKierAlpha', 'BertzCT', 'BalabanJ', 'Ipc', 'Kappa1', 'Kappa2',
       'Kappa3', 'FractionCSP3', 'NumBridgeheadAtoms', 'NumSpiroAtoms',
       'Asphericity', 'Eccentricity', 'InertialShapeFactor',
       'RadiusOfGyration', 'SpherocityIndex', 'ConfUnder1', 'ConfUnder2',
       'ConfUnder3', 'ConfUnder4', 'ConfUnder5', 'ConfUnder6', 'ECFP4',
       'ECFP6'],
      dtype='object')


In [4]:
def FPBase64ToNumpy( fps ):
    """Decode base64-encoded RDKit bit vectors into NumPy arrays.

    Parameters
    ----------
    fps : iterable of str
        Base64 strings, one per fingerprint.

    Returns
    -------
    list of numpy.ndarray
        One array per input string, in input order (an empty list for
        empty input).
    """
    def _decode(encoded):
        bits = DataStructs.ExplicitBitVect(4096)
        DataStructs.ExplicitBitVect.FromBase64(bits, encoded)
        # Placeholder array; ConvertToNumpyArray is relied on to resize and
        # fill it with the bit values.
        dense = np.zeros( (1,) )
        DataStructs.ConvertToNumpyArray( bits, dense )
        return dense

    return [_decode(encoded) for encoded in fps]

In [5]:
# Features: decoded ECFP6 fingerprints; target: conformational entropy column.
Y = df["ConfEntropy"]
X = FPBase64ToNumpy(df["ECFP6"])

In [6]:
# sometimes we need to do our own CV
def test_model_cv(model, x, y, cv=20):
    """Cross-validate ``model`` on ``(x, y)`` and return the mean fold score.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor.
    x, y : array-like
        Feature matrix and target values.
    cv : int, default 20
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold ``'neg_mean_absolute_error'`` scores, i.e. the
        *negated* MAE. Multiply by -1 to get the MAE, as done for the grid
        searches elsewhere in this notebook.
    """
    # Use the cross_val_score imported from sklearn.model_selection; the
    # `cross_validation` module was never imported here. The scorer name
    # matches the 'neg_mean_absolute_error' used by the other models.
    scores = cross_val_score(model, x, y, cv=cv, n_jobs=1,
                             scoring='neg_mean_absolute_error')
    return scores.mean()

We'll start by evaluating several models that have built-in cross-validation in scikit-learn:
- Ridge regression
- Lasso
- LassoLars
- ElasticNet

In [None]:
# RidgeCV chooses its regularisation strength by cross-validation inside
# fit(), so one fit is enough; the second, identical fit on the same data only
# repeated the work.
RidgeModel = RidgeCV(scoring='neg_mean_absolute_error').fit(X,Y)

# NOTE: this is the MAE on the training data itself, not a held-out estimate.
mean_absolute_error(Y, RidgeModel.predict(X))

In [None]:
# Lasso with 3-fold built-in cross-validation.
# NOTE(review): the name `Lasso` shadows the Lasso class brought in by the
# star import from sklearn.linear_model.
Lasso = LassoCV(cv=3)
Lasso.fit(X, Y)
# MAE of the fitted model on the same data it was trained on.
mean_absolute_error(Y, Lasso.predict(X))

In [None]:
# LARS-based Lasso with 3-fold built-in cross-validation.
# NOTE(review): the name `LassoLars` shadows the LassoLars class brought in by
# the star import from sklearn.linear_model.
LassoLars = LassoLarsCV(cv=3)
LassoLars.fit(X, Y)
# MAE of the fitted model on the same data it was trained on.
mean_absolute_error(Y, LassoLars.predict(X))

In [None]:
# Elastic net with 3-fold built-in cross-validation, run on a single worker.
ElasticModel = ElasticNetCV(cv=3, verbose=1, n_jobs=1)
ElasticModel.fit(X, Y)
# MAE of the fitted model on the same data it was trained on.
mean_absolute_error(Y, ElasticModel.predict(X))

In [11]:
# 3-fold grid search over the number of boosting stages, scored by negated MAE.
GBmodel = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid={"n_estimators": [50, 100, 200]},
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=1,
    verbose=2,
)
GBmodel.fit(X, Y)

Best_GB = GBmodel.best_estimator_
print("Best Gradient Boosted model")
print(GBmodel.best_params_)
# best_score_ is a negated MAE; flip the sign to report the MAE itself.
print(-GBmodel.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................. n_estimators=50, total=28.6min
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 28.6min remaining:    0.0s


[CV] .................................. n_estimators=50, total=28.5min
[CV] n_estimators=50 .................................................
[CV] .................................. n_estimators=50, total=28.4min
[CV] n_estimators=100 ................................................
[CV] ................................. n_estimators=100, total=56.0min
[CV] n_estimators=100 ................................................
[CV] ................................. n_estimators=100, total=59.2min
[CV] n_estimators=100 ................................................
[CV] ................................. n_estimators=100, total=56.5min
[CV] n_estimators=200 ................................................
[CV] ................................ n_estimators=200, total=111.5min
[CV] n_estimators=200 ................................................
[CV] ................................ n_estimators=200, total=113.5min
[CV] n_estimators=200 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 594.4min finished


Best Gradient Boosted model
{'n_estimators': 200}
6.491782656747981


In [None]:
# 3-fold grid search over the number of trees, scored by negated MAE.
RFmodel = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid={"n_estimators": [50, 100, 200]},
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=1,
    verbose=2,
)
RFmodel.fit(X, Y)

Best_RF = RFmodel.best_estimator_
print("Best Random Forest model")
print(RFmodel.best_params_)
# best_score_ is a negated MAE; flip the sign to report the MAE itself.
print(-RFmodel.best_score_)

In [None]:
# 3-fold grid search over alpha, gamma and kernel type for kernel ridge
# regression, scored by negated MAE.
# NOTE(review): kernel methods work on an n_samples x n_samples kernel matrix;
# with the ~115k rows reported above this may be impractical -- confirm that
# this search can actually complete.
KRmodel = GridSearchCV(
    estimator=KernelRidge(),
    param_grid={
        "alpha": np.logspace(-10, 0, 6),
        "gamma": np.logspace(-12, -9, 4),
        "kernel" : ['laplacian', 'rbf'],
    },
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=1,
    verbose=2,
)
KRmodel.fit(X, Y)

Best_KernelRidge = KRmodel.best_estimator_
print("Best Kernel Ridge model")
print(KRmodel.best_params_)
# best_score_ is a negated MAE; flip the sign to report the MAE itself.
print(-KRmodel.best_score_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] alpha=1e-10, gamma=1e-12, kernel=laplacian ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Two log-transformed descriptor columns; the +1 keeps the log defined for
# molecules whose count is zero.
# NOTE(review): this is a list of two column-length entries (feature-major),
# not one row per molecule -- confirm it is transposed before being handed to
# an estimator.
X2 = [ np.log(df.NumRotors + 1), np.log(df.NumMethyl + 1) ]