In [1]:
import numpy as np
from scipy.spatial.distance import pdist, cdist,squareform
from scipy.stats import pearsonr
import pandas as pd
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import *


In [2]:
# ECFP6
def FPBase64ToNumpy(fps, n_bits=4096):
    """Decode base64-serialised RDKit bit vectors into NumPy arrays.

    Parameters
    ----------
    fps : iterable of str
        Base64 strings; each one is loaded into an ``ExplicitBitVect`` via
        ``FromBase64``.
    n_bits : int, optional
        Length used to construct each ``ExplicitBitVect`` before decoding.
        Defaults to 4096, the value that used to be hard-coded here.

    Returns
    -------
    list of numpy.ndarray
        One array per input string, in input order. An empty input yields
        an empty list.
    """
    X = []
    for item in fps:
        bv = DataStructs.ExplicitBitVect(n_bits)
        bv.FromBase64(item)
        # Placeholder array handed to RDKit, which fills it in place
        # (see the RDKit ConvertToNumpyArray documentation for resizing).
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bv, arr)
        X.append(arr)
    return X

In [3]:
# Read the three CSV tables used for the ECFP6 experiments: training set,
# hold-out set and peptide set (paths are relative to the notebook).
train_ecfp, test_ecfp, peptide_ecfp = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/holdout.csv",
        "../data/peptide.csv",
    )
)

In [6]:
# Features: decode the base64 "ECFP6" column of each table into arrays.
trainX_ecfp = FPBase64ToNumpy(train_ecfp["ECFP6"])
testX_ecfp = FPBase64ToNumpy(test_ecfp["ECFP6"])
peptideX_ecfp = FPBase64ToNumpy(peptide_ecfp["ECFP6"])

# Targets: the "ConfEntropy" column of the same tables.
trainY_ecfp = train_ecfp["ConfEntropy"]
testY_ecfp = test_ecfp["ConfEntropy"]
peptideY_ecfp = peptide_ecfp["ConfEntropy"]

In [7]:
# Candidate regularisation strengths shared by both linear models below.
alpha_grid_ecfp = (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0)

# LASSO with 5-fold cross-validation over the alpha grid.
Lasso_ecfp = LassoCV(
    alphas=alpha_grid_ecfp,
    cv=5,
).fit(trainX_ecfp, trainY_ecfp)

# Ridge with 5-fold cross-validation, scored by negative mean absolute error.
Ridge_ecfp = RidgeCV(
    alphas=alpha_grid_ecfp,
    cv=5,
    scoring='neg_mean_absolute_error',
).fit(trainX_ecfp, trainY_ecfp)

In [13]:
# Predict with both fitted ECFP6 models on the hold-out and peptide features.
lassotest_ecfp, lassopeptide_ecfp = (
    Lasso_ecfp.predict(testX_ecfp),
    Lasso_ecfp.predict(peptideX_ecfp),
)
ridgetest_ecfp, ridgepeptide_ecfp = (
    Ridge_ecfp.predict(testX_ecfp),
    Ridge_ecfp.predict(peptideX_ecfp),
)

In [16]:
# Report, for each ECFP6 model, the mean absolute error and the squared
# Pearson correlation on the hold-out ("ZINC") and peptide sets.
ecfp_reports = (
    ("LASSO MAE (J/mol K)", lassotest_ecfp, lassopeptide_ecfp),
    ("Ridge MAE (J/mol K)", ridgetest_ecfp, ridgepeptide_ecfp),
)
for ecfp_header, ecfp_zinc_pred, ecfp_pep_pred in ecfp_reports:
    print(ecfp_header)
    print("ZINC, Peptide")
    print(
        mean_absolute_error(testY_ecfp, ecfp_zinc_pred),
        mean_absolute_error(peptideY_ecfp, ecfp_pep_pred),
    )

    print(r"Pearson $R^{2}$")
    print("ZINC, Peptide")
    # pearsonr returns (r, p-value); only r is used and then squared.
    ecfp_zinc_r = pearsonr(testY_ecfp, ecfp_zinc_pred)[0]
    ecfp_pep_r = pearsonr(peptideY_ecfp, ecfp_pep_pred)[0]
    print(np.square(ecfp_zinc_r), np.square(ecfp_pep_r))

LASSO MAE (J/mol K)
ZINC, Peptide
5.471984243083529 6.763728716926186
Pearson $R^{2}$
ZINC, Peptide
0.6614825006878227 0.5246553127365389
Ridge MAE (J/mol K)
ZINC, Peptide
5.290579668346901 5.826044439460102
Pearson $R^{2}$
ZINC, Peptide
0.6716639568099717 0.4865169887243126


In [17]:
# CDDD descriptors: read the three zipped CSV tables.
train_cddd, test_cddd, peptide_cddd = (
    pd.read_csv(archive_path, compression='zip')
    for archive_path in (
        "../data/train_cddd.zip",
        "../data/zinc_test_cddd.zip",
        "../data/peptide_cddd.zip",
    )
)

# Features are every column from the third onward; the target is the
# "ConfEntropy" column.
# NOTE(review): presumably the first two columns are an identifier and the
# target itself -- confirm against the CSV header.
trainX_cddd, trainY_cddd = train_cddd.iloc[:, 2:], train_cddd["ConfEntropy"]
testX_cddd, testY_cddd = test_cddd.iloc[:, 2:], test_cddd["ConfEntropy"]
pepX_cddd, pepY_cddd = peptide_cddd.iloc[:, 2:], peptide_cddd["ConfEntropy"]

In [19]:
# LASSO and Ridge on the CDDD features, each tuned by 5-fold cross-validation
# over the same grid of regularisation strengths.
alpha_grid_cddd = (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0)

Lasso_cddd = LassoCV(
    alphas=alpha_grid_cddd,
    cv=5,
).fit(trainX_cddd, trainY_cddd)

# Ridge candidates are scored by negative mean absolute error.
Ridge_cddd = RidgeCV(
    alphas=alpha_grid_cddd,
    cv=5,
    scoring='neg_mean_absolute_error',
).fit(trainX_cddd, trainY_cddd)



In [20]:
# Predict with both fitted CDDD models on the ZINC test and peptide features.
lassotest_cddd, lassopeptide_cddd = (
    Lasso_cddd.predict(testX_cddd),
    Lasso_cddd.predict(pepX_cddd),
)
ridgetest_cddd, ridgepeptide_cddd = (
    Ridge_cddd.predict(testX_cddd),
    Ridge_cddd.predict(pepX_cddd),
)

In [21]:
# Report, for each CDDD model, the mean absolute error and the squared
# Pearson correlation on the ZINC test and peptide sets.
cddd_reports = (
    ("LASSO MAE (J/mol K)", lassotest_cddd, lassopeptide_cddd),
    ("Ridge MAE (J/mol K)", ridgetest_cddd, ridgepeptide_cddd),
)
for cddd_header, cddd_zinc_pred, cddd_pep_pred in cddd_reports:
    print(cddd_header)
    print("ZINC, Peptide")
    print(
        mean_absolute_error(testY_cddd, cddd_zinc_pred),
        mean_absolute_error(pepY_cddd, cddd_pep_pred),
    )

    print("Pearson R^{2}")
    print("ZINC, Peptide")
    # pearsonr returns (r, p-value); only r is used and then squared.
    cddd_zinc_r = pearsonr(testY_cddd, cddd_zinc_pred)[0]
    cddd_pep_r = pearsonr(pepY_cddd, cddd_pep_pred)[0]
    print(np.square(cddd_zinc_r), np.square(cddd_pep_r))

LASSO MAE (J/mol K)
ZINC, Peptide
4.304180572976432 4.8450806233197765
Pearson R^{2}
ZINC, Peptide
0.7883647292664123 0.6829538782457725
Ridge MAE (J/mol K)
ZINC, Peptide
4.263715937568957 4.541650530956554
Pearson R^{2}
ZINC, Peptide
0.7914671135756249 0.6693478185949263
