In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from rdkit import Chem
from rdkit.Chem import AllChem
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

In [3]:
"""
Read in train and test as Pandas DataFrames
"""
# Load the two competition files in a fixed order: training set first, test set second.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))

In [4]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.98


In [5]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Set the regression target aside before it is removed from the training frame.
Y_train = df_train['gap'].values
# Number of training rows, i.e. the row offset at which test examples begin
# when train and test are stacked.
test_idx = len(df_train)
# Strip the columns that are not shared by both frames: 'Id' (test only)
# and 'gap' (train only).
df_test = df_test.drop('Id', axis=1)
df_train = df_train.drop('gap', axis=1)

In [30]:
def _rdkit_counts(smiles_column):
    """Compute RDKit size descriptors for every SMILES string in a column.

    Parameters
    ----------
    smiles_column : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    tuple of three lists
        (atom counts, heavy-atom counts, bond counts), each in input order.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the strings. ``Chem.MolFromSmiles``
        signals a parse failure by returning None, which would otherwise
        surface as an opaque AttributeError on the next line.
    """
    num_atoms, num_heavy, num_bonds = [], [], []
    for smile in smiles_column:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            raise ValueError("RDKit could not parse SMILES: %r" % (smile,))
        # NOTE(review): GetNumAtoms() counts only atoms explicitly present in
        # the molecular graph, so for SMILES without explicit [H] atoms it
        # likely equals GetNumHeavyAtoms() -- confirm that 'NumAtms' and
        # 'HvyAtms' are not duplicate features.
        num_atoms.append(mol.GetNumAtoms())
        num_heavy.append(mol.GetNumHeavyAtoms())
        num_bonds.append(mol.GetNumBonds())
    return num_atoms, num_heavy, num_bonds


# Same descriptor extraction for both frames; one helper instead of two
# copy-pasted loops keeps train and test features computed identically.
NA_train, GNHA_train, NB_train = _rdkit_counts(df_train['smiles'])
NA_test, GNHA_test, NB_test = _rdkit_counts(df_test['smiles'])

In [31]:
# Attach the three RDKit count lists to each frame as new feature columns.
# Train is filled first, then test, with the same column order on both.
for _frame, _counts in ((df_train, (NA_train, GNHA_train, NB_train)),
                        (df_test, (NA_test, GNHA_test, NB_test))):
    for _column, _values in zip(('NumAtms', 'HvyAtms', 'NumBnds'), _counts):
        _frame[_column] = _values

In [33]:
# Keep a handle on the raw SMILES strings before the column is dropped from
# the feature frames. The frames that carry a 'smiles' column in this
# notebook are df_train / df_test; `train_x` / `test_x` are never defined
# here, so the previous lines raised NameError on a fresh kernel.
train_smiles = df_train['smiles']
test_smiles = df_test['smiles']

In [36]:
# Remove the raw SMILES text column from both frames, keeping every other column.
df_train, df_test = (frame.drop('smiles', axis=1) for frame in (df_train, df_test))

In [77]:
# Hold out 33% of the labelled rows for model comparison. The split is seeded
# so the errors reported below are reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    df_train, Y_train, test_size=.33, random_state=0)

In [78]:
# Cross-validated Lasso: fit on the training split, then measure MSE on the
# hold-out split.
Lasso = LassoCV().fit(x_train, y_train)
Lasso_pred = Lasso.predict(x_test)
Lasso_error = mean_squared_error(y_true=y_test, y_pred=Lasso_pred)

In [79]:
# Cross-validated Ridge: fit on the training split, then measure MSE on the
# hold-out split.
Ridge = RidgeCV().fit(x_train, y_train)
Ridge_pred = Ridge.predict(x_test)
Ridge_error = mean_squared_error(y_true=y_test, y_pred=Ridge_pred)

In [80]:
# Extra candidate mixing ratio: Lasso's share of the combined Lasso + Ridge
# hold-out MSE.
l1_rtio = Lasso_error / (Ridge_error + Lasso_error)

# ElasticNetCV chooses among the candidate l1 ratios by internal cross-validation;
# the fitted model is then scored by MSE on the hold-out split.
EN = ElasticNetCV(l1_ratio=[l1_rtio, .1, .9, .95, .99, 1]).fit(x_train, y_train)
EN_pred = EN.predict(x_test)
EN_error = mean_squared_error(y_true=y_test, y_pred=EN_pred)

In [None]:
# AdaBoost over decision trees; the base tree depth (1..5) is tuned with a
# 5-fold grid search on the training split.
Ada = AdaBoostRegressor(DecisionTreeRegressor(), learning_rate=0.05)
params = {'base_estimator__max_depth': list(range(1, 6))}
ada_cv = GridSearchCV(Ada, params, cv=5)
ada_cv.fit(x_train, y_train)

# Score the tuned model on the hold-out split, as the linear-model cells do.
# Without these two lines `Ada_error` is never assigned, and the RMSE summary
# cell that prints it fails with NameError on a fresh kernel.
# GridSearchCV.predict delegates to the best estimator found by the search.
Ada_pred = ada_cv.predict(x_test)
Ada_error = mean_squared_error(y_test, Ada_pred)

In [57]:
# Cross-validated scores (default scorer and fold count of cross_val_score)
# for each linear model on the full labelled set.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(cross_val_score(LassoCV(), df_train, Y_train))
print(cross_val_score(RidgeCV(), df_train, Y_train))
print(cross_val_score(ElasticNetCV(l1_ratio = l1_rtio), df_train, Y_train))

[ 0.5429816   0.54284339  0.54041939]
[ 0.54604865  0.54571122  0.54324111]
[ 0.54253733  0.54239732  0.54003766]


In [59]:
# Baseline: cross-validated score of plain least squares on the full labelled set.
cross_val_score(estimator=LinearRegression(), X=df_train, y=Y_train)

array([ 0.54604906,  0.54571092,  0.54323894])

In [82]:
# Ordinary least squares: fit on the training split, then measure MSE on the
# hold-out split.
LR = LinearRegression().fit(x_train, y_train)
LR_pred = LR.predict(x_test)
LR_error = mean_squared_error(y_true=y_test, y_pred=LR_pred)

In [83]:
# Report the square root of each model's stored error, one value per line.
# Each *_error name is expected to hold a mean squared error assigned by an
# earlier cell. Single-argument print(...) gives identical output on
# Python 2 and Python 3.
print(math.sqrt(LR_error))
print(math.sqrt(Ridge_error))
print(math.sqrt(Lasso_error))
print(math.sqrt(EN_error))
print(math.sqrt(Ada_error))

0.274973640215
0.274973000858
0.275906454477
0.275906454477
0.313597671048


In [7]:
# Stack the training rows on top of the test rows so that feature engineering
# can be applied to every example in a single pass.
df_all = pd.concat([df_train, df_test], axis=0)
df_all.head(n=5)

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
# Disabled example: both lines are left commented out, so no 'smiles_len'
# column is added to df_all. Uncommenting them would compute the character
# length of every SMILES string and store it as a new column.
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [9]:
# Drop the 'smiles' text column if it is still present. When the notebook is
# run top to bottom, an earlier cell has already removed it from df_train and
# df_test, and a strict drop would fail on the missing label.
df_all = df_all.drop(['smiles'], axis=1, errors='ignore')
vals = df_all.values
# Rows before test_idx came from df_train; the remaining rows came from df_test.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# str.format prints the same text on Python 2 and Python 3; a multi-argument
# print(...) call would print a tuple on Python 2.
print("Train features: {}".format(X_train.shape))
print("Train gap: {}".format(Y_train.shape))
print("Test features: {}".format(X_test.shape))

Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [10]:
# Refit ordinary least squares on the full training matrix and predict the
# unlabelled test matrix.
LR = LinearRegression().fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

In [11]:
# Random forest on the full training matrix. The forest is seeded so that the
# predictions written to the submission file are reproducible across runs.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [12]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV submission.

    The file starts with the header line ``Id,Prediction``. Each following
    line holds a 1-based row id and ``str()`` of the matching prediction,
    in iteration order.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    predictions : iterable
        Predicted values, one per output row.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [None]:
# One submission file per model: linear regression first, random forest second.
write_to_file(filename="sample1.csv", predictions=LR_pred)
write_to_file(filename="sample2.csv", predictions=RF_pred)

In [13]:
# Show both prediction vectors for a quick visual comparison.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(LR_pred)
print(RF_pred)

[ 1.61237192  1.65141153  1.53785944 ...,  1.61237192  1.77372119
  1.7494111 ]
[ 1.46531041  1.65460187  1.54674697 ...,  1.46531041  1.50772525
  1.82709096]


In [14]:
from rdkit import Chem