In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
# Load the raw train and test tables from the working directory (train first, then test).
df_train_full, df_test_full = pd.read_csv("./train.csv"), pd.read_csv("./test.csv")

In [3]:
# Use the complete data sets; both names are aliases of the loaded frames, not copies.
df_train, df_test = df_train_full, df_test_full

In [None]:
# Disabled fast-iteration variant: work on 100,000-row random samples instead of the
# full frames. Note that both lines draw their sample from df_train_full.
# df_test = df_train_full.sample(n=100000)
# df_train = df_train_full.sample(n=100000)

In [None]:
# Disabled alternative: carve a validation set out of the training data
# (80% kept for training) and keep the real test file as df_test.
# del df_test
# df_train, df_validation = train_test_split(df_train_full, train_size=.8)
# df_test = df_test_full

In [4]:
# Preview the first five training rows.
df_train.head(5)

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.98


In [77]:
# Preview the first five test rows.
df_test.head(5)

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Remember the row index of the test frame.
test_ids = df_test.index

# The test frame shown above has no 'gap' column (it is the quantity being
# predicted), so there is no test target to store here.
#test_gap = df_test.gap

In [6]:
# Pull the target values out of the training frame before the column is removed below.
Y_train = df_train['gap'].values
# Number of training rows; once train and test are stacked this is the offset
# of the first test row.
test_idx = len(df_train)
# (disabled) drop the unnamed index column from the test frame
# df_test = df_test.drop(['Unnamed: 0'], axis=1)
# Remove the target column so df_train no longer contains it.
df_train = df_train.drop('gap', axis=1)

# (disabled) the test frame has no 'gap' column to drop, see the previous cell
# df_test = df_test.drop(['gap'], axis=1)

In [7]:
# Remove the 'Id' column from the test frame so it is not treated as a feature.
df_test = df_test.drop('Id', axis=1)

In [8]:
# Stack the training rows on top of the test rows so that feature engineering
# can be applied to both in a single pass; row labels are kept as they are.
df_all = pd.concat([df_train, df_test], axis=0)
df_all.head(5)

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
# (disabled) SMILES-length example feature
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)
# Discard the raw SMILES strings so only the remaining columns are used as features.
df_all = df_all.drop('smiles', axis=1)

In [12]:
# Make sure the 'smiles' column is gone. The feature-engineering cell above already
# drops it, so a second unconditional drop would raise on a clean top-to-bottom run;
# errors='ignore' keeps this cell safe to run (and re-run) either way.
df_all = df_all.drop(['smiles'], axis=1, errors='ignore')
vals = df_all.values
# Rows before test_idx came from the training frame, the rest from the test frame.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# Single-argument print calls produce the same text on Python 2 and Python 3.
print("Train features: %s" % (X_train.shape,))
print("Train gap: %s" % (Y_train.shape,))
print("Test features: %s" % (X_test.shape,))

Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [60]:
# Baseline: ordinary linear regression on the raw feature matrix.
LR = LinearRegression().fit(X_train, Y_train)
# In-sample predictions, later appended to the feature matrix as an extra column.
LR_pred_train = LR.predict(X_train)
# Predictions for the test rows.
LR_pred = LR.predict(X_test)

In [61]:
# Baseline: random forest with the library's default settings (no fixed random seed).
RF = RandomForestRegressor().fit(X_train, Y_train)
# In-sample predictions, later appended to the feature matrix as an extra column.
RF_pred_train = RF.predict(X_train)
# Predictions for the test rows.
RF_pred = RF.predict(X_test)

In [80]:
# Larger random forest: 100 trees (no fixed random seed).
RF_100 = RandomForestRegressor(n_estimators=100).fit(X_train, Y_train)
# In-sample predictions, later appended to the feature matrix as an extra column.
RF_pred_100_train = RF_100.predict(X_train)
# Predictions for the test rows.
RF_pred_100 = RF_100.predict(X_test)

In [63]:
# Bayesian ridge regression (compute_score=True is passed through to the estimator).
BRR = BayesianRidge(compute_score=True).fit(X_train, Y_train)
# In-sample predictions, later appended to the feature matrix as an extra column.
BRR_pred_train = BRR.predict(X_train)
# Predictions for the test rows.
BRR_pred = BRR.predict(X_test)

In [66]:
# Lasso regression with regularisation strength alpha=0.1.
L = Lasso(alpha=0.1).fit(X_train, Y_train)
# In-sample predictions, later appended to the feature matrix as an extra column.
L_pred_train = L.predict(X_train)
# Predictions for the test rows.
L_pred = L.predict(X_test)

In [44]:
# Work on an independent copy: a plain assignment only creates a second name for
# df_all, so columns added to df_all_enh would silently appear in df_all as well.
df_all_enh = df_all.copy()

In [67]:
# Append each base model's predictions to the feature matrix as an extra column.
#
# df_all_enh holds the training rows followed by the test rows, so every new column
# must be the train predictions followed by the test predictions. Assigning a
# DataFrame that contains only the train predictions is aligned on index labels, and
# because the test rows reuse the labels 0..N they would receive the predictions of
# unrelated training rows instead of their own. A plain array is assigned by position.
#
# NOTE(review): the train-side values are in-sample predictions of models fitted on
# these very rows, so scores computed on them downstream are optimistic.
df_all_enh['LR_pred'] = np.concatenate([LR_pred_train, LR_pred])
df_all_enh['RF_pred'] = np.concatenate([RF_pred_train, RF_pred])
df_all_enh['RF_pred_100'] = np.concatenate([RF_pred_100_train, RF_pred_100])
df_all_enh['BRR_pred'] = np.concatenate([BRR_pred_train, BRR_pred])
df_all_enh['L_pred'] = np.concatenate([L_pred_train, L_pred])

In [68]:
# using meta-belinda method: split the enhanced matrix (original features plus the
# base-model prediction columns) back into train and test rows at the same offset.
vals_enh = df_all_enh.values
X_train_enh = vals_enh[:test_idx]
X_test_enh = vals_enh[test_idx:]
# Single-argument print calls produce the same text on Python 2 and Python 3.
print("Train features: %s" % (X_train_enh.shape,))
print("Train gap: %s" % (Y_train.shape,))
print("Test features: %s" % (X_test_enh.shape,))

Train features: (1000000, 261)
Train gap: (1000000,)
Test features: (824230, 261)


In [79]:
# Linear regression on the enhanced feature matrix.
LR_enh = LinearRegression()
LR_enh.fit(X_train_enh, Y_train)
LR_pred_enh = LR_enh.predict(X_test_enh)
# 5-fold cross-validation. The scorer returns negated MSE (closer to 0 is better);
# "neg_mean_squared_error" is the current name of the deprecated "mean_squared_error".
print(cross_val_score(LR_enh, X_train_enh, Y_train, cv=5, scoring="neg_mean_squared_error"))

[-0.07393139 -0.07361426 -0.07382306 -0.0738129  -0.07436067]


In [75]:
# Random forest on the enhanced feature matrix.
RF_enh = RandomForestRegressor()
RF_enh.fit(X_train_enh, Y_train)
# Predict with the forest fitted above (previously the linear model LR_enh was
# used here, so RF_pred_enh did not contain random-forest predictions).
RF_pred_enh = RF_enh.predict(X_test_enh)
# 5-fold cross-validation. The scorer returns negated MSE (closer to 0 is better);
# "neg_mean_squared_error" is the current name of the deprecated "mean_squared_error".
print(cross_val_score(RF_enh, X_train_enh, Y_train, cv=5, scoring="neg_mean_squared_error"))

[-0.07427352 -0.07394975 -0.07415885 -0.07413632 -0.07470098]


In [76]:
# Bayesian ridge regression on the enhanced feature matrix.
BRR_enh = BayesianRidge(compute_score=True)
BRR_enh.fit(X_train_enh, Y_train)
# Predict with the model fitted above (previously the linear model LR_enh was
# used here, so BRR_pred_enh did not contain Bayesian-ridge predictions).
BRR_pred_enh = BRR_enh.predict(X_test_enh)
# 5-fold cross-validation. The scorer returns negated MSE (closer to 0 is better);
# "neg_mean_squared_error" is the current name of the deprecated "mean_squared_error".
print(cross_val_score(BRR_enh, X_train_enh, Y_train, cv=5, scoring="neg_mean_squared_error"))

[-0.0739295  -0.07361448 -0.07382414 -0.07381244 -0.07435854]


In [78]:
# Lasso regression (alpha=0.1) on the enhanced feature matrix.
L_enh = Lasso(alpha=0.1)
L_enh.fit(X_train_enh, Y_train)
# Predict with the model fitted above (previously the linear model LR_enh was
# used here, so L_pred_enh did not contain lasso predictions).
L_pred_enh = L_enh.predict(X_test_enh)
# 5-fold cross-validation. The scorer returns negated MSE (closer to 0 is better);
# "neg_mean_squared_error" is the current name of the deprecated "mean_squared_error".
print(cross_val_score(L_enh, X_train_enh, Y_train, cv=5, scoring="neg_mean_squared_error"))

[-0.16609225 -0.16525069 -0.1662291  -0.16533672 -0.16605506]


### Comparing apples to apples for ensemble

In [None]:
# Training frame without the SMILES strings; its values are the inputs the
# already-fitted base models predict on.
ensemble_train = df_train.drop('smiles', axis=1)
X_train_ensemble = ensemble_train.values
# Add one prediction column per base model, in this fixed order.
for col_name, base_model in [('LR_pred', LR), ('RF_pred_100', RF_100),
                             ('BRR_pred', BRR), ('L_pred', L)]:
    ensemble_train[col_name] = base_model.predict(X_train_ensemble)

In [None]:
# Re-extract the training matrix now that ensemble_train also carries the
# base-model prediction columns added in the previous cell.
X_train_ensemble = ensemble_train.values

In [None]:
# Test frame without the SMILES strings; its values are the inputs the
# already-fitted base models predict on.
ensemble_test = df_test.drop('smiles', axis=1)
X_test_ensemble = ensemble_test.values
# Add one prediction column per base model, in the same order as for the train frame.
for col_name, base_model in [('LR_pred', LR), ('RF_pred_100', RF_100),
                             ('BRR_pred', BRR), ('L_pred', L)]:
    ensemble_test[col_name] = base_model.predict(X_test_ensemble)
# Re-extract the matrix so it includes the prediction columns.
X_test_ensemble = ensemble_test.values

In [None]:
# Second-stage linear regression on features plus base-model predictions.
LR_ensemble = LinearRegression().fit(X_train_ensemble, Y_train)
LR_ensemble_pred = LR_ensemble.predict(X_test_ensemble)

# Second-stage random forest (100 trees) on the same matrices.
RF_ensemble = RandomForestRegressor(n_estimators=100).fit(X_train_ensemble, Y_train)
RF_ensemble_pred = RF_ensemble.predict(X_test_ensemble)

In [None]:
# Second-stage Bayesian ridge regression on features plus base-model predictions.
BRR_ensemble = BayesianRidge(compute_score=True).fit(X_train_ensemble, Y_train)
BRR_ensemble_pred = BRR_ensemble.predict(X_test_ensemble)

In [None]:
# Second-stage lasso regression (alpha=0.1) on features plus base-model predictions.
L_ensemble = Lasso(alpha=0.1).fit(X_train_ensemble, Y_train)
L_ensemble_pred = L_ensemble.predict(X_test_ensemble)

### Evaluation of these Algorithms

In [None]:
# Root-mean-squared error of the two baseline models.
# np.sqrt is used because the math module is not imported by the import cell.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook
# (its only assignment is commented out); held-out targets must be supplied first.

# baseline: linear regression
LR_rms = np.sqrt(mean_squared_error(test_gap, LR_pred))
print("LR_rms")
print(LR_rms)

# baseline: random forest
RF_rms = np.sqrt(mean_squared_error(test_gap, RF_pred))
print("RF_rms")
print(RF_rms)

In [None]:
# Root-mean-squared error of the Bayesian ridge baseline.
# np.sqrt is used because the math module is not imported by the import cell.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook.
BRR_rms = np.sqrt(mean_squared_error(test_gap, BRR_pred))
print("BRR_rms")
print(BRR_rms)

In [None]:
# Root-mean-squared error of the lasso baseline.
# np.sqrt is used because the math module is not imported by the import cell.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook.
L_rms = np.sqrt(mean_squared_error(test_gap, L_pred))
print("L_rms")
print(L_rms)

In [None]:
# Root-mean-squared error of the 100-tree random forest baseline.
# np.sqrt is used because the math module is not imported by the import cell.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook.
RF_100_rms = np.sqrt(mean_squared_error(test_gap, RF_pred_100))
print("RF_100_rms")
print(RF_100_rms)

In [None]:
# Root-mean-squared error of each second-stage (ensemble) model.
# np.sqrt is used because the math module is not imported by the import cell.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook.

# random forest ensemble (100 trees)
RF_ensemble_rms = np.sqrt(mean_squared_error(test_gap, RF_ensemble_pred))
print("RF_ensemble_rms")
print(RF_ensemble_rms)
# linear ensemble
LR_ensemble_rms = np.sqrt(mean_squared_error(test_gap, LR_ensemble_pred))
print("LR_ensemble_rms")
print(LR_ensemble_rms)
# bayesian ridge ensemble
BRR_ensemble_rms = np.sqrt(mean_squared_error(test_gap, BRR_ensemble_pred))
print("BRR_ensemble_rms")
print(BRR_ensemble_rms)
# lasso ensemble
L_ensemble_rms = np.sqrt(mean_squared_error(test_gap, L_ensemble_pred))
print("L_ensemble_rms")
print(L_ensemble_rms)

# Why do linear and BRR outperform RF in this case? Maybe RF had too high variance for this?

In [None]:
# Fitted coefficients of the second-stage linear model, one per input column.
params = LR_ensemble.coef_

In [None]:
# Display the coefficient array.
params

## Ensemble methods 
Now create a new training set with the following columns: id, linear regression, random forest with 100 trees, Bayesian ridge regression, lasso regression, and all the other features. We also have to create a matching test set.

### Can try adding all the other features back in - THIS IMPLEMENTATION DOESN'T WORK

#### Creating the training set

In [None]:
# Display the stored test-frame index.
test_ids

In [None]:
# One column per base model, each holding that model's predictions for the test rows.
ensemble_training = pd.DataFrame(dict(
    linear=LR_pred,
    random_forest=RF_pred_100,
    bayesian_ridge=BRR_pred,
    lasso=L_pred,
))

In [None]:
# Preview the first five rows of the prediction-only frame.
ensemble_training.head()

In [None]:
# Rebuild the frame from the test features (SMILES strings removed) and append one
# prediction column per base model, in this fixed order.
ensemble_training = df_test.drop('smiles', axis=1)
for col_name, base_preds in [('linear', LR_pred), ('random_forest', RF_pred_100),
                             ('bayesian_ridge', BRR_pred), ('lasso', L_pred)]:
    ensemble_training[col_name] = base_preds

In [None]:
# Overwrites Y_train with the targets of the rows used to build ensemble_training.
# NOTE(review): test_gap is not assigned in any active cell shown in this notebook
# (its only assignment is commented out), so this line fails unless it is defined first.
Y_train = test_gap # actual results for this training set

### 260 features

In [None]:
# Second-stage training matrix: the first test_idx rows of ensemble_training
# (all rows if the frame is shorter than that).
vals = ensemble_training.values
X_train_ensemble = vals[:test_idx]
# Single-argument print calls produce the same text on Python 2 and Python 3.
print("Train features: %s" % (X_train_ensemble.shape,))
print("Train gap: %s" % (Y_train.shape,))

#### Creating the test set
We need to take a new random sample from our original training set and compute the base models' predictions for it.

In [None]:
# Draw 100,000 random rows from the full training data as a labelled evaluation set.
# NOTE(review): no random_state is given, so every run draws a different sample, and
# the rows come from the same frame the base models were fitted on.
df_test_ensemble = df_train_full.sample(n=100000)
# Keep the row labels and the true targets of the sample ...
ensemble_test_ids = df_test_ensemble.index
ensemble_test_gap = df_test_ensemble['gap']
# ... then remove the target so it is not used as a feature.
df_test_ensemble = df_test_ensemble.drop('gap', axis=1)

In [None]:
# Remove the SMILES strings; the remaining values are what the base models predict on.
ensemble_testing = df_test_ensemble.drop('smiles', axis=1)
X_test_ensemble = ensemble_testing.values

In [None]:
# Predictions of the already-fitted base models for the sampled rows.
LR_pred_test, BRR_pred_test, RF_pred_100_test, L_pred_test = [
    base_model.predict(X_test_ensemble) for base_model in (LR, BRR, RF_100, L)
]

In [None]:
# Append one prediction column per base model, in this fixed order.
for col_name, base_preds in [('linear', LR_pred_test), ('random_forest', RF_pred_100_test),
                             ('bayesian_ridge', BRR_pred_test), ('lasso', L_pred_test)]:
    ensemble_testing[col_name] = base_preds

In [None]:
# Re-extract the evaluation matrix now that ensemble_testing also carries the
# base-model prediction columns.
X_test_ensemble = ensemble_testing.values

In [None]:
# Row counts of the second-stage test and train matrices, shown as a pair.
len(X_test_ensemble), len(X_train_ensemble)

#### Do random forest again

In [None]:
# Refit the second-stage linear regression on the rebuilt training matrix.
LR_ensemble = LinearRegression().fit(X_train_ensemble, Y_train)
LR_ensemble_pred = LR_ensemble.predict(X_test_ensemble)

# Refit the second-stage random forest (100 trees) on the same matrices.
RF_ensemble = RandomForestRegressor(n_estimators=100).fit(X_train_ensemble, Y_train)
RF_ensemble_pred = RF_ensemble.predict(X_test_ensemble)

#### Evaluate

In [None]:
# Root-mean-squared error of the refitted second-stage models on the sampled rows.
# The predictions scored here are the ones made for X_test_ensemble, which line up
# row-for-row with ensemble_test_gap; LR_pred / RF_pred were computed for a different
# set of rows and cannot be compared with these targets.
# np.sqrt is used because the math module is not imported by the import cell.

# second-stage linear regression
LR_rms = np.sqrt(mean_squared_error(ensemble_test_gap, LR_ensemble_pred))
print("LR_rms")
print(LR_rms)

# second-stage random forest
RF_rms = np.sqrt(mean_squared_error(ensemble_test_gap, RF_ensemble_pred))
print("RF_rms")
print(RF_rms)

So for some reason this is even worse :( Maybe we should keep all the old features as well...

### Save results to CSV

In [None]:
def write_to_file(filename, predictions):
    """Write predictions to ``filename`` as a two-column CSV.

    The file starts with the header line ``Id,Prediction``. Every following line
    holds a 1-based row number and ``str()`` of the corresponding prediction,
    in iteration order. An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        for row_id, value in enumerate(predictions, 1):
            out.write(",".join((str(row_id), str(value))) + "\n")

In [None]:
# Export each base model's test-set predictions to its own CSV file.
for out_name, model_preds in [("sample1.csv", LR_pred),
                              ("sample2.csv", RF_pred),
                              ("bayesianridge.csv", BRR_pred),
                              ("randomforest100trees.csv", RF_pred_100)]:
    write_to_file(out_name, model_preds)