In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn import cross_validation
from sklearn.decomposition import PCA

#from rdkit import Chem

In [2]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [3]:
def RMSE(predictions,labels):
    sum = 0
    for i in range(len(labels)):
        sum += (predictions[i]-labels[i])**2
    return (sum/len(labels))**0.5

In [None]:
def best_LR_predictor(train_data, train_labels, test_data, test_labels, exclist = []):
    min_RMSE = None
    best_predictor = None
    feature_num = train_data.shape[1]
    train_existing_predictors = train_data[:, exclist]
    train_existing_predictors = pd.DataFrame(train_existing_predictors)
    test_existing_predictors= test_data[:, exclist]
    test_existing_predictors = pd.DataFrame(test_existing_predictors)
    for i in range(feature_num) :
        if i not in exclist:
            feature_i = train_data[:,np.newaxis, i]
            feature_i = pd.DataFrame(feature_i)
            predictors = pd.concat((train_existing_predictors, feature_i), axis = 1)
            LR1 = LinearRegression()            
            LR1.fit(predictors, train_labels)
            test_i = test_data[:, np.newaxis, i]
            test_i = pd.DataFrame(test_i)
            test_predictors = pd.concat((test_existing_predictors, test_i), axis = 1)
            test_predict = LR1.predict(test_predictors)
            RMSE_score = RMSE(test_predict, test_labels)
            if ((min_RMSE == None) or (RMSE_score < min_RMSE)):
                min_RMSE = RMSE_score
                best_predictor = i
    return best_predictor, min_RMSE

In [None]:
def best_predictor(train_data, train_labels, test_data, test_labels, model, exclist = []):
    min_RMSE = None
    best_predictor = None
    feature_num = train_data.shape[1]
    train_existing_predictors = train_data[:, exclist]
    train_existing_predictors = pd.DataFrame(train_existing_predictors)
    test_existing_predictors= test_data[:, exclist]
    test_existing_predictors = pd.DataFrame(test_existing_predictors)
    for i in range(feature_num) :
        if i not in exclist:
            feature_i = train_data[:,np.newaxis, i]
            feature_i = pd.DataFrame(feature_i)
            predictors = pd.concat((train_existing_predictors, feature_i), axis = 1)
            Model = model()            
            Model.fit(predictors, train_labels)
            test_i = test_data[:, np.newaxis, i]
            test_i = pd.DataFrame(test_i)
            test_predictors = pd.concat((test_existing_predictors, test_i), axis = 1)
            test_predict = Model.predict(test_predictors)
            RMSE_score = RMSE(test_predict, test_labels)
            if ((min_RMSE == None) or (RMSE_score < min_RMSE)):
                min_RMSE = RMSE_score
                best_predictor = i
    return best_predictor, min_RMSE

In [None]:
def select_predictor(train_data, train_labels, test_data, test_labels, model):
    last_RMSE = None
    mylist = []
    existing_predictors = []
    while(True):
        new_predictor, cur_RMSE = best_predictor(train_data, train_labels, test_data, test_labels, model, mylist)
        if ((last_RMSE == None) or (cur_RMSE<last_RMSE)):
            mylist.append(new_predictor)
            last_RMSE = cur_RMSE
        else:
            break
    return mylist, last_RMSE

In [4]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")

In [6]:
df_train1 = pd.read_csv("newfeature\SimilarityV1.csv")


In [7]:
df_train2 = pd.read_csv("newfeature\Bag1.csv")


In [8]:
df_train3 = pd.read_csv("newfeature\Bag2.csv")


In [9]:
df_train4 = pd.read_csv("newfeature\Bag3.csv")

In [10]:
df_train5 = pd.read_csv("newfeature\Bag4.csv")

In [11]:
df_train6 = pd.read_csv("newfeature\Bag5.csv")

In [12]:
df_test = pd.read_csv("test.csv")

In [13]:
print df_train.head()

                                              smiles  feat_001  feat_002  \
0  c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...         0         0   
1  C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...         1         0   
2  [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...         1         0   
3  [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...         1         0   
4     c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1         0         0   

   feat_003  feat_004  feat_005  feat_006  feat_007  feat_008  feat_009  ...   \
0         0         0         1         0         1         0         0  ...    
1         0         0         1         0         1         0         0  ...    
2         0         0         1         1         1         0         0  ...    
3         0         0         1         1         1         0         0  ...    
4         0         0         1         0         1         0         0  ...    

   feat_248  feat_249  feat_250  feat_251  feat_252  fea

In [14]:
print df_train1.head()

   Unnamed: 0                                             smiles      sim1  \
0           0  c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...  0.485384   
1           1  C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...  0.441337   
2           2  [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...  0.514501   
3           3  [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...  0.480871   
4           4     c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1  0.437585   

       sim2      sim3      sim4      sim5      sim6      sim7      sim8  \
0  0.556582  0.466346  0.536730  0.605935  0.596811  0.494630  0.441875   
1  0.514697  0.437576  0.538368  0.482856  0.526786  0.430164  0.394619   
2  0.574959  0.471549  0.560948  0.621408  0.613008  0.493862  0.425959   
3  0.548297  0.465929  0.573954  0.537569  0.588203  0.480505  0.428044   
4  0.460356  0.432432  0.562887  0.422779  0.448397  0.412411  0.453927   

     ...        sim24     sim25     sim26     sim27     sim28     sim29  \
0    

In [None]:
print df_train2.head()

In [None]:
print df_train3.head()

In [1]:
print df_train4.head()

NameError: name 'df_train' is not defined

In [None]:
print df_train5.head()

In [None]:
print df_train6.head()

In [15]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [14]:
print test_idx
write_to_file("Y_train.csv", Y_train)

1000000


In [16]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all = df_all.reset_index()

In [17]:
df_all = df_all.drop(['smiles'], axis = 1)

In [18]:
#with new features
df_all1 = df_train1.drop(['smiles'], axis=1)

In [19]:
df_all = pd.concat((df_all, df_all1), axis=1)

In [20]:
df_all2  = df_train2.drop(['smiles'], axis=1)

In [21]:
df_all = pd.concat((df_all, df_all2), axis = 1)

In [22]:
df_all3  = df_train3.drop(['smiles'], axis=1)

In [23]:
df_all = pd.concat((df_all, df_all3), axis = 1)

In [24]:
df_all4 = df_train4.drop(['smiles'], axis = 1)

In [25]:
df_all = pd.concat((df_all, df_all4), axis = 1)

In [26]:
df_all5 = df_train5.drop(['smiles'], axis = 1)

In [27]:
df_all = pd.concat((df_all, df_all5), axis = 1)

In [28]:
print df_all.shape
print df_all.head()

(1824230, 330)
   index  feat_001  feat_002  feat_003  feat_004  feat_005  feat_006  \
0      0         0         0         0         0         1         0   
1      1         1         0         0         0         1         0   
2      2         1         0         0         0         1         1   
3      3         1         0         0         0         1         1   
4      4         0         0         0         0         1         0   

   feat_007  feat_008  feat_009   ...    NumAliphaticCarbocycles  \
0         1         0         0   ...                          0   
1         1         0         0   ...                          1   
2         1         0         0   ...                          1   
3         1         0         0   ...                          1   
4         1         0         0   ...                          0   

   NumAliphaticHeterocycles  NumAliphaticRings  NumAmideBonds  \
0                         0                  0              0   
1            

In [31]:
#write to file
#df_all.to_csv("df_all.csv")
#df_all =pd.read_csv(df_all.csv)

In [29]:
X_train = df_all.iloc[:test_idx]
X_test = df_all.iloc[test_idx:]

In [35]:
X_mytrain, X_mytest, Y_mytrain, Y_mytest = cross_validation.train_test_split(X_train, Y_train, test_size=0.1, random_state=1)
#cross_validation. 60% of training data are used as "mytrain", 40% of test data
#are used as "mytest"

In [39]:
LR = LinearRegression()
LR.fit(X_mytrain, Y_mytrain)
LR_pred = LR.predict(X_mytest)
RMSE(LR_pred, Y_mytest)

0.17965293353162534

In [36]:
RF = RandomForestRegressor()
RF.fit(X_mytrain, Y_mytrain)
RF_pred = RF.predict(X_mytest)
RMSE(RF_pred, Y_mytest)

0.12157428367051742

In [37]:
write_to_file("RF_predict.csv", RF_pred)

In [43]:
L2 = Ridge(alpha=2.0)
L2.fit(X_mytrain, Y_mytrain) 
L2_pred = L2.predict(X_mytest)
RMSE(L2_pred, Y_mytest)

0.17963218561737324

In [46]:
print X_mytrain.shape

(1000, 330)


In [47]:
n_components = 10
pca = PCA().fit(X_train)
eigen = pca.components_ #(n_components, n_var)
X_train_pca = pca.transform(X_mytrain)
X_test_pca = pca.transform(X_mytest)
PCA_pred = RF.predict(X_test_pca)
RMSE(PCA_pred, Y_mytest)

MemoryError: 

In [None]:
print type(df_all), type(df_train1)
print df_all.shape, df_train1.shape
print df_all.index.values, df_train1.index.values

df_all1 = pd.concat([df_all, df_train1], axis = 1)

In [None]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print ("Train features:", X_train.shape)
print ("Train gap:", Y_train.shape)
print ("Test features:", X_test.shape)

In [None]:
X_mytrain, X_mytest, Y_mytrain, Y_mytest = cross_validation.train_test_split(X_train, Y_train, test_size=0.4, random_state=0)
#cross_validation. 60% of training data are used as "mytrain", 40% of test data
#are used as "mytest"

In [None]:
LR = LinearRegression()
LR.fit(X_mytrain, Y_mytrain)
LR_pred = LR.predict(X_mytest)

RMSE(LR_pred, Y_mytest)

In [None]:
RF = RandomForestRegressor()
RF.fit(X_mytrain, Y_mytrain)
RF_pred = RF.predict(X_mytest)

In [None]:
RMSE(RF_pred, Y_mytest)

In [None]:
L2 = Ridge(alpha=1.0)
L2.fit(X_train, Y_train) 
L2_pred = L2.predict(X_mytest)

In [None]:
RMSE(L2_pred, Y_mytest)

In [None]:
print select_predictor(X_mytrain, Y_mytrain, X_mytest,Y_mytest, LinearRegression)
print select_predictor(X_mytrain, Y_mytrain, X_mytest,Y_mytest, RandomForestRegressor)
print select_predictor(X_mytrain, Y_mytrain, X_mytest,Y_mytest, Ridge)

In [None]:
write_to_file("sample1.csv", LR_pred)
write_to_file("sample2.csv", RF_pred)

In [None]:
type(df_train)