In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error

In [2]:
#create train and test set
def split_mask(dftouse, split_size=0.7, random_state=None):
    """Build a boolean row mask that randomly partitions ``dftouse``.

    Parameters
    ----------
    dftouse : object exposing ``.shape``
        Only ``dftouse.shape[0]`` (the number of rows) is read.
    split_size : float or int, optional
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass a fixed value to get the same
        partition on every run.

    Returns
    -------
    numpy.ndarray of bool, length ``dftouse.shape[0]``
        ``True`` for rows selected for training, ``False`` for the rest.
    """
    n_rows = dftouse.shape[0]
    # Only the training indices are needed: every other row is "test".
    itrain, _ = train_test_split(np.arange(n_rows), train_size=split_size,
                                 random_state=random_state)
    mask = np.zeros(n_rows, dtype=bool)
    mask[itrain] = True
    return mask

In [3]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func):
    """Run a cross-validated grid search and return the winning estimator.

    ``parameters`` is used as the search's ``param_grid``, ``n_folds`` as its
    ``cv`` argument and ``score_func`` as its ``scoring`` argument. The best
    parameters, best score and the full grid scores are printed on one line.

    Returns a ``(best_estimator, best_params)`` tuple.
    """
    search = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=n_folds,
        n_jobs=n_jobs,
        scoring=score_func,
    )
    search.fit(X, y)
    winning_params = search.best_params_
    # Single-argument print so the statement reads the same on Python 2 and 3.
    print("BEST %s %s %s" % (winning_params, search.best_score_, search.grid_scores_))
    return search.best_estimator_, winning_params

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1):
    """Tune ``clf`` by grid search on a training partition and report its error.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``cv_optimize``.
    parameters : dict
        Parameter grid handed to ``cv_optimize``.
    X, y : indexable by a boolean array
        Features and targets; both are split with ``mask`` / ``~mask``.
    mask : boolean array or None, optional
        ``True`` marks training rows. When ``None`` a random split is drawn
        with ``split_mask(X)``; a supplied mask is used as given.
    score_func, n_folds, n_jobs
        Forwarded to ``cv_optimize``.

    Returns
    -------
    tuple
        ``(clf, Xtrain, ytrain, Xtest, ytest, best_params_)`` where ``clf`` is
        the best estimator refitted on the training partition.

    Notes
    -----
    The two printed figures are mean squared errors (lower is better).
    """
    # Only draw a fresh random split when the caller did not provide one.
    # (The previous test was inverted: it discarded a supplied mask and
    # failed on ``~mask`` when mask was None.)
    if mask is None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain,
                                    n_jobs=n_jobs, n_folds=n_folds,
                                    score_func=score_func)
    # Kept from the original: explicit fit of the selected estimator on the
    # whole training partition before scoring.
    clf = clf.fit(Xtrain, ytrain)
    train_mse = mean_squared_error(ytrain, clf.predict(Xtrain))
    test_mse = mean_squared_error(ytest, clf.predict(Xtest))
    print("############# based on standard predict ################")
    print("MSE on training data: %0.2f" % (train_mse,))
    print("MSE on test data:     %0.2f" % (test_mse,))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [4]:
%%time
"""
Read in train and test as Pandas DataFrames
"""
# This cell loads only the training table; the test table is read in the
# following cell. index_col=0 uses the first CSV column as the row index.
df_train = pd.read_csv("train_512.csv", index_col=0)

Wall time: 2min 26s


In [5]:
%%time
# Load the test table, using the first CSV column as the row index.
df_test = pd.read_csv("test_512.csv", index_col=0)

Wall time: 2min 16s


In [6]:
# Load the regression targets (a single `gap` column) for the training rows.
df_train_gaps = pd.read_csv("data_gaps_.csv", index_col=0)

In [7]:
# Preview the first rows of the training table (`smiles` plus columns 0-511).
df_train.head()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,1,1,0,0,1,1,1,1,1,...,1,0,1,1,1,1,1,0,1,1
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,0,1,1,1,0,1,0,0,0,...,1,0,1,1,0,1,0,1,1,1
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,1,1,1
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,1,1,1,0,1,1,0,1,...,1,1,1,1,0,1,1,0,1,1
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1,1,1,0,0,1,1,1,1,...,1,0,1,0,0,1,0,0,1,1


In [8]:
# Preview the first rows of the test table (same column layout as training).
df_test.head()

Unnamed: 0,smiles,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,1,1,0,0,1,0,1,1,1,...,1,1,1,1,0,1,0,0,1,1
1,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,1,1,0,0,1,1,1,1,1,...,1,0,1,1,0,1,0,0,1,1
2,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,1,1,1,1,1,1,1,0,...,1,0,1,1,1,1,1,0,1,1
3,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,0,1,1,1,0,1,1,0,0,...,1,0,1,1,0,1,1,0,0,1
4,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,1,0,0,0,1,1,1,1,1,...,1,0,1,1,1,1,1,0,1,1


In [9]:
# Preview the first rows of the target table.
df_train_gaps.head()

Unnamed: 0,gap
0,1.19
1,1.6
2,1.49
3,1.36
4,1.98


In [10]:
# (rows, columns) of each loaded table; the parenthesised form prints the same
# text on Python 2 and Python 3.
print(df_train.shape)
print(df_train_gaps.shape)
print(df_test.shape)

(1000000, 513)
(1000000, 1)
(824230, 513)


In [11]:
# Remove the `smiles` string column so only the columns named 0-511 remain.
# NOTE(review): df_train is rebound to the reduced frame, so this cell is not
# re-runnable; a second run will presumably fail because 'smiles' is already
# gone. The test features get their own name (X_test) and df_test is untouched.
df_train = df_train.drop(['smiles'], axis=1)
X_test = df_test.drop(['smiles'], axis=1)

In [13]:
# Keep a random ~10% of the training rows to make model fitting tractable.
# The mask is built from df_train itself: `X_train` was never defined in this
# notebook, so the previous call only worked with leftover kernel state.
mask_ = split_mask(df_train, split_size=.1)
X = df_train[mask_]
y = df_train_gaps[mask_]

In [18]:
# (rows, columns) of the subsampled targets/features and of the test features;
# the parenthesised form prints the same text on Python 2 and Python 3.
print(y.shape)
print(X.shape)
print(X_test.shape)

(100000, 1)
(100000, 512)
(824230, 512)


In [27]:
# Untuned random-forest regressor plus the hyper-parameter grid to search.
clfForest = RandomForestRegressor()
parameters = dict(
    n_estimators=[400, 800, 1600],
    max_depth=[25, 50, 100],
)
# Random train/test partition of the subsampled rows (default split size).
mask = split_mask(X)

In [28]:
%%time
clfForest, Xtrain, ytrain, Xtest, ytest, best_params = do_classify(clfForest, parameters, X, np.array(y.gap), mask=mask, n_folds = 3, n_jobs = 1, score_func='mean_squared_error')

ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt


Traceback (most recent call last):
  File "c:\Anaconda\lib\site-packages\IPython\core\ultratb.py", line 970, in get_records


In [26]:
# Keep second names for the current model and its best parameters.
# These are plain assignments, so each new name refers to the same object as
# the original (no copy is made).
# NOTE(review): this cell's execution count (26) precedes the cells above it,
# i.e. it was run out of order; verify it still belongs here on a fresh run.
clfForest1 = clfForest
best_params1 = best_params

In [29]:
%%time
# Predict the target for every row of the test features with the tuned forest.
preds = clfForest.predict(X_test)

Exception KeyboardInterrupt in 'zmq.backend.cython.message.Frame.__dealloc__' ignored


KeyboardInterrupt: 

In [None]:
def write_to_file(filename, predictions):
    """Write ``predictions`` to ``filename`` as a two-column CSV.

    The file starts with the header ``Id,Prediction``; each following line is
    a 1-based running id and ``str()`` of the corresponding prediction. Any
    existing file at ``filename`` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # Ids are 1-based, hence start=1.
        out.writelines(
            "{0},{1}\n".format(row_id, str(value))
            for row_id, value in enumerate(predictions, start=1)
        )

In [None]:
# Save the test-set predictions in Id,Prediction CSV form.
write_to_file("RF_2.csv", preds)