In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, make_scorer

In [2]:
#create train and test set
def split_mask(dftouse, split_size = 0.7):
    itrain, itest = train_test_split(xrange(dftouse.shape[0]), train_size=split_size)
    mask=np.ones(dftouse.shape[0], dtype='int')
    mask[itrain]=1
    mask[itest]=0
    mask = (mask==1)
    return mask

In [3]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best, gs.best_params_

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1):
    if mask is not None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    train_preds = clf.predict(Xtrain)
    test_preds = clf.predict(Xtest)
    training_accuracy = mean_squared_error(ytrain, train_preds)
    test_accuracy = mean_squared_error(ytest, test_preds)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.4f" % (training_accuracy ** 2)
    print "Accuracy on test data:     %0.4f" % (test_accuracy ** 2)
    #print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [4]:
%%time
"""
Import small dataset
"""
df_train_small = pd.read_csv("edited_data/512_features/traindata_small.csv", index_col=0)
X_small = df_train_small.drop(['smiles'], axis=1)

Wall time: 124 ms


In [5]:
%%time
"""
Import y-values
"""
df_train_gaps = pd.read_csv("original_data/data_gaps.csv", index_col=0)
y_small = df_train_gaps.loc[X_small.index,:]

Wall time: 523 ms


In [6]:
print X_small.shape
print y_small.shape

(1000, 512)
(1000, 1)


In [7]:
clfForest = RandomForestRegressor()
parameters = {
    "n_estimators" : [100],
#    "n_estimators" : [100,200,400,800,1600],
    #"n_estimators" : range(1, 21),
    "max_depth": [25]
#    "max_depth": [25,50,100,200,400]
}

In [8]:
%%time
clfForest_, _, _, _, _, best_params_ = do_classify(clfForest, parameters, X_small, np.array(y_small.gap), mask=split_mask(X_small), n_folds = 3, n_jobs = 1, score_func='mean_squared_error')

BEST {'n_estimators': 100, 'max_depth': 25} -0.0496315232429 [mean: -0.04963, std: 0.00081, params: {'n_estimators': 100, 'max_depth': 25}]
############# based on standard predict ################
Accuracy on training data: 0.0065
Accuracy on test data:     0.0420
########################################################
Wall time: 15 s


In [9]:
best_params_ = {k:[v] for k,v in best_params_.iteritems()}

In [10]:
%xdel df_train_small
%xdel X_small
%xdel y_small

In [12]:
%%time
"""
Import full dataset
"""

df_train = pd.read_csv("edited_data/512_features/traindata.csv", index_col=0)
X_ = df_train.drop(['smiles'], axis=1)

Wall time: 2min 13s


In [14]:
mask_ = split_mask(df_train, split_size=.0001)

X = [mask_]
y = df_train_gaps[mask_]
X_cv = [~mask_]
y_cv = df_train_gaps[~mask_]

In [15]:
print y.shape
print X.shape
print y_cv.shape
print X_cv.shape

(100, 1)
(100, 513)
(999900, 1)
(999900, 513)


In [16]:
%%time
clfForest, Xtrain, ytrain, Xtest, ytest, best_params = do_classify(clfForest, best_params_, X, np.array(y.gap), mask=split_mask(X), n_folds = 3, n_jobs = 1, score_func='mean_squared_error')

ValueError: could not convert string to float: c1ccc(o1)C1=Cc2cnc3c([se]c4ccc5cocc5c34)c2C1

In [None]:
%%time
preds = clfForest.predict(X_test)

In [None]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [None]:
write_to_file("RF_2.csv", preds)