In [1]:
from importCSV import sampleFiles

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, make_scorer

In [3]:
#create train and test set
def split_mask(dftouse, split_size = 0.7):
    itrain, itest = train_test_split(xrange(dftouse.shape[0]), train_size=split_size)
    mask=np.ones(dftouse.shape[0], dtype='int')
    mask[itrain]=1
    mask[itest]=0
    mask = (mask==1)
    return mask

In [4]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func, verbose):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, verbose=verbose, scoring=score_func)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best, gs.best_params_

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1, verbose=False):
    if mask is not None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, verbose=verbose, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    train_preds = clf.predict(Xtrain)
    test_preds = clf.predict(Xtest)
    training_accuracy = mean_squared_error(ytrain, train_preds)
    test_accuracy = mean_squared_error(ytest, test_preds)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.4f" % (training_accuracy ** 2)
    print "Accuracy on test data:     %0.4f" % (test_accuracy ** 2)
    #print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [5]:
%%time
"""
Import small dataset
"""
df_train_small = sampleFiles(2)
X_small = df_train_small.drop(['smiles'], axis=1)

Wall time: 753 ms


In [6]:
%%time
"""
Import y-values
"""
df_train_gaps = pd.read_csv("original_data/data_gaps.csv", index_col=0)
y_small = df_train_gaps.loc[X_small.index,:]

Wall time: 508 ms


In [7]:
print X_small.shape
print y_small.shape

(2000, 1024)
(2000, 1)


In [8]:
clfForest = RandomForestRegressor()
parameters = {
    "n_estimators" : [100],
#    "n_estimators" : [100,200,400,800,1600],
    #"n_estimators" : range(1, 21),
    "max_depth": [25]
#    "max_depth": [25,50,100,200,400]
}

In [10]:
%%time
clfForest_, _, _, _, _, best_params_ = do_classify(clfForest, parameters, X_small, np.array(y_small.gap), 
                                                   mask=split_mask(X_small), n_folds = 3, n_jobs = 1, 
                                                   verbose=True, score_func='mean_squared_error')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
BEST {'n_estimators': 100, 'max_depth': 25} -0.183569773247 [mean: -0.18357, std: 0.01425, params: {'n_estimators': 100, 'max_depth': 25}]
############# based on standard predict ################
Accuracy on training data: 0.0007
Accuracy on test data:     0.0267
########################################################
Wall time: 2min 23s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.1min finished


In [11]:
best_params_ = {k:[v] for k,v in best_params_.iteritems()}

In [12]:
%xdel df_train_small
%xdel X_small
%xdel y_small

In [29]:
%%time
"""
Import full dataset
"""

df_train, df_cv = sampleFiles(10,cv=250)
X = df_train.drop(['smiles'], axis=1)
X_cv = df_cv.drop(['smiles'], axis=1)

y = df_train_gaps.loc[X.index,:]
y_cv = df_train_gaps.loc[X_cv.index,:]

Wall time: 3min 5s


In [30]:
print y.shape
print X.shape
print y_cv.shape
print X_cv.shape

(10000, 1)
(10000, 1024)
(250000, 1)
(250000, 1024)


In [15]:
%%time
clfForest, Xtrain, ytrain, Xtest, ytest, best_params = do_classify(clfForest, best_params_, X, np.array(y.gap), mask=split_mask(X), n_folds = 3, n_jobs = 1, score_func='mean_squared_error')

BEST {'n_estimators': 100, 'max_depth': 25} -0.169316827575 [mean: -0.16932, std: 0.00196, params: {'n_estimators': 100, 'max_depth': 25}]
############# based on standard predict ################
Accuracy on training data: 0.0028
Accuracy on test data:     0.0295
########################################################
Wall time: 14min 5s


In [31]:
%%time
preds = clfForest.predict(X_cv)
print mean_squared_error(preds, np.array(y_cv)) ** 2

0.0292235875358
Wall time: 10.7 s


In [32]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [44]:
def getTest(n = 50):
    for n_ in range(824):
        if not (n_ % n):
            print (n_+1)/float(824)
        data = pd.read_csv('edited_data/1024_features/test_1024_'+str(1000*(n_+1))+'.csv', index_col=0)
        try:
            data_final = pd.concat([data_final, data])
        except UnboundLocalError:
            data_final = data
    return data_final

In [None]:
preds = clfForest.predict(getTest())
write_to_file("RF_2.csv", preds)