In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, make_scorer

In [2]:
# Create a random boolean row mask that splits the data into a train set (True) and a test set (False)
def split_mask(dftouse, split_size = 0.7, random_state=None):
    """Return a random boolean row mask selecting the training rows of ``dftouse``.

    Parameters
    ----------
    dftouse : object exposing ``shape`` (e.g. a DataFrame or ndarray)
        Only ``dftouse.shape[0]`` (the number of rows) is used.
    split_size : float or int, optional
        A float is the fraction of rows assigned to the training set
        (rounded down); an int is the absolute number of training rows.
    random_state : int or None, optional
        Seed for the shuffle, making the split reproducible. With ``None``
        (the default) numpy's global random stream is used, so every call
        yields a different split unless the caller seeded numpy globally.

    Returns
    -------
    numpy.ndarray of bool, shape (n_rows,)
        ``True`` marks a training row, ``False`` a test row.

    Raises
    ------
    ValueError
        If the requested training size is negative or exceeds the row count.
    """
    n_rows = dftouse.shape[0]

    # Same convention as train_test_split: floats are fractions, ints are counts.
    if np.asarray(split_size).dtype.kind == 'f':
        n_train = int(np.floor(split_size * n_rows))
    else:
        n_train = int(split_size)
    if not 0 <= n_train <= n_rows:
        raise ValueError(
            "split_size=%r selects %d training rows out of %d"
            % (split_size, n_train, n_rows)
        )

    # A private RandomState keeps a seeded call from disturbing the global stream.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled = rng.permutation(n_rows)

    mask = np.zeros(n_rows, dtype=bool)
    mask[shuffled[:n_train]] = True
    return mask

In [3]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func, verbose):
    """Run a cross-validated grid search for ``clf`` over ``parameters``.

    The search is fitted on ``(X, y)`` using ``n_folds`` folds, ``n_jobs``
    workers and ``score_func`` as the scoring criterion. A one-line summary
    ("BEST", the winning parameters, their score and all grid scores) is
    printed.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` taken from the fitted search.
    """
    search = GridSearchCV(
        clf,
        param_grid=parameters,
        scoring=score_func,
        cv=n_folds,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    search.fit(X, y)
    winning_params = search.best_params_
    # Single formatted string: prints the same text as a space-separated print.
    print("BEST %s %s %s" % (winning_params, search.best_score_, search.grid_scores_))
    return search.best_estimator_, winning_params

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1, verbose=False):
    """Tune ``clf`` by cross-validated grid search and report train/test error.

    Parameters
    ----------
    clf : estimator
        Model handed to ``cv_optimize`` for the grid search.
    parameters : dict
        Parameter grid forwarded to ``cv_optimize``.
    X, y : indexable by a boolean array
        Features and targets; both are split with ``mask``.
    mask : boolean array or None, optional
        ``True`` marks training rows. When ``None`` a fresh random mask is
        generated with ``split_mask(X)``.
    score_func, n_folds, n_jobs, verbose
        Forwarded unchanged to ``cv_optimize``.

    Returns
    -------
    tuple
        ``(clf, Xtrain, ytrain, Xtest, ytest, best_params_)`` where ``clf`` is
        the best estimator returned by ``cv_optimize``.
    """
    # Fix: the original test was inverted. It threw away a caller-supplied
    # mask and crashed on ``~mask`` when no mask was given.
    if mask is None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]

    # cv_optimize returns GridSearchCV's best_estimator_, which the search has
    # already refit on all of Xtrain (refit defaults to True), so a second
    # fit here would only repeat that work.
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs,
                                    n_folds=n_folds, verbose=verbose, score_func=score_func)

    train_mse = mean_squared_error(ytrain, clf.predict(Xtrain))
    test_mse = mean_squared_error(ytest, clf.predict(Xtest))

    # mean_squared_error is an error (lower is better). Report it as MSE/RMSE
    # rather than squaring it and labelling the result "Accuracy".
    print("############# based on standard predict ################")
    print("MSE on training data: %0.4f (RMSE: %0.4f)" % (train_mse, train_mse ** 0.5))
    print("MSE on test data:     %0.4f (RMSE: %0.4f)" % (test_mse, test_mse ** 0.5))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [4]:
%%time
df_train = pd.read_csv("original_data/train.csv")
X = df_train.drop(['smiles'], axis=1)
X.drop(['gap'], axis = 1, inplace=True)
X_small = X[:10000]

Wall time: 1min 11s


In [5]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,1,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,1,0,0,0,1,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
%%time
"""
Import y-values
"""
y = pd.read_csv("original_data/data_gaps.csv", index_col=0)
y_small = y.loc[X_small.index,:]

Wall time: 858 ms


In [7]:
# Sanity check: the feature and target subsets should have the same row count.
print(X_small.shape)
print(y_small.shape)

(10000, 256)
(10000, 1)


In [10]:
# Untuned base estimator; its hyper-parameters are chosen by grid search.
clfForest = RandomForestRegressor()
# Search grid: 4 forest sizes x 4 depth limits = 16 candidates.
# (A single-point grid, n_estimators=[100] / max_depth=[25], was kept around
# as a commented-out quick alternative.)
parameters = dict(
    n_estimators=[50, 100, 200, 400],
    max_depth=[50, 100, 200, None],
)

In [11]:
%%time
clfForest_, _, _, _, _, best_params_ = do_classify(clfForest, parameters, X_small, np.array(y_small.gap), 
                                                   mask=split_mask(X_small), n_folds = 3, n_jobs = 1, 
                                                   verbose=True, score_func='mean_squared_error')

Fitting 3 folds for each of 16 candidates, totalling 48 fits
BEST {'n_estimators': 200, 'max_depth': 100} -0.085361434656 [mean: -0.08562, std: 0.00133, params: {'n_estimators': 50, 'max_depth': 50}, mean: -0.08544, std: 0.00127, params: {'n_estimators': 100, 'max_depth': 50}, mean: -0.08553, std: 0.00147, params: {'n_estimators': 200, 'max_depth': 50}, mean: -0.08558, std: 0.00147, params: {'n_estimators': 400, 'max_depth': 50}, mean: -0.08587, std: 0.00142, params: {'n_estimators': 50, 'max_depth': 100}, mean: -0.08563, std: 0.00128, params: {'n_estimators': 100, 'max_depth': 100}, mean: -0.08536, std: 0.00135, params: {'n_estimators': 200, 'max_depth': 100}, mean: -0.08554, std: 0.00141, params: {'n_estimators': 400, 'max_depth': 100}, mean: -0.08569, std: 0.00153, params: {'n_estimators': 50, 'max_depth': 200}, mean: -0.08550, std: 0.00148, params: {'n_estimators': 100, 'max_depth': 200}, mean: -0.08543, std: 0.00151, params: {'n_estimators': 200, 'max_depth': 200}, mean: -0.08544,

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  3.3min finished


In [12]:
# Wrap each tuned value in a one-element list so the dict can be used again
# as a (single-candidate) parameter grid.
# - ``.items()`` works on both Python 2 and Python 3 (``iteritems`` is 2-only).
# - Values that are already lists are left alone, so re-running this cell no
#   longer nests them as [[value]]. None of the tuned values here are lists.
best_params_ = {
    name: (value if isinstance(value, list) else [value])
    for name, value in best_params_.items()
}

In [13]:
# Delete the 10,000-row subsets now that the grid search has used them.
# NOTE(review): presumably done to release memory before the full-data fit - confirm.
%xdel X_small
%xdel y_small

In [14]:
# Sanity check: the full target and feature tables should have the same row count.
print(y.shape)
print(X.shape)

(1000000, 1)
(1000000, 256)


In [None]:
%%time
clfForest, _, _, _, _, _ = do_classify(clfForest, best_params_, X, np.array(y.gap), mask=split_mask(X),
                                                                   n_folds = 3, n_jobs = 1, score_func='mean_squared_error')

In [None]:
# Delete the full training features and targets; later cells only use the
# fitted model and the test table.
# NOTE(review): presumably done to release memory before loading the test set - confirm.
%xdel X
%xdel y

In [None]:
# Load the test table and drop its 'smiles' column.
# NOTE(review): every remaining column is passed to clfForest.predict - confirm
# they match the columns the model was trained on.
test = (
    pd.read_csv("original_data/test.csv")
    .drop(['smiles'], axis=1)
)

In [None]:
def write_to_file(filename, predictions):
    """Write ``predictions`` to ``filename`` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one line per
    prediction; ids are 1-based positions and values are rendered with
    ``str``. Any existing file at ``filename`` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [None]:
# Predict on the test features and save the result as a submission CSV.
preds = clfForest.predict(test)
write_to_file(filename="submissions/RF_3.csv", predictions=preds)