In [63]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
import functions
from collections import Counter
import numpy as np

# Directory names handed to functions.create_data_matrix for the train / test sets.
TRAIN_DIR, TEST_DIR = "train", "test"

# Starts out as an empty set; no cell visible in this notebook section reads it.
call_set = set()

In [69]:
def predictAccuracy(y1, y2, cutoff = .5):
    """Fraction of labels in ``y1`` that agree with the thresholded scores in ``y2``.

    Every score ``b`` is turned into a boolean with ``b >= cutoff`` and then
    compared to its label ``a`` with ``==`` (so 1/True agrees with a score at
    or above the cutoff, 0/False with a score below it).

    Args:
        y1: sized sequence of labels.
        y2: iterable of scores, paired with ``y1`` positionally via ``zip``.
        cutoff: threshold applied to each score (default 0.5).

    Returns:
        Number of agreeing pairs divided by ``len(y1)``, as a float.  Pairs
        are formed with ``zip``, so surplus entries on either side are not
        compared, but the denominator is always ``len(y1)``.
    """
    hits = sum(label == (score >= cutoff) for label, score in zip(y1, y2))
    total = float(len(y1))
    return hits / total

def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func, verbose):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, verbose=verbose, scoring=score_func)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best, gs.best_params_

def do_classify(clf, parameters, X, y, mask=None, n_folds=5, n_jobs=4, verbose=False, score_func=None):
    if mask is None:
        print "Test"
        mask = split_mask(X)
    X_train, X_cv, y_train, y_cv = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, X_train, y_train, n_jobs=n_jobs, n_folds=n_folds, verbose=verbose, score_func=score_func)
    clf=clf.fit(X_train, y_train)
    train_preds = clf.predict(X_train)
    cv_preds = clf.predict(X_cv)
    training_accuracy = predictAccuracy(y_train, train_preds, .5)
    cv_accuracy = predictAccuracy(y_cv, cv_preds)
    print "Training classes"
    print training_accuracy
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.4f" % training_accuracy, predictAccuracy(y_train, train_preds, .45), predictAccuracy(y_train, train_preds, .55)
    print "Accuracy on crossv data:   %0.4f" % cv_accuracy, predictAccuracy(y_cv, cv_preds, .45), predictAccuracy(y_cv, cv_preds, .55)
    print "########################################################"
    return clf, X_train, y_train, X_cv, y_cv, best_params_

In [6]:
# Load the training set through the project-local `functions` helper and unpack
# the three returned values.
# NOTE(review): presumably X is the feature matrix, y the class labels and ids
# the per-file identifiers -- confirm against functions.create_data_matrix.
X, y, ids = functions.create_data_matrix(direc=TRAIN_DIR, verbose=True)
# The test-set load below is disabled, so X_test / test_ids do not exist unless
# this line is re-enabled; the prediction loop further down needs X_test.
#X_test, y_test, test_ids = functions.create_data_matrix(direc=TEST_DIR, verbose=True)

Number of datafiles loaded: 3087


In [7]:
# Build the train / hold-out selector once; it is passed as mask= to the
# do_classify calls below so they all use the same split.
# NOTE(review): do_classify indexes with X[mask] and X[~mask], so this is
# presumably a boolean numpy array -- confirm in functions.split_mask.
mask = functions.split_mask(X)

In [31]:
# Untuned base estimators: the regressor is used for the per-class
# (one-vs-rest) loop, the classifier for the direct multi-class experiments.
clfForestRegressor = RandomForestRegressor()
clfForest = RandomForestClassifier()

# Grid explored by do_classify / cv_optimize.
# For a quick single-candidate run use n_estimators=[100] and max_depth=[25] instead.
parameters = dict(
    n_estimators=[100, 200, 400, 800],
    max_depth=[100, 200, 400, None],
)

In [26]:
%%time
# Grid-search the forest over `parameters` (3 folds, accuracy scoring, single job).
# Note: this rebinds clfForest to the fitted estimator returned by do_classify,
# so any later cell that passes clfForest is no longer using the fresh
# RandomForestClassifier() created in the setup cell.
clfForest, _, _, _, _, best_params = do_classify(clfForest, parameters, X, y, mask=mask, n_folds = 3, n_jobs = 1, verbose=True, score_func='accuracy')

# Submission step is disabled: it needs X_test / test_ids (test-set load is
# commented out) and a write_to_file helper that is not defined in this notebook section.
#preds = clfForest.predict(X_test)
#write_to_file("submissions/RandForest.csv", test_ids, preds)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
BEST {'n_estimators': 200, 'max_depth': 200} 0.747105141269 [mean: 0.74572, std: 0.00525, params: {'n_estimators': 100, 'max_depth': 100}, mean: 0.74340, std: 0.00724, params: {'n_estimators': 200, 'max_depth': 100}, mean: 0.74479, std: 0.00466, params: {'n_estimators': 400, 'max_depth': 100}, mean: 0.74618, std: 0.00416, params: {'n_estimators': 800, 'max_depth': 100}, mean: 0.74479, std: 0.00466, params: {'n_estimators': 100, 'max_depth': 200}, mean: 0.74711, std: 0.00244, params: {'n_estimators': 200, 'max_depth': 200}, mean: 0.74572, std: 0.00355, params: {'n_estimators': 400, 'max_depth': 200}, mean: 0.74618, std: 0.00353, params: {'n_estimators': 800, 'max_depth': 200}, mean: 0.74618, std: 0.00426, params: {'n_estimators': 100, 'max_depth': 400}, mean: 0.74433, std: 0.00598, params: {'n_estimators': 200, 'max_depth': 400}, mean: 0.74479, std: 0.00240, params: {'n_estimators': 400, 'max_depth': 400}, mean: 0.74525, std: 

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   50.5s finished


In [27]:
%%time
# Same call with an empty grid ({}), so only one candidate is cross-validated:
# the estimator exactly as currently configured in clfForest.
# NOTE(review): if the grid-search cell ran first, clfForest is already the
# tuned model, so this is a re-fit of the tuned settings rather than a default
# forest -- confirm which comparison is intended.
clfForest_, _, _, _, _, best_params = do_classify(clfForest, {}, X, y, mask=mask, n_folds = 3, n_jobs = 1, verbose=True, score_func='accuracy')

# Submission step is disabled (X_test / test_ids are not defined while the
# test-set load stays commented out).
#preds = clfForest.predict(X_test)
#write_to_file("submissions/RandForest.csv", test_ids, preds)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
BEST {} 0.746641963872 [mean: 0.74664, std: 0.00409, params: {}]
Training classes
0.809634089856
############# based on standard predict ################
Accuracy on training data: 0.8096
Accuracy on crossv data:   0.7246
########################################################
Wall time: 2.67 s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s finished


In [28]:
# Compare the per-feature importances of the two fitted forests
# (clfForest from the grid-search cell, clfForest_ from the empty-grid cell).
print clfForest.feature_importances_
print clfForest_.feature_importances_

[ 0.47416268  0.52583732]
[ 0.47970245  0.52029755]


In [44]:
# Label frequencies: shows how many samples each class value in y has.
Counter(y)

Counter({0: 114,
         1: 50,
         2: 37,
         3: 31,
         4: 41,
         5: 39,
         6: 53,
         7: 41,
         8: 1609,
         9: 21,
         10: 542,
         11: 32,
         12: 376,
         13: 59,
         14: 40})

In [65]:
# Binary target: 0 where the label equals 8, 1 for every other label.
# (8 is the most frequent label in the Counter(y) output recorded in this notebook.)
y_morphed = (np.asarray(y) != 8).astype(int)

In [91]:
# One-vs-rest targets: y_lists[k] is a 0/1 array marking the samples whose
# label equals k, for k = 0..14.
y_lists = [
    np.array([int(label == target) for label in y])
    for target in range(15)
]

In [92]:
# Sanity check: print the 0/1 balance of every one-vs-rest target, in class order.
for lst in y_lists:
    print Counter(lst)

Counter({0: 2971, 1: 114})
Counter({0: 3035, 1: 50})
Counter({0: 3048, 1: 37})
Counter({0: 3054, 1: 31})
Counter({0: 3044, 1: 41})
Counter({0: 3046, 1: 39})
Counter({0: 3032, 1: 53})
Counter({0: 3044, 1: 41})
Counter({1: 1609, 0: 1476})
Counter({0: 3064, 1: 21})
Counter({0: 2543, 1: 542})
Counter({0: 3053, 1: 32})
Counter({0: 2709, 1: 376})
Counter({0: 3026, 1: 59})
Counter({0: 3045, 1: 40})


In [None]:
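# TODO: define X_test here before running the cells below -- the test-set load
# (functions.create_data_matrix(direc=TEST_DIR, ...)) is commented out earlier in the notebook.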

In [None]:
%%time
preds = []
for lst in y_lists:
    if 
    clfForest_test, _, _, _, _, best_params = do_classify(clfForestRegressor, {}, X, lst, mask=mask, n_folds = 3, n_jobs = 1, verbose=True)
    preds.append(clfForest_test(X_test))

In [None]:
# Stack the per-class score vectors: one row per class, one column per test sample.
preds_ = np.array(preds)
# For every test sample (column) pick the class whose regressor scored highest.
# Iterating over the rows instead would argmax across samples and yield a
# sample index, not a class index -- hence axis=0.
best_class_idx = np.argmax(preds_, axis=0)
# NOTE(review): map_ is not defined in the visible cells; it is presumably the
# lookup from class index (position in y_lists) to the label to submit -- confirm.
preds__ = np.array([map_[class_idx] for class_idx in best_class_idx])

In [None]:
# We have predictions!!!!