In [4]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [5]:
def accuracy(pred, actual):
    """Returns percentage of correctly classified labels"""
    return sum(pred==actual) / len(actual)

In [6]:
# Import data and split out labels
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Split into labels, names and data
y_train = train['class']
names_train = train['name']
X_train = train.drop(['class', 'name', 'sequence'], axis=1)

y_test = test['class']
names_train = test['name']
X_test = test.drop(['class', 'name', 'sequence'], axis=1)

# Further split train into train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.20, random_state=0)

In [19]:
# Optimize a random forest model using grid search
rf = RandomForestClassifier()

param_grid = {
    'n_estimators': [90, 100, 150, 200], 
    'max_depth': [22, 25, 30],
    'max_features': [9],
    'n_jobs': [30],
    'min_samples_leaf': [1]
}

grid_rf = GridSearchCV(rf, param_grid, cv=2, verbose=3)
grid_rf.fit(X_train, y_train)
print("#-------- DONE WITH GRID SEARCH.")
best_model = grid_rf.best_estimator_
best_params = grid_rf.best_params_ 
scores = grid_rf.grid_scores_
print(best_params)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] max_features=9, n_estimators=90, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=90, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.743385, total=   0.8s
[CV] max_features=9, n_estimators=90, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=90, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.738235, total=   0.8s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s



[CV] max_features=9, n_estimators=100, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=100, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.739185, total=   0.9s
[CV] max_features=9, n_estimators=100, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=100, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.744958, total=   0.8s
[CV] max_features=9, n_estimators=150, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=150, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.747585, total=   1.2s
[CV] max_features=9, n_estimators=150, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=150, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.739916, total=   1.4s
[CV] max_features=9, n_estimators=200, n_jobs=30, max_depth=22, min_samples_leaf=1 
[CV]  max_features=9, n_estimators=200, n_jobs=30, max_depth=22, min_samples_leaf=1, score=0.749265, total=   1.5s
[CV]

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   32.8s finished



{'max_features': 9, 'n_estimators': 200, 'n_jobs': 30, 'max_depth': 22, 'min_samples_leaf': 1}


In [21]:
# Calculate cross validation accuracy
rf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_features=9, max_depth=22,
                            min_samples_leaf = 1, bootstrap=True, oob_score=True, n_jobs=30, random_state=0)
print(np.mean(cross_val_score(rf, X_train, y_train, cv=5)))

0.757614609441


In [22]:
# Fit to full training data and table feature importances
rf = rf.fit(X_train, y_train)
importances = rf.feature_importances_
importance = pd.DataFrame(importances, index=X_train.columns, columns=["importance"])
# importance.sort('importance', ascending=0)

In [23]:
# Print train and test accuracy
print("Training Accuracy = %f" % accuracy(rf.predict(X_train), y_train))
print("Test Accuracy = %f" % accuracy(rf.predict(X_test), y_test))

Training Accuracy = 1.000000
Test Accuracy = 0.746062
