In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [2]:
def accuracy(pred, actual):
    """Return the fraction of labels in `pred` that match `actual`.

    Labels are compared position by position after converting both inputs
    to NumPy arrays, so plain lists work and pandas index labels are
    ignored (two Series with different indices are still compared
    element-for-element in order).

    Parameters
    ----------
    pred : array-like
        Predicted labels (1-D).
    actual : array-like
        True labels, same length as `pred`.

    Returns
    -------
    float
        Fraction in [0, 1] of positions where the labels agree. This is a
        fraction, not a 0-100 percentage.

    Raises
    ------
    ValueError
        If `pred` and `actual` do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    pred_arr = np.asarray(pred)
    actual_arr = np.asarray(actual)
    if pred_arr.shape != actual_arr.shape:
        raise ValueError(
            "pred and actual must have the same shape, got %s and %s"
            % (pred_arr.shape, actual_arr.shape)
        )
    matches = pred_arr == actual_arr
    # count_nonzero is vectorized; the builtin sum() iterates element by
    # element in Python and fails outright on a scalar comparison result.
    return np.count_nonzero(matches) / float(matches.size)

In [4]:
# Load the pre-split train and test tables
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Columns that are not model inputs: the target ('class') plus the
# 'name' and 'sequence' columns, which are dropped from the feature matrix.
non_feature_cols = ['class', 'name', 'sequence']

# Split each table into labels, names and feature data
y_train = train['class']
names_train = train['name']
X_train = train.drop(non_feature_cols, axis=1)

y_test = test['class']
# Keep the test names in their own variable; previously they were assigned
# to `names_train`, which silently discarded the training names.
names_test = test['name']
X_test = test.drop(non_feature_cols, axis=1)

In [10]:
# Optimize a random forest model using grid search.
# A fixed random_state makes the forests (and therefore the CV scores and
# the selected parameters) repeatable from run to run.
rf = RandomForestClassifier(random_state=0)

# Every combination of these values is evaluated (1 * 3 * 3 * 1 * 3 = 27
# candidates). 'n_jobs' has a single value, so it does not widen the search;
# it is only passed through to each forest as its parallelism setting.
param_grid = {
    'n_estimators': [500], 
    'max_depth': [28, 30, 32],
    'max_features': [5, 7, 9],
    'n_jobs': [30],
    'min_samples_leaf': [1, 2, 3]
}

# cv=2 -> two folds per candidate; verbose=3 logs every individual fit.
grid_rf = GridSearchCV(rf, param_grid, cv=2, verbose=3)
grid_rf.fit(X_train, y_train)
print("#-------- DONE WITH GRID SEARCH.")
best_model = grid_rf.best_estimator_
best_params = grid_rf.best_params_ 
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` is its replacement (a dict of per-candidate arrays, e.g.
# scores['mean_test_score'] and scores['params']).
scores = grid_rf.cv_results_
print(best_params)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1, score=0.750525, total=   3.4s
[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1, score=0.739496, total=   3.0s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.6s remaining:    0.0s



[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=2 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=2, score=0.746745, total=   3.2s
[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=2 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=2, score=0.742857, total=   3.6s
[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=3 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=3, score=0.742965, total=   3.1s
[CV] max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=3 
[CV]  max_features=5, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=3, score=0.739076, total=   3.0s
[CV] max_features=7, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1 
[CV]  max_features=7, n_estimators=500, n_jobs=30, max_depth=28, min_samples_leaf=1, score=0.746325, total=   3.4s
[CV]

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  3.7min finished



{'max_features': 5, 'n_estimators': 500, 'n_jobs': 30, 'max_depth': 32, 'min_samples_leaf': 2}


In [11]:
# Estimate accuracy of the chosen forest configuration with 5-fold
# cross validation on the training data and report the mean fold score.
rf_settings = dict(
    n_estimators=500,
    criterion='entropy',
    max_features=5,
    max_depth=32,
    min_samples_leaf=2,
    bootstrap=True,
    oob_score=True,
    n_jobs=30,
    random_state=0,
)
rf = RandomForestClassifier(**rf_settings)
fold_scores = cross_val_score(rf, X_train, y_train, cv=5)
print(np.mean(fold_scores))

0.756353227163


In [7]:
# Fit to full training data and table feature importances
rf = rf.fit(X_train, y_train)
importances = rf.feature_importances_
# One row per feature column of X_train, holding that feature's importance.
importance = pd.DataFrame(importances, index=X_train.columns, columns=["importance"])
# Show the table, most important feature first. `DataFrame.sort` no longer
# exists in pandas; `sort_values` is the replacement. It returns a sorted
# copy, so `importance` itself keeps the original column order.
importance.sort_values('importance', ascending=False)

In [8]:
# Print train and test accuracy
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)
print("Training Accuracy = %f" % accuracy(y_train_pred, y_train))
# Reuse the stored test predictions instead of running the forest over
# X_test a second time; `y_test_pred` is also what the confusion matrix uses.
print("Test Accuracy = %f" % accuracy(y_test_pred, y_test))

Training Accuracy = 1.000000
Test Accuracy = 0.745432


In [9]:
# Confusion matrix on the held-out test set: rows are the true classes,
# columns are the classes predicted by the fitted forest.
confusion_matrix(
    y_true=np.array(y_test),
    y_pred=np.array(y_test_pred),
)

array([[687,  36,   0,  32],
       [ 99, 208,   0,  22],
       [106,   5,   0,   3],
       [ 84,  17,   0, 288]])