In [51]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
def accuracy(pred, actual):
    """Returns percentage of correctly classified labels"""
    return sum(pred==actual) / len(actual)

In [3]:
# Import data and split out labels
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Split into labels, names and data
y_train = train['class']
names_train = train['name']
X_train = train.drop(['class', 'name', 'sequence'], axis=1)

y_test = test['class']
names_train = test['name']
X_test = test.drop(['class', 'name', 'sequence'], axis=1)

# Further split train into train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.20, random_state=0)

In [50]:
# Optimize a random forest model using grid search
rf = RandomForestClassifier()

param_grid = {
    'n_estimators': [20], 
    'max_depth': [10],
    'max_features': [9],
    'n_jobs': [9]
}

grid_rf = GridSearchCV(rf, param_grid, cv=10, verbose=1)
grid_rf.fit(X_train, y_train)
print("#-------- DONE WITH GRID SEARCH.")
best_model = grid_rf.best_estimator_
best_params = grid_rf.best_params_ 
scores = grid_rf.grid_scores_
print(best_params)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
#-------- DONE WITH GRID SEARCH.
{'max_features': 9, 'n_estimators': 20, 'n_jobs': 9, 'max_depth': 10}


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.9s finished


In [46]:
# Calculate cross validation accuracy
rf = RandomForestClassifier(n_estimators=30, criterion='entropy', max_features=9, max_depth=9,
                            min_samples_leaf = 1, bootstrap=True, oob_score=True, n_jobs=9, random_state=0)
print(np.mean(cross_val_score(rf, X_train, y_train, cv=10)))

0.717900943745


In [47]:
# Fit to full training data and table feature importances
rf = rf.fit(X_train, y_train)
importances = rf.feature_importances_
importance = pd.DataFrame(importances, index=X_train.columns, columns=["importance"])
# importance.sort('importance', ascending=0)

In [49]:
# Print train and test accuracy
print("Training Accuracy = %f" % accuracy(rf.predict(X_train), y_train))
print("Test Accuracy = %f" % accuracy(rf.predict(X_test), y_test))

Training Accuracy = 0.844781
Test Accuracy = 0.723377
