In [23]:
from src.features.build_features import get_roast_classification_dataset
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from keras.preprocessing.text import Tokenizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
X, y = get_roast_classification_dataset()

In [5]:
# Split into test and train datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [9]:
# create tokenizer
t = Tokenizer(lower=True, split=' ')
# Fit tokenizer only on TRAINING data
t.fit_on_texts(X_train)
# convert x_train and x_test to TF-IDF vectors
X_train = t.texts_to_matrix(X_train, mode='tfidf')
X_test = t.texts_to_matrix(X_test, mode='tfidf')

In [16]:
# Fit model, and make predictions
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=32, n_jobs=-1, oob_score=True, random_state=23)
rnd_clf.fit(X_train, y_train)
rnd_clf.oob_score_

0.5316117542297417

## Random Hyperparameter Search

In [20]:
rnd_clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=21)
param_dist = {"max_depth": [10, 20, 30, 40, 50, 60, 70, 80],
              "n_estimators": [500, 750, 1000, 1250],
              "max_features": [100, 200, 300, 400, 500, 1000, 2000],
              "min_samples_split": [2, 3, 5, 7, 9],
              "bootstrap": [True]}
n_iter_search = 30
random_search = RandomizedSearchCV(rnd_clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, random_state=21,
                                  return_train_score=True)
random_search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1, oob_score=True,
                                                    random_state=21),
                   n_iter=30,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80],
                                        'max_features': [100, 200, 300, 400,
                                                         500, 1000, 2000],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [500, 750, 1000, 1250]},
                   random_state=21, return_train_score=True)

In [24]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.555 (std: 0.004)
Parameters: {'n_estimators': 1000, 'min_samples_split': 5, 'max_features': 1000, 'max_depth': 70, 'bootstrap': True}

Model with rank: 2
Mean validation score: 0.554 (std: 0.011)
Parameters: {'n_estimators': 500, 'min_samples_split': 9, 'max_features': 500, 'max_depth': 40, 'bootstrap': True}

Model with rank: 3
Mean validation score: 0.554 (std: 0.007)
Parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 300, 'max_depth': 50, 'bootstrap': True}



## Train Best Model and Test

In [25]:
# Final Model
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=40, max_features=300, n_jobs=-1, oob_score=True, random_state=22)
rnd_clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = rnd_clf.predict(X_test)
print("Accuracy: " + str(accuracy_score(y_test, y_pred)))

Accuracy: 0.5325022261798753
