In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [20]:
comments = pd.read_csv('./final_dataset_cleaned_fourplus.csv')

### Random Forest

In [21]:
X = comments['body']
y = comments['subreddit']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=True,stratify=y)

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [28]:
pipe_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('model', RandomForestClassifier())
     ])

params = {
    'vect__ngram_range':[(1,3)],
    'vect__min_df':[2,5],
    'vect__stop_words':[None,'english'],
    'model__max_depth':[50,100,150],
    'model__n_estimators':[100,200],
}

gs_rf2 = GridSearchCV(pipe_rf, params, cv=4, verbose=3, n_jobs=-1)

gs_rf2.fit(X_train, y_train)

Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 113.9min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 3)], 'vect__min_df': [2, 5], 'vect__stop_words': [None, 'english'], 'model__max_depth': [50, 100, 150], 'model__n_estimators': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [29]:
gs_rf2.best_estimator_.steps

[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=5,
          ngram_range=(1, 3), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('model',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=150, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False))]

In [30]:
gs_rf2.score(X_train,y_train)

0.9637731366920717

In [31]:
gs_rf2.score(X_test,y_test)

0.6737929805807946

Overall, this model appears to perform slightly less well than the logistic regression, which was able to exceed an accuracy score of 70% at times.  One last thing I wanted to try was granularizing the n_estimators parameter while leaving the other best estimator parameters the same.  This should be a relatively quick GridSearch, and may improve the score.

In [32]:
pipe_rfg = Pipeline([
    ('vect', CountVectorizer()),
    ('model', RandomForestClassifier())
     ])

params = {
    'vect__ngram_range':[(1,3)],
    'vect__min_df':[5],
    'vect__stop_words':[None],
    'model__max_depth':[150],
    'model__n_estimators':[175,200,250],
}

gs_rf3 = GridSearchCV(pipe_rfg, params, cv=4, verbose=3, n_jobs=-1)

gs_rf3.fit(X_train, y_train)

Fitting 4 folds for each of 3 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed: 20.6min remaining: 103.2min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 21.6min remaining: 15.4min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 42.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 42.9min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 3)], 'vect__min_df': [5], 'vect__stop_words': [None], 'model__max_depth': [150], 'model__n_estimators': [175, 200, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [33]:
gs_rf3.score(X_train,y_train)

0.9677521528258933

In [34]:
gs_rf3.score(X_test,y_test)

0.6718332442544094