In [1]:
import pandas as pd
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [2]:
#Ignoring warnings about memory from running gridsearch with n_jobs=-1
warnings.filterwarnings('ignore')

In [3]:
fitness_df = pd.read_csv('../Data/fitness_clean.csv', index_col=0)
bodyweight_df = pd.read_csv('../Data/bodyweight_clean.csv', index_col=0)

all_posts = pd.concat([fitness_df, bodyweight_df], ignore_index=True)

In [4]:
# null model accuracy
all_posts['subreddit'].value_counts(normalize=True)

bodyweightfitness    0.512388
Fitness              0.487612
Name: subreddit, dtype: float64

In [5]:
X = all_posts['selftext']
y = all_posts['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Gridsearch Hyperparameters

In [6]:
pipe = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_jobs=-1))
params = {
    'tfidfvectorizer__max_features': [400, 800, 1000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2)],
    'tfidfvectorizer__stop_words': [None, 'english'],
    'randomforestclassifier__n_estimators': [50, 100, 150]
}
grid = GridSearchCV(pipe, params)

In [7]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=-1))]),
             param_grid={'randomforestclassifier__n_estimators': [50, 100, 150],
                         'tfidfvectorizer__max_features': [400, 800, 1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                         'tfidfvectorizer__stop_words': [None, 'english']})

In [8]:
grid.score(X_train, y_train)

0.9971904669637667

In [9]:
grid.score(X_test, y_test)

0.7898866608544028

In [10]:
grid.best_params_

{'randomforestclassifier__n_estimators': 150,
 'tfidfvectorizer__max_features': 1000,
 'tfidfvectorizer__ngram_range': (1, 2),
 'tfidfvectorizer__stop_words': 'english'}

# Detailed Random Forest Search
Use the TF-IDF parameters found in previous search and search just the random forest parameters.

In [11]:
pipe2 = make_pipeline(
    TfidfVectorizer(max_features=1000,
                    ngram_range=(1,2),
                    stop_words='english'), 
    RandomForestClassifier()
)

params2 = {
    'randomforestclassifier__n_estimators': [50, 100, 500],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_depth': [2, 3, 4, 5],
    'randomforestclassifier__ccp_alpha': [.01, .1, 1, 10]
}
grid2 = GridSearchCV(pipe2, params2, n_jobs=-1)

In [12]:
grid2.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(max_features=1000,
                                                        ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__ccp_alpha': [0.01, 0.1, 1,
                                                               10],
                         'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'randomforestclassifier__max_depth': [2, 3, 4, 5],
                         'randomforestclassifier__n_estimators': [50, 100,
                                                                  500]})

In [13]:
grid2.score(X_train, y_train)

0.7416198411160628

In [14]:
grid2.score(X_test, y_test)

0.7445510026155188

In [15]:
grid2.best_params_

{'randomforestclassifier__ccp_alpha': 0.01,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': 5,
 'randomforestclassifier__n_estimators': 500}

#### Conclusions
Further parameter searching of the random forest eliminated overfitting, but the accuracy of the test set was slightly reduced.  Further parameter searching could improve accuracy, but it doesn't appear likely that it will perform significantly better than other models.