In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
#Ignoring warnings about memory from running gridsearch with n_jobs=-1
warnings.filterwarnings('ignore')

In [3]:
fitness_df = pd.read_csv('../Data/fitness_clean.csv', index_col=0)
bodyweight_df = pd.read_csv('../Data/bodyweight_clean.csv', index_col=0)

all_posts = pd.concat([fitness_df, bodyweight_df], ignore_index=True)

In [5]:
# null model accuracy
all_posts['subreddit'].value_counts(normalize=True)

bodyweightfitness    0.512388
Fitness              0.487612
Name: subreddit, dtype: float64

In [6]:
X = all_posts['selftext']
y = all_posts['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Logistic Regression

In [7]:
pipe = make_pipeline(TfidfVectorizer(stop_words='english'), StandardScaler(with_mean=False), LogisticRegression(max_iter=10_000))
params = {
    'tfidfvectorizer__max_features': [800, 1000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2)],
    'logisticregression__C': [0.001, 0.01, 0.1]
}
grid = GridSearchCV(pipe, params, n_jobs=-1)

In [8]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(stop_words='english')),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.001, 0.01, 0.1],
                         'tfidfvectorizer__max_features': [800, 1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)]})

In [9]:
grid.score(X_train, y_train)

0.8156365045533811

In [10]:
grid.score(X_test, y_test)

0.7846556233653008

In [11]:
grid.best_params_

{'logisticregression__C': 0.001,
 'tfidfvectorizer__max_features': 800,
 'tfidfvectorizer__ngram_range': (1, 1)}

# KNN

In [12]:
pipe = make_pipeline(TfidfVectorizer(stop_words='english'), StandardScaler(with_mean=False), KNeighborsClassifier())
params = {
    'tfidfvectorizer__max_features': [800, 1000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2)],
    'kneighborsclassifier__n_neighbors': [3, 5, 15]
}
grid = GridSearchCV(pipe, params, n_jobs=-1)

In [13]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(stop_words='english')),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'kneighborsclassifier__n_neighbors': [3, 5, 15],
                         'tfidfvectorizer__max_features': [800, 1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)]})

In [14]:
grid.score(X_train, y_train)

0.5863204805270297

In [15]:
grid.score(X_test, y_test)

0.5553618134263295

# GridSearch Multiple Estimators

In [31]:
pipe = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', max_features=1000)),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', MultinomialNB())
])

In [32]:
params = [
    {
        'clf': [MultinomialNB()]
    }, {
        'clf': [DecisionTreeClassifier()]
    }, {
        'clf': [RandomForestClassifier()]
    }, {
        'clf': [SVC()]
    }
]

In [33]:
grid = GridSearchCV(pipe, params, n_jobs=-1)

In [34]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(max_features=1000,
                                                        stop_words='english')),
                                       ('scaler',
                                        StandardScaler(with_mean=False)),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid=[{'clf': [MultinomialNB()]},
                         {'clf': [DecisionTreeClassifier()]},
                         {'clf': [RandomForestClassifier()]},
                         {'clf': [SVC()]}])

In [35]:
grid.score(X_train, y_train)

0.9975779887618679

In [36]:
grid.score(X_test, y_test)

0.7916303400174368

In [38]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.802243,0.040275,0.219029,0.004551,MultinomialNB(),{'clf': MultinomialNB()},0.753511,0.77046,0.763081,0.747578,0.777132,0.762352,0.010781,3
1,2.494947,0.028726,0.200596,0.006785,DecisionTreeClassifier(),{'clf': DecisionTreeClassifier()},0.686683,0.709927,0.717539,0.694283,0.715116,0.70471,0.012114,4
2,7.596085,0.064968,0.208224,0.015616,RandomForestClassifier(),{'clf': RandomForestClassifier()},0.784504,0.79661,0.782946,0.772287,0.783915,0.784052,0.007715,1
3,15.062987,0.128496,2.934934,0.034718,SVC(),{'clf': SVC()},0.764649,0.776755,0.775678,0.745155,0.77907,0.768261,0.012577,2


## Conclusions
Since the task is to predict which of two subreddits a given post belongs to with no preference for either, I will use basic accuracy as my primary metric.  In addition, the classes are well balanced which also lends itself to using accuracy as the main metric.  Based on the accuracy scores, I will focus on LogisticRegression, MultinomialNB, RandomForestClassifier, and SVC. 