In [1]:
import pandas as pd
import warnings
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [2]:
#Ignoring warnings about memory from running gridsearch with n_jobs=-1
warnings.filterwarnings('ignore')

In [3]:
fitness_df = pd.read_csv('../Data/fitness_clean.csv', index_col=0)
bodyweight_df = pd.read_csv('../Data/bodyweight_clean.csv', index_col=0)

all_posts = pd.concat([fitness_df, bodyweight_df], ignore_index=True)

In [4]:
# Baseline
all_posts['subreddit'].value_counts(normalize=True)

bodyweightfitness    0.512388
Fitness              0.487612
Name: subreddit, dtype: float64

In [5]:
X = all_posts['selftext']
y = all_posts['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Grid Search Basic MultinomialNB

In [6]:
pipe = make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False), MultinomialNB())
params = {
    'tfidfvectorizer__max_features': [None, 400, 800, 1000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2)],
    'tfidfvectorizer__stop_words': [None, 'english'],
    'multinomialnb__alpha': [0.001, 0.01, 1]
}
grid = GridSearchCV(pipe, params, n_jobs=-1)

In [7]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'multinomialnb__alpha': [0.001, 0.01, 1],
                         'tfidfvectorizer__max_features': [None, 400, 800,
                                                           1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                         'tfidfvectorizer__stop_words': [None, 'english']})

In [8]:
grid.score(X_train, y_train)

0.7872505328424724

In [9]:
grid.score(X_test, y_test)

0.7858180761406568

In [10]:
grid.best_params_

{'multinomialnb__alpha': 0.001,
 'tfidfvectorizer__max_features': 1000,
 'tfidfvectorizer__ngram_range': (1, 2),
 'tfidfvectorizer__stop_words': 'english'}

In [11]:
imports = pd.DataFrame(
    {
        'feature': grid.best_estimator_.named_steps['tfidfvectorizer'].get_feature_names(),
        'coef': grid.best_estimator_.named_steps['multinomialnb'].coef_[0]
    }
)

In [12]:
imports.sort_values(by='coef', ascending=False).head(10).T

Unnamed: 0,226,917,675,496,455,924,678,683,267,117
feature,doing,ups,pull,like,just,ve,pull ups,push,exercises,body
coef,-5.3992,-5.41054,-5.44875,-5.53987,-5.54891,-5.63207,-5.64707,-5.69612,-5.71262,-5.72749


In [13]:
imports.sort_values(by='coef', ascending=False).tail(10).T

Unnamed: 0,137,535,530,168,425,426,989,415,414,416
feature,cable,maintenance,machines,com,imgur,imgur com,www,https imgur,https,https www
coef,-8.54979,-8.73315,-8.82389,-8.99249,-9.43113,-10.4292,-20.4099,-20.4099,-20.4099,-20.4099


#### Results
Basic MultinomialNB scores similarly to many of the other models tested on accuracy.  Looking at the feature importances based off of the coefficients, we see many terms related to fitness and bodyweight fitness.  However, without preprocessing there are also many general terms with high influence likely due to their prevelence in the posts.

## Custom Preprocessing

In [18]:
def my_preprocessor(text):
    """Process text for use with vectorizer.
    
    Takes in a string and modifies it for use with count or tf-idf
    vectorizer.  It will set all characters to lowercase, remove and urls,
    remove the apostrophe from contractions, and replace non-alphanumeric
    characters with a space.
    
    Args:
        text (string): string to be processed
    
    Returns:
        string: processed text string
    """
    text = text.lower()
    text = re.sub('http\S+|www.\S+', '', text)
    text = text.replace("'", '')
    text = ''.join(w if w.isalnum() or w == ' ' else ' ' for w in text)
    
    return text

In [17]:
pipe2 = make_pipeline(TfidfVectorizer(preprocessor=my_preprocessor), 
                      StandardScaler(with_mean=False), MultinomialNB())
grid2 = GridSearchCV(pipe2, params, n_jobs=-1)

In [18]:
grid2.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(preprocessor=<function my_preprocessor at 0x7fe4111cbee0>)),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'multinomialnb__alpha': [0.001, 0.01, 1],
                         'tfidfvectorizer__max_features': [None, 400, 800,
                                                           1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                         'tfidfvectorizer__stop_words': [None, 'english']})

In [19]:
grid2.score(X_train, y_train)

0.7834721953109862

In [20]:
grid2.score(X_test, y_test)

0.7646033129904097

In [21]:
grid2.best_params_

{'multinomialnb__alpha': 0.001,
 'tfidfvectorizer__max_features': 1000,
 'tfidfvectorizer__ngram_range': (1, 2),
 'tfidfvectorizer__stop_words': 'english'}

## Custom Tokenizer
Testing model with both a lemmatizer and stemmer

In [19]:
def my_lemmatizer(text):
    """lemmatizer wrapper for use with vectorizer"""
    wnet = WordNetLemmatizer()
    return [wnet.lemmatize(w) for w in word_tokenize(text)]

In [20]:
def my_stemmer(text):
    """PoerterStemmer wrapper for use with vectorizer"""
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

In [21]:
# apply same pre-processing as posts
# inspiration from https://stackoverflow.com/questions/50155188/lemmatization-on-countvectorizer-doesnt-remove-stopwords
wnet = WordNetLemmatizer()
lem_stopwords = [wnet.lemmatize(w.replace("'",'')) for w in stopwords.words('english')]

stemmer = PorterStemmer()
stem_stopwords = [stemmer.stem(w.replace("'",'')) for w in stopwords.words('english')]

In [22]:
pipe3 = make_pipeline(
    TfidfVectorizer(preprocessor=my_preprocessor,
                    tokenizer=my_lemmatizer), 
    StandardScaler(with_mean=False), 
    MultinomialNB()
)

params3 = {
    'tfidfvectorizer__max_features': [None, 400, 800, 1000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2)],
    'tfidfvectorizer__stop_words': [None, lem_stopwords],
    'multinomialnb__alpha': [0.001, 0.01, 1]
}

grid3 = GridSearchCV(pipe3, params3, n_jobs=-1)

In [23]:
grid3.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(preprocessor=<function my_preprocessor at 0x7fe9191fbc10>,
                                                        tokenizer=<function my_lemmatizer at 0x7fe90820fe50>)),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'multinomialnb__alpha': [0.001, 0.01, 1],
                         'tfidfvectorizer__max_features': [None, 400, 800,
                                                           1000],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                         'tfidfvectorizer__stop_words': [None,
                                                         ['i', 'me', 'my',
                                                          'm

In [24]:
grid3.score(X_train, y_train)

0.7757217593489634

In [25]:
grid3.score(X_test, y_test)

0.7695437372856728

Best Params:  
{'multinomialnb__alpha': 0.001,  
 'tfidfvectorizer__max_features': 1000,  
 'tfidfvectorizer__ngram_range': (1, 2),  
 'tfidfvectorizer__stop_words': lem_stopwords}

In [27]:
imports3 = pd.DataFrame(
    {
        'feature': grid3.best_estimator_.named_steps['tfidfvectorizer'].get_feature_names(),
        'coef': grid3.best_estimator_.named_steps['multinomialnb'].coef_[0]
    }
)

In [28]:
imports3.sort_values(by='coef', ascending=False).head(10).T

Unnamed: 0,925,694,448,520,300,365,697,985,979,628
feature,ups,pull,im,like,exercise,get,pull ups,would,workout,one
coef,-5.49964,-5.53252,-5.60964,-5.6235,-5.65611,-5.7207,-5.73522,-5.75209,-5.76159,-5.769


In [29]:
imports3.sort_values(by='coef', ascending=False).tail(10).T

Unnamed: 0,119,156,550,50,150,747,240,623,179,556
feature,ate,bike,machine,4 x,bf,rice,db,ohp,cable,maintenance
coef,-8.1613,-8.26243,-8.26503,-8.26856,-8.33862,-8.35739,-8.46929,-8.59105,-8.74949,-8.81721


### Results
Adding in custom preprocessing and lemmetization seems to slightly decrease the models accuracy, but remove many of the general terms from the most important features based on the log probabilities of each feature.  Some of the features identified this way are terms that one would expect to be more associated with fitness or bodyweight fitness, but many still seem fairly generalized.