In [4]:
import re
import warnings

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [5]:
# Read the cleaned posts for each subreddit; the first CSV column is used as
# the row index.
fitness_df, bodyweight_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Data/fitness_clean.csv', '../Data/bodyweight_clean.csv')
)

# Stack both frames into a single frame with a fresh 0..n-1 index.
all_posts = pd.concat((fitness_df, bodyweight_df), ignore_index=True)

In [6]:
# Null model accuracy: the share of each subreddit label in the combined data.
# Always predicting the most frequent label would score the larger of these
# proportions, so that is the baseline any classifier has to beat.
all_posts['subreddit'].value_counts(normalize=True)

bodyweightfitness    0.512388
Fitness              0.487612
Name: subreddit, dtype: float64

In [7]:
# Features are the `selftext` column; the target is the `subreddit` label.
X = all_posts['selftext']
y = all_posts['subreddit']

# Fix the seed so the split (and therefore every score computed from it) is
# reproducible across kernel restarts, and stratify on the label so the train
# and test splits keep the same class proportions as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [12]:
# Text-classification pipeline: tf-idf over unigrams and bigrams (at most 1000
# features, English stop words removed) -> scaling without centering -> SVC.
# The step names are the ones make_pipeline would generate, so the
# 'svc__...' parameter keys below address the final step.
pipe = Pipeline(steps=[
    ('tfidfvectorizer', TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=1000,
    )),
    ('standardscaler', StandardScaler(with_mean=False)),
    ('svc', SVC()),
])

# Search space: 10 log-spaced candidates each for C (1e-3 .. 1e2) and
# gamma (1e-5 .. 1e2), i.e. 100 combinations in total.
params = dict(
    svc__C=np.logspace(-3, 2, num=10),
    svc__gamma=np.logspace(-5, 2, num=10),
)

# Grid search over every combination; n_jobs=-1 runs the fits in parallel.
grid = GridSearchCV(estimator=pipe, param_grid=params, n_jobs=-1)

In [13]:
# Run the grid search on the training split only; the test split stays held out.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so this
# run did not finish. The score/best_params_ cells that follow carry earlier
# execution counts and appear to come from a previous search; re-run to confirm.
grid.fit(X_train, y_train)



KeyboardInterrupt: 

In [9]:
# Score of the fitted grid search on the training split; compare with the
# test-split score to gauge how much the model overfits.
grid.score(X_train, y_train)

0.9969967060647161

In [10]:
# Score of the fitted grid search on the held-out test split.
grid.score(X_test, y_test)

0.7916303400174368

In [11]:
# Parameter combination selected by the grid search.
# NOTE(review): the saved output lists 'svc__kernel' and the values C=10,
# gamma=2, none of which are in the `params` grid defined in this notebook,
# so this output presumably comes from an earlier search; re-run to refresh.
grid.best_params_

{'svc__C': 10, 'svc__gamma': 2, 'svc__kernel': 'rbf'}

## Custom Preprocessor

In [7]:
def my_preprocessor(text):
    """Process text for use with a vectorizer.

    Takes in a string and modifies it for use with a count or tf-idf
    vectorizer.  It sets all characters to lowercase, removes any URLs
    (tokens starting with "http" or "www."), removes the apostrophe from
    contractions, and replaces every remaining character that is neither
    alphanumeric nor a plain space with a space.

    Args:
        text (string): string to be processed

    Returns:
        string: processed text string
    """
    text = text.lower()
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes.  The dot after "www" is escaped so that only literal "www."
    # prefixes are stripped; unescaped, it also deleted ordinary text such as
    # "awwwesome".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Drop apostrophes outright so contractions stay a single token
    # ("don't" -> "dont") instead of being split by the next step.
    text = text.replace("'", '')
    # Punctuation and whitespace other than ' ' (tabs, newlines) become spaces.
    text = ''.join(ch if ch.isalnum() or ch == ' ' else ' ' for ch in text)

    return text

In [18]:
# Same pipeline shape as before, but with the custom text preprocessor and the
# SVC hyperparameters fixed to C=10, gamma=2 (the values shown in the saved
# best_params_ output) instead of being searched.
# NOTE(review): max_features is 400 here versus 1000 in `pipe`, so a score
# difference between the two pipelines is not due to the preprocessor alone.
pipe2 = Pipeline(steps=[
    ('tfidfvectorizer', TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=400,
        preprocessor=my_preprocessor,
    )),
    ('standardscaler', StandardScaler(with_mean=False)),
    ('svc', SVC(gamma=2, C=10)),
])

In [19]:
# Fit the preprocessor-based pipeline on the training split only.
pipe2.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=400, ngram_range=(1, 2),
                                 preprocessor=<function my_preprocessor at 0x7f8aadcc8310>,
                                 stop_words='english')),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('svc', SVC(C=10, gamma=2))])

In [20]:
# Score on the training split; compare with the test-split score to gauge
# how much the model overfits.
pipe2.score(X_train, y_train)

0.9968998256151909

In [21]:
# Score on the held-out test split.  The saved value (~0.51) is about the
# same as the majority-class share from the null-model cell, i.e. no better
# than always predicting one subreddit.
pipe2.score(X_test, y_test)

0.5106073815751235

#### Results
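Without any custom preprocessing, the SVC model is one of the highest-scoring models tested so far (test accuracy of about 0.79). However, it appears to get worse with more processing of the data: with the custom preprocessor the test accuracy drops to about 0.51, roughly the null-model baseline. Note that the second pipeline also lowers `max_features` from 1000 to 400, so the drop cannot be attributed to the preprocessing alone. In both cases the near-perfect training accuracy indicates heavy overfitting, and the model is not easily interpretable.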