<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 3: Web APIs & NLP

## Contents
- [Logistic Regression](#Logistic-Regression)
- [Random Forest](#Random-Forest)
- [SVM](#SVM)
- [Model Evaluation](#Model-Evaluation)
- [Conclusions & Recommendations](#Conclusions-&-Recommendations)

--- 
# Part 5 : Modeling & Results

--- 

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from skopt.space import Integer, Real, Categorical
from scipy.stats import uniform, loguniform
import json

In [60]:
# Read in CSV & create df

df = pd.read_csv('./data/cleaned_subreddit_data.csv')

In [61]:
# Read in custom stop words json as a list

with open('./data/more_stopwords.json', 'r') as infile:
    more_stopwords = json.load(infile)
more_stopwords = more_stopwords['words']

In [62]:
df.head()

Unnamed: 0,author,num_comments,score,subreddit,timestamp,all_text
0,Erahth,12,1,MaliciousCompliance,2021-08-01 00:11:01,one more sip so this just happened my 3 5yo so...
1,MorrisonsLament,39,1,MaliciousCompliance,2021-08-01 00:12:45,you can t fire me but you can make me stop wo...
2,infiniteknights,215,1,MaliciousCompliance,2021-08-01 00:21:24,personal responsibility ok i ve been doing al...
3,SimRayB,19,1,MaliciousCompliance,2021-08-01 00:42:46,you put all of those in your mouth or you can ...
4,CSPhCT,28,1,MaliciousCompliance,2021-08-01 06:01:07,patient wants what he wants so i just need to ...


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11189 entries, 0 to 11188
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        11189 non-null  object
 1   num_comments  11189 non-null  int64 
 2   score         11189 non-null  int64 
 3   subreddit     11189 non-null  object
 4   timestamp     11189 non-null  object
 5   all_text      11189 non-null  object
dtypes: int64(2), object(4)
memory usage: 524.6+ KB


In [64]:
# Create dummified target column

#df['subreddits'] = df['subreddit'].map({'MaliciousCompliance':0, 'pettyrevenge':1, 'ProRevenge':2})

In [65]:
# Baseline
df['subreddit'].value_counts(normalize = True)

MaliciousCompliance    0.473948
pettyrevenge           0.300295
ProRevenge             0.225757
Name: subreddit, dtype: float64

In [70]:
# Setting up our data for modeling

X = df['all_text']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

## Logistic Regression

In [72]:
# Set a pipeline
pipe = Pipeline([
    ('ss', StandardScaler(with_mean=False) ),
    ('cvec', CountVectorizer(stop_words = more_stopwords) ),
    ('lr', LogisticRegression(penalty='l1', solver='liblinear', C = 1, max_iter = 5000, random_state=42))
])

# Search over the following values of hyperparameters:
pipe_params = {
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [2, 4],
    'cvec__max_df': [.8, .9],
    'cvec__ngram_range': [(1,1), (1,2)]
}

# Instantiate GridSearchCV
gs = GridSearchCV(pipe, 
                  pipe_params, 
                  cv =5) 

In [73]:
# Fit GridSearch to training data

gs.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(

Traceback (most recent call last):
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(

Traceback (most recent call last):
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/emilysiegel/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(

ValueError: could not convert string to float: 'no phone use in the aircraft operating area no exceptions fine we will do it your way alright this is posted again due to popular demand from a similar phone based malicious pliance story that i responded to there will be a tl dr at the bottom of note i am on mobile and college educated so all formatting and spelling errors are my own anyways here we go before i had all of my pilot ratings and my job flying i was working as a quality control supervisor for a fueling pany at a major airport in the us my job had me as the person ultimately on the hook for the correct and safe operation of all fuel equipment for the pany as well as one of two responsible for the personnel training to use said equipment basically 80 pieces of machinery and 160 people were my problem at any given time sure i had bosses that had responsibilities but the poop ran downhill to me and the fuelers bitched up to my level i got it from both sides standard policy for being on the aoa aircraft operating area i e the ramp is absolutely no cell phone use this is for safety as that environment is full of hazards and even a momentary distraction can be disastrous well as i supervised the entire operation i couldn t simultaneously monitor the 6 different radio channels we had as such my work phone was routinely my munication device as i would get calls from fuelers leads management maintenance airline operations departments fire inspector parts suppliers etc heck i even had my own secret service agent to call for vip flights not just air force one but any visiting head of state to say my work phone was an electronic leash would be an understatement i was standing near the terminal building in an area far removed from aircraft but still part of the aoa and the ramp patrol airport pliance officers essentially saw me didn t care for my reasons and issued me a citation get too many of these and your id is revoked effectively getting yourself fired ok no phone on the ramp gotcha covered boss pretty soon airline reps couldn t reach me and as such aircraft delays went up the fire inspector couldn t reach me so equipment was tagged out of service for minor things easily fixed the airport duty manager adm person in charge of the operations of the airport couldn t reach me so all hell broke loose i told everyone when asked that i was merely plying with the rules as i didn t want any more points on my id as i prided myself on rules pliance sort of have to when your business card says quality control supervisor about three weeks into all of this with seven delays 10nin a year is a bad thing already hitting us the adm calls me and asks if i was available that afternoon i go up to his office and he explains that they are amending the rules so as to allow the use of cellphones in designated areas away from aircraft as well as when in a stationary vehicle also my citation would be wiped now would i please start carrying my phone i laughed for a while at that small victory tl dr work provided me a work phone and it was constantly being used to run the qualoty pliance of a pany fueling aircraft at a major us airport airport rules say no phones on the tarmac area no exceptions after three weeks of me being unreachable and that causing millions of dollars in contract fines the airport authority amends the policy to allow phones in certain safe areas '

In [None]:
# What's the best score?

gs.score(X_train, y_train), gs.score(X_test, y_test)

## Random Forest

## SVM

In [None]:
rs_pipe = Pipeline([
    ('ss',StandardScaler()),
    ('svc', SVC(random_state = 42))
])

rs_params = {
    'svc__C': loguniform(1e-5,1e+2), # was: np.logspace(-5,2, 10)
    'svc__kernel': ['poly','rbf'],
    'svc__gamma': ['scale','auto'],
    'svc__degree': list(np.linspace(2,10,9)), # same as before because we need integers
    'svc__coef0': uniform(0,1), # was: np.linspace(0,1, 5),
    'svc__shrinking': [True, False],   
}

svc_rs = RandomizedSearchCV(estimator = rs_pipe,
                     param_distributions = rs_params,
                     scoring = 'f1_weighted',
                     n_iter = 2000,
                     n_jobs = -2,
                     cv = 5,
                     verbose = 1)

In [None]:
svc_rs.fit(X_train, y_train)

In [None]:
svc_rs.best_score_

In [None]:
plot_confusion_matrix(svc_rs.best_estimator_, X_train, y_train)
plt.title('Confusion Matrix: Train Data (RS)');

In [None]:
plot_confusion_matrix(svc_rs.best_estimator_, X_test, y_test)
plt.title('Confusion Matrix: Test Data (RS)');

In [None]:
cs_rs = svc_rs.cv_results_['param_svc__C'].data

In [None]:
cv_scores_rs = svc_rs.cv_results_['mean_test_score']

In [None]:
svc_rs.best_params_

In [None]:
plt.figure(figsize=(10,10))
sns.scatterplot(cs_rs, cv_scores_rs)
plt.title('Values of C vs Cross-Validated Test F1 Score')
plt.ylim(0,1)
plt.xscale('log')
plt.xlabel('C (log scale)')
plt.ylabel('CV Test F1')
plt.vlines(svc_rs.best_params_['svc__C'], 0, 1, color='red');

## Model Evaluation

## Conclusions & Recommendations