# Project 3: Web APIs & Classification
### Notebook 3: Modeling

In [1]:
### import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

#To visualize the whole grid
pd.options.display.max_columns = 999

  import pandas.util.testing as tm


## 6. Modeling

### 6.1 Reload the cleaned post dataframe

In [2]:
X_train = pd.read_csv('../datasets/X_train.csv', index_col=0)
X_test = pd.read_csv('../datasets/X_test.csv', index_col=0)
y_train = pd.read_csv('../datasets/y_train.csv', index_col=0)
y_test = pd.read_csv('../datasets/y_test.csv', index_col=0)

In [3]:
### Drop the feature ['post_words'], that used to count the number of word in the post as it will not be used in modeling
### change the y_train, y_test to series
X_train_post = X_train['cleaned_post']
X_test_post = X_test['cleaned_post']
y_train = y_train['subreddit']
y_test = y_test['subreddit']

### 6.2 Simple model
Use the naive CountVectorizer and logistic regression to get the simple model.

Results:
- Train score : 1.0
- Test score : 0.94
There seems to have overfit. However, by just using the CountVectorizer and logistic regression model with default hyperparameter, it is able to predict well which subreddit the post came from.


In [4]:
#Instantiate and fit the model with CountVectorizer with stop words enable
cvec_naive = CountVectorizer(stop_words = 'english')
# fit & transform
X_train_cvec_naive = cvec_naive.fit_transform(X_train['cleaned_post']).todense()
X_test_cvec_naive = cvec_naive.transform(X_test['cleaned_post']).todense()

In [5]:
# Instantiate Logistic Regression and fit the model
lr = LogisticRegression()
lr.fit(X_train_cvec_naive, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Get the score from simple logistic regression
print(f" Train score: {lr.score(X_train_cvec_naive, y_train)}")
print(f" Test score: {lr.score(X_test_cvec_naive, y_test)}")

 Train score: 1.0
 Test score: 0.9472693032015066


In [113]:
### There are total of 25730 feature words
a = cvec_naive.get_feature_names()
len(a)

25730

### 6.3 GridSearchCV to find the optimal hyperparameters

Vectorizer extraction techniques to use:
- CountVectorizer
- TfidfVectorizer

For each vectorizer, classification models to evaluate:
- Multinomial Naive Bayes
- Logistic Regression

### 6.3.1 CountVectorizer

#### 6.3.1.1 Logistic Regression
Run a CountVectorizer and Logistic Regression regression in a pipeline and grid search.

In [7]:
### create the pipeline to include all the step needed to run

pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words = 'english')),
    ('lr', LogisticRegression()),
    ])

In [8]:
### set the params that include the hyperparameter that want to optimize
pipe_params = {
    'cvec__max_features': [1000, 2000, 6000],
    'cvec__min_df': [2, 3],   
    'cvec__max_df': [.5, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'lr__penalty' : ['l1', 'l2']
}

In [9]:
### Gridsearch
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, verbose = 1)
gs.fit(X_train_post, y_train)


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.






[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:  3.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [120]:
### get the best score and its corresponding optimal parameter from the gridsearch
print(f'Best score from Vectorizer and lr: {gs.best_score_}')
gs.best_params_        # get the optimal hyperparameters

Best score from Vectorizer and lr: 0.9369951534733441


{'cvec__max_df': 0.5,
 'cvec__max_features': 6000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'lr__penalty': 'l2'}

In [58]:
### get the Accuracy score for both training set and test set data
print(f'Train accuracy: {gs.score(X_train_post, y_train)}')
print(f'Test accuracy: {gs.score(X_test_post, y_test)}')

Train accuracy: 1.0
Test accuracy: 0.943502824858757


In [55]:
### Review the coefficient of the model (cvec + logistic regression)
cvec_lr_coef_name = gs.best_estimator_[0].get_feature_names()      # best_estimator_[0] store 'cvec'
cvec_lr_coef = gs.best_estimator_[1].coef_[0]                   #best_estimator_[1] store 'lr'

cvec_lr_coef = list(zip(cvec_lr_coef_name, cvec_lr_coef))

cvec_lr_coef_df = pd.DataFrame(cvec_lr_coef, columns = ['word','coef'])

In [56]:
### The coefficient that best predict the subreddit 'Nosleep' (y-target =1)
cvec_lr_coef_df.sort_values(by = 'coef', ascending = False)[:15].T

Unnamed: 0,3856,4944,428,5709,5508,3459,1077,389,2471,4710,1196,2924,4670,3228,1549
word,police,started,better,voice,tried,new,creature,began,hour,skin,dead,let,silence,matter,end
coef,0.409864,0.374562,0.368311,0.35967,0.349282,0.335948,0.325015,0.321482,0.321106,0.309593,0.307868,0.296743,0.29277,0.288007,0.287729


In [57]:
### The coefficient that best predict the subreddit 'Thetrueishere' (y-target =0)
cvec_lr_coef_df.sort_values(by = 'coef', ascending = True)[:15].T

Unnamed: 0,1644,3886,5825,3667,1547,2285,5690,3753,5732,85,2490,5990,381,3344,2859
word,experience,post,weird,paranormal,encounter,happened,video,person,walk,ago,http,youtu,bedroom,mom,later
coef,-0.748973,-0.414891,-0.356698,-0.354782,-0.347735,-0.313096,-0.28005,-0.266662,-0.262718,-0.261741,-0.257834,-0.256161,-0.242709,-0.236814,-0.236279


In [60]:
#### Check confusion matrix results, which consist of TP, TN, FP, FN
#tn1,fp1,fn1,tp1 = confusion_matrix(y_test, lr_optimal.predict(X_test_cvec)).ravel()
tn1,fp1,fn1,tp1 = confusion_matrix(y_test, gs.predict(X_test_post)).ravel()
print(f'True Positive, tp: {tp1}')
print(f'False Positive, fp: {fp1}')
print(f'True Negative, tn: {tn1}')
print(f'False Negative, fn: {fn1}')
print(f'Sensitivity : {tp1/(tp1+fn1)}')
print(f'Specificity : {tn1/(tn1+fp1)}')
print(f'roc_auc_score : {roc_auc_score(y_test, gs.predict(X_test_post))}')

True Positive, tp: 228
False Positive, fp: 8
True Negative, tn: 273
False Negative, fn: 22
Sensitivity : 0.912
Specificity : 0.9715302491103203
roc_auc_score : 0.94176512455516


#### 6.3.1.2  Multinomial Naive Bayes
By using Vectorizer, the features column are all interger counts, thus MultinomialNB model is chosen.

In [61]:
### create the pipeline to include all the step needed to run
### Use the optimal cvec parameter in the pipeline

pipe_mnb = Pipeline([
    ('cvec', CountVectorizer( stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =2,
                                ngram_range = (1,2))),
    ('mnb', MultinomialNB()),
    ])

In [62]:
### set the params that include the hyperparameter that want to optimize
pipe_mnb_params = {'mnb__alpha':[0.01, 0.5, 1.0]}


In [63]:
gs_mnb = GridSearchCV(pipe_mnb, param_grid=pipe_mnb_params, cv=3, verbose = 1)
gs_mnb.fit(X_train_post, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   24.8s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=6000,
                                                        min_df=2,
                                                        ngram_range=(1, 2),
                                            

In [134]:
### get the best score and its corresponding optimal parameter from the gridsearch
print(f'Best score from Vectorizer and mnb: {gs_mnb.best_score_}')
gs_mnb.best_params_        # get the optimal hyperparameters

Best score from Vectorizer and mnb: 0.9192245557350566


{'mnb__alpha': 0.5}

In [64]:
# get the Accuracy score for both training set and test set data
print(f'Train accuracy: {gs_mnb.score(X_train_post, y_train)}')
print(f'Test accuracy: {gs_mnb.score(X_test_post, y_test)}')

Train accuracy: 0.9547657512116317
Test accuracy: 0.9209039548022598


In [72]:
### Review the coefficient of the model (cvec + naive_bayes.MultinomialNB)
cvec_mnb_coef_name = gs_mnb.best_estimator_[0].get_feature_names()
cvec_mnb_coef = gs_mnb.best_estimator_[1].coef_[0]

cvec_mnb_coef = list(zip(cvec_mnb_coef_name, cvec_mnb_coef))

cvec_mnb_coef_df = pd.DataFrame(cvec_mnb_coef, columns = ['word','coef'])

In [73]:
### There are the coefficient that best predict the subreddit 'Nosleep' (y-target =1)
cvec_mnb_coef_df.sort_values(by = 'coef', ascending = False)[:15].T

Unnamed: 0,1382,4305,2273,1697,3181,2320,3075,1794,3733,3163,230,3065,3026,4944,5548
word,door,room,hand,face,man,head,look,felt,people,make,asked,long,little,started,turned
coef,-5.23236,-5.34431,-5.47767,-5.54744,-5.55288,-5.60381,-5.65143,-5.7092,-5.77959,-5.79567,-5.81437,-5.84552,-5.85203,-5.85448,-5.85941


In [74]:
### There are the coefficient that best predict the subreddit 'Thetrueishere' (y-target =0)
cvec_mnb_coef_df.sort_values(by = 'coef', ascending = True)[:15].T

Unnamed: 0,5999,3808,456,3668,5402,5401,3117,2556,2557,4575,2492,1992,5766,2538,5874
word,zozo,placebo,black man,paranormal experience,tl dr,tl,love hear,imgur,imgur com,share experience,http imgur,ft,wanted share,idk,wiki
coef,-13.6555,-13.6555,-13.6555,-13.6555,-13.6555,-12.5569,-12.5569,-12.5569,-12.5569,-12.5569,-12.5569,-12.5569,-12.046,-12.046,-12.046


In [75]:
#### Check confusion matrix results, which consist of TP, TN, FP, FN
tn2,fp2,fn2,tp2 = confusion_matrix(y_test, gs_mnb.predict(X_test_post)).ravel()
print(f'True Positive, tp: {tp2}')
print(f'False Positive, fp: {fp2}')
print(f'True Negative, tn: {tn2}')
print(f'False Negative, fn: {fn2}')
print(f'Sensitivity : {tp2/(tp2+fn2)}')
print(f'Specificity : {tn2/(tn2+fp2)}')
print(f'roc_auc_score : {roc_auc_score(y_test, gs_mnb.predict(X_test_post))}')

True Positive, tp: 227
False Positive, fp: 19
True Negative, tn: 262
False Negative, fn: 23
Sensitivity : 0.908
Specificity : 0.9323843416370107
roc_auc_score : 0.9201921708185052


### 6.3.2 TfidfVectorizer

#### 6.3.2.1 Logistic Regression
Run a TfidfVectorizer and Logistic Regression regression in a pipeline and grid search.

In [76]:
### create the pipeline to include all the step needed to run

pipe2 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words = 'english')),
    ('lr', LogisticRegression()),
    ])

In [77]:
### set the params that include the hyperparameter that want to optimize
pipe2_params = {
    'tvec__max_features': [1000, 2000, 6000],
    'tvec__min_df': [2, 3],   
    'tvec__max_df': [.5, .95],
    'tvec__ngram_range': [(1,1), (1,2)],
    'lr__penalty' : ['l1', 'l2']
}

In [78]:
### Gridsearch
gs2 = GridSearchCV(pipe2, param_grid=pipe2_params, cv=3, verbose = 1)
gs2.fit(X_train_post, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.






[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:  4.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [79]:
### get the best score and its corresponding optimal parameter from the gridsearch
print(f'Best score from TfidfVectorizer and lr: {gs2.best_score_}')
gs2.best_params_        # get the optimal hyperparameters

Best score from TfidfVectorizer and lr: 0.9410339256865913


{'lr__penalty': 'l2',
 'tvec__max_df': 0.5,
 'tvec__max_features': 6000,
 'tvec__min_df': 3,
 'tvec__ngram_range': (1, 2)}

In [80]:
### get the Accuracy score for both training set and test set data
print(f'Train accuracy: {gs2.score(X_train_post, y_train)}')
print(f'Test accuracy: {gs2.score(X_test_post, y_test)}')

Train accuracy: 0.9781906300484653
Test accuracy: 0.9378531073446328


In [81]:
### Review the coefficient of the model (tvec + logistic regression)
tvec_lr_coef_name = gs2.best_estimator_[0].get_feature_names()
tvec_lr_coef = gs2.best_estimator_[1].coef_[0]

tvec_lr_coef = list(zip(tvec_lr_coef_name, tvec_lr_coef))

tvec_lr_coef_df = pd.DataFrame(tvec_lr_coef, columns = ['word','coef'])

In [82]:
### The coefficient that best predict the subreddit 'Nosleep' (y-target =1)
tvec_lr_coef_df.sort_values(by = 'coef', ascending = False)[:15].T

Unnamed: 0,1688,2257,5704,482,5418,2305,380,1071,2759,5504,3406,3155,2900,5912,5922
word,face,hand,voice,blood,took,head,began,creature,knew,tried,need,man,let,woman,word
coef,1.67242,1.65893,1.47195,1.36996,1.33651,1.29151,1.29025,1.27412,1.24183,1.23088,1.20773,1.18553,1.13597,1.13006,1.12693


In [83]:
### The coefficient that best predict the subreddit 'Thetrueishere' (y-target =0)
tvec_lr_coef_df.sort_values(by = 'coef', ascending = True)[:15].T

Unnamed: 0,1635,5819,3639,3311,1420,3863,1535,2269,573,1362,1646,5685,4664,1501,1960
word,experience,weird,paranormal,mom,dream,post,encounter,happened,brother,dog,explanation,video,similar,edit,friend
coef,-2.38321,-1.39483,-1.3728,-1.29752,-1.11593,-1.09447,-1.04188,-1.02512,-0.847919,-0.802426,-0.790191,-0.787264,-0.782129,-0.77048,-0.762537


In [84]:
#### Check confusion matrix results, which consist of TP, TN, FP, FN
tn3,fp3,fn3,tp3 = confusion_matrix(y_test, gs2.predict(X_test_post)).ravel()
print(f'True Positive, tp: {tp3}')
print(f'False Positive, fp: {fp3}')
print(f'True Negative, tn: {tn3}')
print(f'False Negative, fn: {fn3}')
print(f'Sensitivity : {tp3/(tp3+fn3)}')
print(f'Specificity : {tn3/(tn3+fp3)}')
print(f'roc_auc_score : {roc_auc_score(y_test, gs2.predict(X_test_post))}')

True Positive, tp: 229
False Positive, fp: 12
True Negative, tn: 269
False Negative, fn: 21
Sensitivity : 0.916
Specificity : 0.9572953736654805
roc_auc_score : 0.9366476868327402


#### 6.3.2.2  Multinomial Naive Bayes
By using TfidfVectorizer, select the MultinomialNB model to proceed with the modeling

In [85]:
### create the pipeline to include all the step needed to run
### Use the optimal cvec parameter in the pipeline

pipe2_mnb = Pipeline([
    ('tvec', TfidfVectorizer( stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =3,
                                ngram_range = (1,2))),
    ('mnb', MultinomialNB()),
    ])

In [86]:
### set the params that include the hyperparameter that want to optimize
pipe2_mnb_params = {'mnb__alpha':[0.01, 0.5, 1.0]}


In [87]:
### GridSearch
gs2_mnb = GridSearchCV(pipe2_mnb, param_grid=pipe2_mnb_params, cv=3, verbose = 1)
gs2_mnb.fit(X_train_post, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   25.7s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=6000,
                                                        min_df=3,
                                                        ngram_range=(1, 2),
                                          

In [88]:
### get the best score and its corresponding optimal parameter from the gridsearch
print(f'Best score from TfidfVectorizer and mnb: {gs2_mnb.best_score_}')
gs2_mnb.best_params_        # get the optimal hyperparameters

Best score from TfidfVectorizer and mnb: 0.932956381260097


{'mnb__alpha': 1.0}

In [89]:
# get the Accuracy score for both training set and test set data
print(f'Train accuracy: {gs2_mnb.score(X_train_post, y_train)}')
print(f'Test accuracy: {gs2_mnb.score(X_test_post, y_test)}')

Train accuracy: 0.9668820678513732
Test accuracy: 0.9303201506591338


In [90]:
### Review the coefficient of the model (tvec + naive_bayes.MultinomialNB)
tvec_mnb_coef_name = gs2_mnb.best_estimator_[0].get_feature_names()
tvec_mnb_coef = gs2_mnb.best_estimator_[1].coef_[0]

tvec_mnb_coef = list(zip(tvec_mnb_coef_name, tvec_mnb_coef))

tvec_mnb_coef_df = pd.DataFrame(tvec_mnb_coef, columns = ['word','coef'])

In [91]:
### The coefficient that best predict the subreddit 'Nosleep' (y-target =1)
tvec_mnb_coef_df.sort_values(by = 'coef', ascending = False)[:15].T

Unnamed: 0,1368,4294,3155,2257,1688,2305,3710,2463,3057,1786,5704,4936,3138,499,2915
word,door,room,man,hand,face,head,people,house,look,felt,voice,started,make,body,life
coef,-6.32245,-6.42297,-6.56355,-6.65716,-6.66813,-6.74485,-6.78098,-6.7871,-6.79402,-6.79897,-6.86093,-6.87573,-6.89779,-6.92302,-6.92437


In [92]:
### The coefficient that best predict the subreddit 'Thetruishere' (y-target =0)
tvec_mnb_coef_df.sort_values(by = 'coef', ascending = True)[:15].T

Unnamed: 0,447,5395,3640,3098,1977,5394,4565,5761,5593,3043,4665,2481,2522,2540,2539
word,black man,tl dr,paranormal experience,love hear,ft,tl,share experience,wanted share,unexplainable,lol,similar experience,http youtu,idk,imgur com,imgur
coef,-9.55292,-9.55292,-9.55292,-9.5386,-9.52975,-9.52517,-9.51894,-9.51642,-9.50983,-9.50793,-9.50754,-9.50533,-9.5021,-9.50174,-9.50174


In [93]:
#### Check confusion matrix results, which consist of TP, TN, FP, FN
tn4,fp4,fn4,tp4 = confusion_matrix(y_test, gs2_mnb.predict(X_test_post)).ravel()
print(f'True Positive, tp: {tp4}')
print(f'False Positive, fp: {fp4}')
print(f'True Negative, tn: {tn4}')
print(f'False Negative, fn: {fn4}')
print(f'Sensitivity : {tp4/(tp4+fn4)}')
print(f'Specificity : {tn4/(tn4+fp4)}')
print(f'roc_auc_score : {roc_auc_score(y_test, gs2_mnb.predict(X_test_post))}')

True Positive, tp: 230
False Positive, fp: 17
True Negative, tn: 264
False Negative, fn: 20
Sensitivity : 0.92
Specificity : 0.9395017793594306
roc_auc_score : 0.9297508896797153


## 7. Model Evaluation

### 7.1 Compile and compare the modeling result with optimal hyperparameters

Results consolidated from **6. Modeling** are:

- Optimal hyperparameter obtained from gridsearch (`Optimal hyperparameter`)
- Accuracy score for train data with optimal hyperparameter (`Train accuracy`)
- Accuracy score for test data with optimal hyperparameter (`Test accuracy`)
- Train accuracy - Test accuracy (`Variance`)
- Confusion matrix for test data from modeling with optimal hyperparameter (`TP`, `TN`, `FP`,`FN`)
- ROC-AUC for test data from modeling with optimal hyperparameter (`ROC_AUC`)
- Test accuracy - baseline accuracy (`Variance from baseline`)


In [102]:
### Consolidate the results listed above

model = ['Logistic Regression', 'MultinomialNB', 'Logistic Regression', 'MultinomialNB']
vectorizer = ['CountVectorizer','CountVectorizer', 'TfidfVectorizer', 'TfidfVectorizer']
optimal_hyperparameter =[gs.best_params_, gs_mnb.best_params_, gs2.best_params_, gs2_mnb.best_params_]

x1, y1, x2, y2 = [X_train_post, y_train, X_test_post, y_test ]
train_accuracy = [gs.score(x1, y1), gs_mnb.score(x1, y1), gs2.score(x1,y1), gs2_mnb.score(x1,y1)]
test_accuracy = [gs.score(x2, y2), gs_mnb.score(x2, y2), gs2.score(x2,y2), gs2_mnb.score(x2,y2)]

tp = [tp1,tp2,tp3,tp4]
tn = [tn1,tn2,tn3,tn4]
fp = [fp1,fp2,fp3,fp4]
fn = [fn1,fn2,fn3,fn4]


In [97]:
variance = []
zip_accuracy = zip(train_accuracy, test_accuracy)
for train, test in zip_accuracy:
    variance.append(train - test)
variance = np.transpose(variance)

In [98]:
roc_auc = [roc_auc_score(y_test, gs.predict(x2)),
          roc_auc_score(y_test, gs_mnb.predict(x2)),
          roc_auc_score(y_test, gs2.predict(x2)),
          roc_auc_score(y_test, gs2_mnb.predict(x2))]

In [105]:
accuracy_ref_baseline = [score - 0.529678 for score in test_accuracy]  

In [106]:
result_compile = list(zip(model, vectorizer, optimal_hyperparameter, train_accuracy, test_accuracy, 
                         variance, accuracy_ref_baseline, tp, tn, fp, fn, roc_auc))

In [107]:
### Convert the compiled results to dataframe:

pd.DataFrame(result_compile, columns = ['Model', 'Vectorizer', 'Optimal hyperparameter', 'Train Accuracy',
                                       'Test Accuracy', 'Variance', 
                                        'Varian ref baseline','TP', 'TN', 'FP', 'FN', 'ROC_AUC'])

Unnamed: 0,Model,Vectorizer,Optimal hyperparameter,Train Accuracy,Test Accuracy,Variance,Varian ref baseline,TP,TN,FP,FN,ROC_AUC
0,Logistic Regression,CountVectorizer,"{'cvec__max_df': 0.5, 'cvec__max_features': 60...",1.0,0.943503,0.056497,0.413825,228,273,8,22,0.941765
1,MultinomialNB,CountVectorizer,{'mnb__alpha': 0.5},0.954766,0.920904,0.033862,0.391226,227,262,19,23,0.920192
2,Logistic Regression,TfidfVectorizer,"{'lr__penalty': 'l2', 'tvec__max_df': 0.5, 'tv...",0.978191,0.937853,0.040338,0.408175,229,269,12,21,0.936648
3,MultinomialNB,TfidfVectorizer,{'mnb__alpha': 1.0},0.966882,0.93032,0.036562,0.400642,230,264,17,20,0.929751


To recap the objective of this project is to use NLP ot rain a classifier on which subreddit a given post came from. The chosen subreddits are:
1. Nosleep (y-target = 1)
2. Thetruishere (y-target = 0)

Accuracy score is use to evaluate how well the classification model perform. This is because there is no greater detriment to false positive (actual post is 'Thetruishere' but predict it come from 'Nosleep' subreddit).

Overall, all model perform well, as it has much higher accuracy score compared to its' baseline accuracy (baseline accuracy: 0.529678).

Logistic Regression with CountVectorizer observed to have overfit compared to the rest, as it has slightly higher variance in accuracy score (train accuracy - test accuracy). 
Multinomial NB tends to be able to generalize slightly better, as it has lower variance.

Decided to scrap some new post to evaluate the above models to check, how well are they perform with unseen future data.


### 7.2 Evaluate the model with newly scrap unseen data (holdout dataset):

In [118]:
### Importing the data:
df_holdout = pd.read_csv('../datasets/subredditsholdout_cleaned.csv')

In [119]:
### Set the X feature and y-predict
X_holdout = df_holdout['Cleaned_post']
y_holdout = df_holdout['subreddit']

In [109]:
### Creating the steps to fit into the model
cv_lr_steps = [('cvec', CountVectorizer(stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =2,
                                ngram_range = (1,2))),
               ('lr', LogisticRegression(penalty= 'l2'))]

cv_mnb_steps = [('cvec', CountVectorizer(stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =2,
                                ngram_range = (1,2))),
               ('mnb', MultinomialNB(alpha= 0.5))]

tv_lr_steps = [('tvec', TfidfVectorizer(stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =3,
                                ngram_range = (1,2))),
               ('lr', LogisticRegression(penalty= 'l2'))]

tv_mnb_steps = [('cvec', TfidfVectorizer(stop_words = 'english',
                                max_df = 0.5,
                                max_features = 6000,
                                min_df =3,
                                ngram_range = (1,2))),
               ('mnb', MultinomialNB(alpha= 0.5))]

In [111]:
### List to execute the modelting
pipe_steps_items = [cv_lr_steps, cv_mnb_steps, tv_lr_steps, tv_mnb_steps]

In [120]:
### Loop to execute the modeling
holdout_score = []      # list to store the accuracy score 
for items in pipe_steps_items:
    steps = items
    pipe = Pipeline(items)
    pipe.fit(X_train_post, y_train)
    holdout_score.append(pipe.score(X_holdout, y_holdout))




In [125]:
### compile the holdout data set accuracy score, compared to test accuracy, and convert to dataframe
holdout_test = ['cvec_lr', 'cvec_mnb', 'tvec_lr', 'tvec_mnb']
holdout_results = list(zip(holdout_test, test_accuracy, holdout_score))
holdout_results_df = pd.DataFrame(holdout_results, columns = ['Modeling','Test Accuracy','Holdout Accuracy'])
holdout_results_df

Unnamed: 0,Modeling,Test Accuracy,Holdout Accuracy
0,cvec_lr,0.943503,0.926667
1,cvec_mnb,0.920904,0.826667
2,tvec_lr,0.937853,0.933333
3,tvec_mnb,0.93032,0.84


Logistic Regression model using TfidfVectorizer performs the best as it is able to predict equally well on the unseen data (accuracy score doesn't deviate much on the holdout accuracy). 

In [139]:
### View the coefficient from the Logistic Regression model using TfidvVectorizer
### step 1, create another dataframe that with coef is sorted

tvec_lr_coef_sorted = tvec_lr_coef_df.sort_values(by = 'coef')
tvec_lr_coef_sorted['odds_coef'] = np.exp(tvec_lr_coef_sorted['coef'])
tvec_lr_coef_sorted.head()

Unnamed: 0,word,coef,odds_coef
1635,experience,-2.38321,0.092254
5819,weird,-1.394827,0.247876
3639,paranormal,-1.3728,0.253396
3311,mom,-1.297518,0.273209
1420,dream,-1.115933,0.32761


In [145]:
# Create another dataframe that
# extract only the top 5 most significant coefficient (for both subreddits)
tvec_lr_coef_odds = pd.concat([tvec_lr_coef_sorted.tail(), tvec_lr_coef_sorted.head()], ignore_index = True)
tvec_lr_coef_odds

Unnamed: 0,word,coef,odds_coef
0,took,1.336512,3.805746
1,blood,1.369955,3.935175
2,voice,1.471948,4.357717
3,hand,1.658934,5.253708
4,face,1.672418,5.325029
5,experience,-2.38321,0.092254
6,weird,-1.394827,0.247876
7,paranormal,-1.3728,0.253396
8,mom,-1.297518,0.273209
9,dream,-1.115933,0.32761


In [3]:
## Plot the most significant coefs
plt.figure(figsize = (10,8));
sns.barplot(data = tvec_lr_coef_odds, x = 'coef', y = 'word', palette = 'GnBu')
plt.title("Top Words for Differentiating Subreddits", fontsize=18)
plt.yticks(size=12)
plt.xticks(size=12)
plt.ylabel("Words", size=14)
plt.xlabel("Coefficient from Logistic Regression \n(Log-Odds)", fontsize=14);
plt.legend(['Nosleep','Thetrueishere'], fontsize=18)

NameError: name 'plt' is not defined

#### What the coefficient tell us?
`face, hand, blood, voice ,took` are the top 5 words that is able to increase the probability of predicting the post is come from Nosleep (y-target =1).
`face` increase by one, the post is 5.3 times as likely to be from 'Nosleep' subreddit.


## 8. Conclusion and Recommendations

Does the student provide appropriate context to connect individual steps back to the overall project?
Is it clear how the final recommendations were reached?
Are the conclusions/recommendations clearly stated?
Does the conclusion answer the original problem statement?
Does the student address how findings of this research can be applied for the benefit of stakeholders?
Are future steps to move the project forward identified?

My second Multinomial Naive Bayes model performed the best. With the best parameters being — alpha=0 and fit_prior=False. The accuracy score was 92.4% on training data and 92.2% on unseen data. This means our model is slightly and probably inconsequentially overfit. This also means that 92.2% of our posts will be accurately classified by our model.