In [190]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

## Reading and Preparing Data

In [66]:
# importing formatted data from notebook 1
df = pd.read_csv('../Data/combined_data.csv')

In [67]:
# ensuring import format 
print(df.shape)
print()
print(df.isnull().sum())
df.head(3)

(4000, 14)

title                  0
selftext               0
subreddit              0
author                 0
num_comments           0
score                  0
timestamp              0
days_old               0
all_text               0
all_text_length        0
title_length           0
selftext_length        0
log_all_text_length    0
is_liberal             0
dtype: int64


Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,days_old,all_text,all_text_length,title_length,selftext_length,log_all_text_length,is_liberal
0,It has never been more dangerous to live in a ...,Everyone here says the virus is a hoax. The co...,Liberal,readeachbook,14,1,2020-03-24,29,It has never been more dangerous to live in a ...,502,66,435,6.2186,1
1,The supreme test for Trump supporters,Should they die for Trump by supporting his de...,Liberal,tsdguy,9,1,2020-03-24,29,The supreme test for Trump supporters Should t...,200,37,162,5.298317,1
2,Conservative on Liberal subreddit.,Ayeeeeee...young conservative on liberal subr...,Liberal,Warhound13,52,1,2020-03-24,29,Conservative on Liberal subreddit. Ayeeeeee.....,305,34,270,5.720312,1


In [68]:
# creating column lemmatizer function

def column_lemmatizer(data, column):
    tokenizer = RegexpTokenizer("[\w']+")
    lemmatizer = WordNetLemmatizer()
    new_column = []
    for i in data[column]:
        temp_list = []
        tokens = tokenizer.tokenize(i.lower().strip())
        for j in tokens:
            temp_list.append(lemmatizer.lemmatize(j))
        temp_list = ' '.join(temp_list)
        new_column.append(temp_list)
    return new_column

In [69]:
#lemmatizing text columns

df['all_text'] = column_lemmatizer(df, 'all_text')
df['title'] = column_lemmatizer(df, 'title')
df['selftext'] = column_lemmatizer(df, 'selftext')
df['all_text']

0       it ha never been more dangerous to live in a r...
1       the supreme test for trump supporter should th...
2       conservative on liberal subreddit ayeeeeee you...
3       are you willing to steal the highest office if...
4       the misreported nature of the new coronavirus ...
                              ...                        
3995    what if we fought war to win when i wa a young...
3996    well it is 2014 only 1004 day left for obama i...
3997    how doe r conservative feel about the negative...
3998    where i would like to see gop policy shift oka...
3999    finding an aca exemption i wa tooling around w...
Name: all_text, Length: 4000, dtype: object

In [71]:
X_alltext = df['all_text']
X_title = df['title']
X_selftext = df['selftext']
X_nums = df[['num_comments', 'score', 'days_old', 'log_all_text_length']]
y = df['is_liberal']

In [72]:
# train test split each data subset

X_train_alltext, X_test_alltext, y_train_alltext, y_test_alltext = train_test_split(X_alltext, y)

X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(X_title, y)

X_train_selftext, X_test_selftext, y_train_selftext, y_test_selftext = train_test_split(X_selftext, y)

## Scoring Functions

In [None]:
#baseline accuracy score

print ('Baseline Accuracy Score: ' + str(round(max(df['is_liberal'].value_counts(normalize=True)), 4)))

In [73]:
# creating model scoring function

def gridsearch_score(pipe_model, params, cv, xtrain, ytrain, xtest, ytest):
    
    grid = GridSearchCV(estimator = pipe_model,
                       param_grid = params,
                       cv = cv)
    
    grid.fit(xtrain, ytrain)
    
    baseline = round(max(df['is_liberal'].value_counts(normalize=True)), 4)
    train_score = grid.score(xtrain, ytrain)
    test_score = grid.score(xtest, ytest)
    best_params = grid.best_params_
    
    preds = grid.predict(xtest)
    
    tn, fp, fn, tp = confusion_matrix(ytest,
                                  preds).ravel()
    
    specificity = round(tn / (tn + fp), 4)
    sensitivity = round(tp / (tp + fn), 4)
    
    print("Baseline Accuracy: 0.5")
    print(f"Train Accuracy: {round(train_score, 4)}")
    print(f"Test Accuracy: {round(test_score, 4)}")
    print()
    print(f"True Positives: {tp}")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print()
    print(f"Specificity: {specificity}")
    print(f"Sensitivity: {sensitivity}")
    print()
    print(best_params)
    

In [79]:
# creating function that scores model on all three text subsets

def grid_scores_difftexts(pipe_model, params, cv):
    
    print('---------------------Title---------------------')
    gridsearch_score(pipe_model = pipe_model, 
                 params = params, 
                 cv = cv, 
                 xtrain = X_train_title,
                 ytrain = y_train_title,
                 xtest = X_test_title,
                 ytest = y_test_title)
    
    print("\n")
    print("\n")
    print('---------------------Self Text---------------------')
    gridsearch_score(pipe_model = pipe_model, 
                 params = params, 
                 cv = cv, 
                 xtrain = X_train_alltext,
                 ytrain = y_train_alltext,
                 xtest = X_test_alltext,
                 ytest = y_test_alltext)
    
    print("\n")
    print("\n")
    print('---------------------All Text---------------------')
    gridsearch_score(pipe_model = pipe_model, 
                 params = params, 
                 cv = cv, 
                 xtrain = X_train_alltext,
                 ytrain = y_train_alltext,
                 xtest = X_test_alltext,
                 ytest = y_test_alltext)

In [75]:
# turning off warnings for output formatting

import warnings
warnings.filterwarnings('ignore')

## Countvectorizer/LogisticRegression

In [80]:
# setting parameters, creating pipeline, and scoring model

params_cvec_ss_logr = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english'],
    'cvec__max_features' : [None, 200],
#     'logr__penalty' : ['none', 'l2'],
    'logr__C' : [0.0001, 0.001, 0.01],
    'ss__with_mean' : [False],
    'logr__max_iter' : [100, 500]
}

pipe_cvec_ss_logr = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler()),
    ('logr', LogisticRegression())
])

In [81]:
grid_scores_difftexts(pipe_model = pipe_cvec_ss_logr, params = params_cvec_ss_logr, cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9817
Test Accuracy: 0.639

True Positives: 393
True Negatives: 246
False Positives: 242
False Negatives: 119

Specificity: 0.5041
Sensitivity: 0.7676

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'logr__C': 0.0001, 'logr__max_iter': 100, 'ss__with_mean': False}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.983
Test Accuracy: 0.628

True Positives: 391
True Negatives: 237
False Positives: 263
False Negatives: 109

Specificity: 0.474
Sensitivity: 0.782

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'logr__C': 0.001, 'logr__max_iter': 100, 'ss__with_mean': False}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.983
Test Accuracy: 0.628

True Positives: 391
True Negatives: 237
False Positives: 263
False Negatives: 109

Speci

## TfidfVectorizer/LogisticRegression

In [82]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_logr = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
    'tfidf__max_features' : [None, 200],
#     'logr__penalty' : ['none', 'l2'],
    'logr__C' : [0.0001, 0.001, 0.01],
    'logr__max_iter' : [100, 500]
}

pipe_tfidf_logr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logr', LogisticRegression())
])

In [83]:
grid_scores_difftexts(pipe_model = pipe_tfidf_logr, 
                 params = params_tfidf_logr, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.618
Test Accuracy: 0.59

True Positives: 139
True Negatives: 451
False Positives: 37
False Negatives: 373

Specificity: 0.9242
Sensitivity: 0.2715

{'logr__C': 0.01, 'logr__max_iter': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.8887
Test Accuracy: 0.672

True Positives: 326
True Negatives: 346
False Positives: 154
False Negatives: 174

Specificity: 0.692
Sensitivity: 0.652

{'logr__C': 0.001, 'logr__max_iter': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.8887
Test Accuracy: 0.672

True Positives: 326
True Negatives: 346
False Positives: 154
False Negatives: 174

Specificity: 0.692
Sensitivity: 0.652

{'logr__C':

## CountVectorizer/DecisionTree

In [88]:
# setting parameters, creating pipeline, and scoring model

params_cvec_dtree = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english'],
    'cvec__max_features' : [None, 200],
    'dtree__max_features' : [None, 500, 1000],
    'dtree__max_depth' : [None, 3, 10]
}

pipe_cvec_dtree = Pipeline([
    ('cvec', CountVectorizer()),
    ('dtree', DecisionTreeClassifier())
])

In [89]:
grid_scores_difftexts(pipe_model = pipe_cvec_dtree, 
                 params = params_cvec_dtree, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.634

True Positives: 356
True Negatives: 278
False Positives: 210
False Negatives: 156

Specificity: 0.5697
Sensitivity: 0.6953

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'dtree__max_depth': None, 'dtree__max_features': None}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7317
Test Accuracy: 0.617

True Positives: 365
True Negatives: 252
False Positives: 248
False Negatives: 135

Specificity: 0.504
Sensitivity: 0.73

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'dtree__max_depth': 10, 'dtree__max_features': None}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7323
Test Accuracy: 0.614

True Positives: 364
True Negatives: 250
False Positives: 250
False Negatives: 136

Specificity: 0.5
Sensitivity

## TfidfVectorizer/DecisionTree

In [90]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_dtree = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
    'tfidf__max_features' : [None, 200],
    'dtree__max_features' : [None, 500, 1000],
    'dtree__max_depth' : [None, 3, 10]
}

pipe_tfidf_dtree = Pipeline([
    ('tfidf', CountVectorizer()),
    ('dtree', DecisionTreeClassifier())
])

In [91]:
grid_scores_difftexts(pipe_model = pipe_tfidf_dtree, 
                 params = params_tfidf_dtree, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.607

True Positives: 346
True Negatives: 261
False Positives: 227
False Negatives: 166

Specificity: 0.5348
Sensitivity: 0.6758

{'dtree__max_depth': None, 'dtree__max_features': 500, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.592

True Positives: 309
True Negatives: 283
False Positives: 217
False Negatives: 191

Specificity: 0.566
Sensitivity: 0.618

{'dtree__max_depth': None, 'dtree__max_features': None, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7313
Test Accuracy: 0.613

True Positives: 366
True Negatives: 247
False Positives: 253
False Negatives: 134

Specificity: 0.494
S

## CountVectorizer/KNN

In [96]:
# setting parameters, creating pipeline, and scoring model

params_cvec_ss_knn = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english'],
    'cvec__max_features' : [None, 200],
    'ss__with_mean' : [False],
    'knn__n_neighbors' : [2, 5, 10],
    'knn__metric' : ['manhattan', 'minkowski']
}

pipe_cvec_ss_knn = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [97]:
grid_scores_difftexts(pipe_model = pipe_cvec_ss_knn, 
                 params = params_cvec_ss_knn, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9277
Test Accuracy: 0.589

True Positives: 206
True Negatives: 383
False Positives: 105
False Negatives: 306

Specificity: 0.7848
Sensitivity: 0.4023

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'knn__metric': 'manhattan', 'knn__n_neighbors': 2, 'ss__with_mean': False}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.923
Test Accuracy: 0.573

True Positives: 406
True Negatives: 167
False Positives: 333
False Negatives: 94

Specificity: 0.334
Sensitivity: 0.812

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'knn__metric': 'minkowski', 'knn__n_neighbors': 2, 'ss__with_mean': False}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.923
Test Accuracy: 0.573

True Positives: 406
True Negatives: 167
False Positives: 333
False Neg

## TfidfVectorizer/KNN

In [103]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_knn = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
    'tfidf__max_features' : [None, 200],
    'knn__n_neighbors' : [2, 5, 10],
    'knn__metric' : ['manhattan', 'minkowski']
}

pipe_tfidf_knn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [104]:
grid_scores_difftexts(pipe_model = pipe_tfidf_knn, 
                 params = params_tfidf_knn, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.6723
Test Accuracy: 0.569

True Positives: 344
True Negatives: 225
False Positives: 263
False Negatives: 168

Specificity: 0.4611
Sensitivity: 0.6719

{'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'tfidf__lowercase': False, 'tfidf__max_features': 200, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.733
Test Accuracy: 0.651

True Positives: 322
True Negatives: 329
False Positives: 171
False Negatives: 178

Specificity: 0.658
Sensitivity: 0.644

{'knn__metric': 'minkowski', 'knn__n_neighbors': 10, 'tfidf__lowercase': False, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.733
Test Accuracy: 0.651

True Positives: 322
True Negatives: 329
False Positiv

## CountVectorizer/RandomForest

In [114]:
# setting parameters, creating pipeline, and scoring model

params_cvec_rfor = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english']
#     'cvec__max_features' : [None, 200],
#     'rfor__max_features' : [None, 500, 1000],
#     'rfor__max_depth' : [None, 3, 10],
#     'rfor__n_estimators' : [50, 100, 200]
}

pipe_cvec_rfor = Pipeline([
    ('cvec', CountVectorizer()),
    ('rfor', RandomForestClassifier())
])

In [115]:
grid_scores_difftexts(pipe_model = pipe_cvec_rfor, 
                 params = params_cvec_rfor, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.647

True Positives: 415
True Negatives: 232
False Positives: 256
False Negatives: 97

Specificity: 0.4754
Sensitivity: 0.8105

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.658

True Positives: 339
True Negatives: 319
False Positives: 181
False Negatives: 161

Specificity: 0.638
Sensitivity: 0.678

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.667

True Positives: 351
True Negatives: 316
False Positives: 184
False Negatives: 149

Specificity: 0.632
Sensitivity: 0.702

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}


## TfidfVectorizer/RandomForest

In [116]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_rfor = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
#     'tfidf__max_features' : [None, 200],
#     'rfor__max_features' : [None, 500, 1000],
#     'rfor__max_depth' : [None, 3, 10],
    'rfor__n_estimators' : [50, 100]
}

pipe_tfidf_rfor = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rfor', RandomForestClassifier())
])

In [117]:
grid_scores_difftexts(pipe_model = pipe_tfidf_rfor, 
                 params = params_tfidf_rfor, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.651

True Positives: 381
True Negatives: 270
False Positives: 218
False Negatives: 131

Specificity: 0.5533
Sensitivity: 0.7441

{'rfor__n_estimators': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.67

True Positives: 357
True Negatives: 313
False Positives: 187
False Negatives: 143

Specificity: 0.626
Sensitivity: 0.714

{'rfor__n_estimators': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.641

True Positives: 326
True Negatives: 315
False Positives: 185
False Negatives: 174

Specificity: 0.63
Sensitivity: 0.652

{'rfor__n_estimators': 50, 'tfidf__low

## CountVectorizer/ExtraTrees

In [118]:
# setting parameters, creating pipeline, and scoring model

params_cvec_xtree = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english']
#     'xtree__max_features' : [None, 200],
#     'xtree__max_features' : [None, 500, 1000],
#     'xtree__max_depth' : [None, 3, 10],
#     'xtree__n_estimators' : [50, 100, 200]
}

pipe_cvec_xtree = Pipeline([
    ('cvec', CountVectorizer()),
    ('xtree', ExtraTreesClassifier())
])

In [119]:
grid_scores_difftexts(pipe_model = pipe_cvec_xtree, 
                 params = params_cvec_xtree, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.659

True Positives: 380
True Negatives: 279
False Positives: 209
False Negatives: 132

Specificity: 0.5717
Sensitivity: 0.7422

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.645

True Positives: 322
True Negatives: 323
False Positives: 177
False Negatives: 178

Specificity: 0.646
Sensitivity: 0.644

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.654

True Positives: 322
True Negatives: 332
False Positives: 168
False Negatives: 178

Specificity: 0.664
Sensitivity: 0.644

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}


## TfidfVectorizer/ExtraTrees

In [120]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_xtree = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english']
#     'xtree__max_features' : [None, 200],
#     'xtree__max_features' : [None, 500, 1000],
#     'xtree__max_depth' : [None, 3, 10],
#     'xtree__n_estimators' : [50, 100, 200]
}

pipe_tfidf_xtree = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xtree', ExtraTreesClassifier())
])

In [121]:
grid_scores_difftexts(pipe_model = pipe_tfidf_xtree, 
                 params = params_tfidf_xtree, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.661

True Positives: 377
True Negatives: 284
False Positives: 204
False Negatives: 135

Specificity: 0.582
Sensitivity: 0.7363

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.666

True Positives: 344
True Negatives: 322
False Positives: 178
False Negatives: 156

Specificity: 0.644
Sensitivity: 0.688

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9997
Test Accuracy: 0.671

True Positives: 347
True Negatives: 324
False Positives: 176
False Negatives: 153

Specificity: 0.648
Sensitivity: 0.694

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}


## CountVectorizer/BaggingClassifier

In [123]:
# setting parameters, creating pipeline, and scoring model

params_cvec_bag = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english']
#     'bag__max_features' : [None, 200],
#     'bag__max_features' : [None, 500, 1000],
#     'bag__max_depth' : [None, 3, 10],
#     'bag__n_estimators' : [50, 100, 200]
}

pipe_cvec_bag = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [124]:
grid_scores_difftexts(pipe_model = pipe_cvec_bag, 
                 params = params_cvec_bag, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.975
Test Accuracy: 0.631

True Positives: 357
True Negatives: 274
False Positives: 214
False Negatives: 155

Specificity: 0.5615
Sensitivity: 0.6973

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9783
Test Accuracy: 0.631

True Positives: 313
True Negatives: 318
False Positives: 182
False Negatives: 187

Specificity: 0.636
Sensitivity: 0.626

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.981
Test Accuracy: 0.617

True Positives: 321
True Negatives: 296
False Positives: 204
False Negatives: 179

Specificity: 0.592
Sensitivity: 0.642

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}


## TfidfVectorizer/BaggingClassifier

In [126]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_bag = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english']
#     'bag__max_features' : [None, 200],
#     'bag__max_features' : [None, 500, 1000],
#     'bag__max_depth' : [None, 3, 10],
#     'bag__n_estimators' : [50, 100, 200]
}

pipe_tfidf_bag = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [127]:
grid_scores_difftexts(pipe_model = pipe_tfidf_bag, 
                 params = params_tfidf_bag, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.97
Test Accuracy: 0.619

True Positives: 347
True Negatives: 272
False Positives: 216
False Negatives: 165

Specificity: 0.5574
Sensitivity: 0.6777

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9847
Test Accuracy: 0.623

True Positives: 306
True Negatives: 317
False Positives: 183
False Negatives: 194

Specificity: 0.634
Sensitivity: 0.612

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.982
Test Accuracy: 0.609

True Positives: 284
True Negatives: 325
False Positives: 175
False Negatives: 216

Specificity: 0.65
Sensitivity: 0.568

{'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}


## CountVectorizer/AdaBoost

In [128]:
# setting parameters, creating pipeline, and scoring model

params_cvec_ada = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english']
#     'ada__max_features' : [None, 200],
#     'ada__max_features' : [None, 500, 1000],
#     'ada__max_depth' : [None, 3, 10],
#     'ada__n_estimators' : [100, 1000]
}

pipe_cvec_ada = Pipeline([
    ('cvec', CountVectorizer()),
    ('ada', AdaBoostClassifier())
])

In [129]:
grid_scores_difftexts(pipe_model = pipe_cvec_ada, 
                 params = params_cvec_ada, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.6363
Test Accuracy: 0.608

True Positives: 451
True Negatives: 157
False Positives: 331
False Negatives: 61

Specificity: 0.3217
Sensitivity: 0.8809

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7167
Test Accuracy: 0.651

True Positives: 365
True Negatives: 286
False Positives: 214
False Negatives: 135

Specificity: 0.572
Sensitivity: 0.73

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7167
Test Accuracy: 0.651

True Positives: 365
True Negatives: 286
False Positives: 214
False Negatives: 135

Specificity: 0.572
Sensitivity: 0.73

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}


## TfidfVectorizer/AdaBoost

In [136]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_ada = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
#     'ada__max_features' : [None, 200],
#     'ada__max_features' : [None, 500, 1000],
#     'ada__max_depth' : [None, 3, 10],
    'ada__n_estimators' : [100, 200]
}

pipe_tfidf_ada = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ada', AdaBoostClassifier())
])

In [137]:
grid_scores_difftexts(pipe_model = pipe_tfidf_ada, 
                 params = params_tfidf_ada, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.6867
Test Accuracy: 0.596

True Positives: 420
True Negatives: 176
False Positives: 312
False Negatives: 92

Specificity: 0.3607
Sensitivity: 0.8203

{'ada__n_estimators': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7887
Test Accuracy: 0.651

True Positives: 358
True Negatives: 293
False Positives: 207
False Negatives: 142

Specificity: 0.586
Sensitivity: 0.716

{'ada__n_estimators': 100, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7887
Test Accuracy: 0.651

True Positives: 358
True Negatives: 293
False Positives: 207
False Negatives: 142

Specificity: 0.586
Sensitivity: 0.716

{'ada__n_estimators': 100, 'tfidf__low

## CountVectorizer/KernelSVM

In [148]:
# setting parameters, creating pipeline, and scoring model

params_cvec_ss_svm = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english'],
    'ss__with_mean' : [False],
    'svc__C' : [5, 10, 15]
}

pipe_cvec_ss_svm = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler()),
    ('svc', SVC())
])

In [149]:
grid_scores_difftexts(pipe_model = pipe_cvec_ss_svm, 
                 params = params_cvec_ss_svm, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9637
Test Accuracy: 0.624

True Positives: 354
True Negatives: 270
False Positives: 218
False Negatives: 158

Specificity: 0.5533
Sensitivity: 0.6914

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'ss__with_mean': False, 'svc__C': 5}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9847
Test Accuracy: 0.591

True Positives: 346
True Negatives: 245
False Positives: 255
False Negatives: 154

Specificity: 0.49
Sensitivity: 0.692

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'ss__with_mean': False, 'svc__C': 10}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9847
Test Accuracy: 0.591

True Positives: 346
True Negatives: 245
False Positives: 255
False Negatives: 154

Specificity: 0.49
Sensitivity: 0.692

{'cvec__lowercase': Fa

## TfidfVectorizer/KernelSVM

In [152]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_svm = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
    'svc__C' : [0.1, 1, 10]
}

pipe_tfidf_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC())
])

In [153]:
grid_scores_difftexts(pipe_model = pipe_tfidf_svm, 
                 params = params_tfidf_svm, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.994
Test Accuracy: 0.654

True Positives: 339
True Negatives: 315
False Positives: 173
False Negatives: 173

Specificity: 0.6455
Sensitivity: 0.6621

{'svc__C': 10, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.999
Test Accuracy: 0.693

True Positives: 342
True Negatives: 351
False Positives: 149
False Negatives: 158

Specificity: 0.702
Sensitivity: 0.684

{'svc__C': 1, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.999
Test Accuracy: 0.693

True Positives: 342
True Negatives: 351
False Positives: 149
False Negatives: 158

Specificity: 0.702
Sensitivity: 0.684

{'svc__C': 1, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1

## Custom SVM

## Combined Text and Numeric Scaled Data

In [199]:
# instantiatind tfidf and standard scaler

tfidf = TfidfVectorizer(lowercase = False, 
                        ngram_range= (1,2), 
                        stop_words='english',
                        max_features = 20_000
                       )

ss = StandardScaler()

In [200]:
# transforming text with tfidf and scaling numeric data separately

X_alltext_tfidf = pd.DataFrame(tfidf.fit_transform(X_alltext).todense())
X_nums_ss = pd.DataFrame(ss.fit_transform(X_nums))

In [201]:
# recombining features

X_combined = pd.concat([X_alltext_tfidf, X_nums_ss], axis=1)

In [202]:
X_combined

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19994,19995,19996,19997,19998,19999,0.1,1.1,2.1,3.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.023769,-0.166980,-1.394170,0.014839
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.108492,-0.166980,-1.394170,-0.797496
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.620128,-0.166980,-1.394170,-0.425001
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.166980,-1.395813,0.407228
4,0.0,0.0,0.0,0.0,0.0,0.0,0.036584,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.416792,-0.166980,-1.395813,2.607957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.193216,0.014234,2.343613,1.693465
3996,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.179061,2.341970,-1.028203
3997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.145678,-0.034089,2.341970,0.382072
3998,0.0,0.0,0.0,0.0,0.0,0.0,0.162015,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.166980,2.341970,0.949919


In [203]:
# creating column names from features

features = tfidf.get_feature_names() + ['num_comments', 'score', 'days_old', 'log_all_text_length']

In [204]:
features[::-1]

['log_all_text_length',
 'days_old',
 'score',
 'num_comments',
 'ᴛᴏ',
 'ᴛʜᴇ',
 'zy 2jihqeg4gwhnbieym',
 'zy',
 'zuckerberg',
 'zpgcrsg5de1xali3ftuh3fcnculuzqbzn0l5incqqmu',
 'zone',
 'zionist',
 'zionism',
 'ziegler',
 'zero',
 'zellerbach hall',
 'zellerbach',
 'zelenskiy',
 'zeid',
 'zealand',
 'zakaria',
 'youtubers',
 'youtuber',
 'youtube video',
 'youtube http',
 'youtube com',
 'youtube channel',
 'youtube',
 'youtu',
 'youth',
 'youre',
 'younger people',
 'younger generation',
 'younger',
 'young woman',
 'young turk',
 'young people',
 'young men',
 'young man',
 'young liberal',
 'young conservative',
 'young age',
 'young',
 'yorkers',
 'york time',
 'york city',
 'york 202',
 'york',
 'yingyangapp yingyang',
 'yingyangapp',
 'yingyang',
 'yield',
 'yiannopoulos shut',
 'yiannopoulos',
 'yesterday',
 'yes know',
 'yes',
 'yep',
 'yen',
 'yemen',
 'yellow',
 'yelling',
 'yell',
 'yee',
 'years',
 'yearning breathe',
 'yearning',
 'year year',
 'year week',
 'year wa',
 'yea

In [205]:
X_combined.columns = features

In [206]:
X_combined

Unnamed: 0,00,00 hour,00 joe,00 jose,00 month,00 year,000,000 00,000 000,000 death,...,zpgcrsg5de1xali3ftuh3fcnculuzqbzn0l5incqqmu,zuckerberg,zy,zy 2jihqeg4gwhnbieym,ᴛʜᴇ,ᴛᴏ,num_comments,score,days_old,log_all_text_length
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.023769,-0.166980,-1.394170,0.014839
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.108492,-0.166980,-1.394170,-0.797496
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.620128,-0.166980,-1.394170,-0.425001
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.166980,-1.395813,0.407228
4,0.0,0.0,0.0,0.0,0.0,0.0,0.036584,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.416792,-0.166980,-1.395813,2.607957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.193216,0.014234,2.343613,1.693465
3996,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.179061,2.341970,-1.028203
3997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.145678,-0.034089,2.341970,0.382072
3998,0.0,0.0,0.0,0.0,0.0,0.0,0.162015,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.260994,-0.166980,2.341970,0.949919


In [207]:
# train test split on combined data

X_combined_train, X_combined_test, y_combined_train, y_combined_test = train_test_split(X_combined, y)

In [208]:
#instantiating model, fitting, and scoring

svc_custom = SVC()

In [209]:
svc_custom.fit(X_combined_train, y_combined_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [210]:
train_score_svc = svc_custom.score(X_combined_train, y_combined_train)
test_score_svc = svc_custom.score(X_combined_test, y_combined_test)

In [211]:
preds = svc_custom.predict(X_combined_test)

In [212]:
tn, fp, fn, tp = confusion_matrix(y_combined_test,
                                  preds).ravel()

In [213]:
specificity = round(tn / (tn + fp), 4)
sensitivity = round(tp / (tp + fn), 4)

In [214]:
print("Baseline Accuracy: 0.5")
print(f"Train Accuracy: {round(train_score_svc, 4)}")
print(f"Test Accuracy: {round(test_score_svc, 4)}")
print()
print(f"True Positives: {tp}")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print()
print(f"Specificity: {specificity}")
print(f"Sensitivity: {sensitivity}")

Baseline Accuracy: 0.5
Train Accuracy: 0.827
Test Accuracy: 0.783

True Positives: 433
True Negatives: 350
False Positives: 163
False Negatives: 54

Specificity: 0.6823
Sensitivity: 0.8891


## CountVectorizer/BernoulliNB

In [195]:
# setting parameters, creating pipeline, and scoring model

params_cvec_ss_nb = {
    'cvec__ngram_range' : [(1, 1), (1, 2)],
    'cvec__lowercase' : [ False],
    'cvec__stop_words' : ['english'],
    'ss__with_mean' : [False],
    'nb__alpha' : [0.001, 0.01]
}

pipe_cvec_ss_nb = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler()),
    ('nb', BernoulliNB())
])

In [196]:
grid_scores_difftexts(pipe_model = pipe_cvec_ss_nb, 
                 params = params_cvec_ss_nb, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9823
Test Accuracy: 0.619

True Positives: 368
True Negatives: 251
False Positives: 237
False Negatives: 144

Specificity: 0.5143
Sensitivity: 0.7188

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'nb__alpha': 0.001, 'ss__with_mean': False}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9667
Test Accuracy: 0.594

True Positives: 436
True Negatives: 158
False Positives: 342
False Negatives: 64

Specificity: 0.316
Sensitivity: 0.872

{'cvec__lowercase': False, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'nb__alpha': 0.001, 'ss__with_mean': False}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.9667
Test Accuracy: 0.594

True Positives: 436
True Negatives: 158
False Positives: 342
False Negatives: 64

Specificity: 0.316
Sensitivity: 0.872

{'cvec__l

 ## TfidfVectorizer/BernoulliNB

In [217]:
# setting parameters, creating pipeline, and scoring model

params_tfidf_nb = {
    'tfidf__ngram_range' : [(1, 1), (1, 2)],
    'tfidf__lowercase' : [ False],
    'tfidf__stop_words' : ['english'],
    'nb__alpha' : [0.01, 1, 10]
}

pipe_tfidf_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', BernoulliNB())
])

In [218]:
grid_scores_difftexts(pipe_model = pipe_tfidf_nb, 
                 params = params_tfidf_nb, 
                 cv = 5)

---------------------Title---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7743
Test Accuracy: 0.618

True Positives: 389
True Negatives: 229
False Positives: 259
False Negatives: 123

Specificity: 0.4693
Sensitivity: 0.7598

{'nb__alpha': 10, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------Self Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7897
Test Accuracy: 0.585

True Positives: 393
True Negatives: 192
False Positives: 308
False Negatives: 107

Specificity: 0.384
Sensitivity: 0.786

{'nb__alpha': 0.01, 'tfidf__lowercase': False, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}




---------------------All Text---------------------
Baseline Accuracy: 0.5
Train Accuracy: 0.7897
Test Accuracy: 0.585

True Positives: 393
True Negatives: 192
False Positives: 308
False Negatives: 107

Specificity: 0.384
Sensitivity: 0.786

{'nb__alpha': 0.01, 'tfidf__lowercase': False, 'tfidf