In [15]:
# # ! pip install imbalanced-learn
# # ! pip install lime
# # ! pip install textblob
# # ! pip install contractions
# # ! pip install spacy
# # ! python -m spacy download en_core_web_sm
# # ! pip install gensim
# ! pip install python-Levenshtein

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns

import pickle 
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
import time

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
# import lime
# import lime.lime_tabular

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, plot_confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, IncrementalPCA, LatentDirichletAllocation

from helpers import *

## Load cleaned data
combine title and body of text 

In [2]:
aita_2 = pd.read_csv('data/aita_save2.csv')
aita_2.dropna(subset=['body_clean', 'title_clean'], inplace=True)

aita = aita_2.loc[:,['body_polarity','body_subjectivity','is_asshole']]
aita['title_body_combo'] =  aita_2['title_clean'] + aita_2['body_clean']
aita.dropna(inplace=True)
aita.reset_index(drop=True)

Unnamed: 0,body_polarity,body_subjectivity,is_asshole,title_body_combo
0,-0.156818,0.656818,1,I write an explanation in til and come off a...
1,0.034848,0.449242,1,throw my parent donut awaymy parent be diabe...
2,0.000000,0.000000,0,I tell a goth girl she look like a clownI be four
3,0.000000,0.000000,1,argument I have with another redditor in rhi...
4,0.040104,0.369792,1,have a disagreement about le miserable with ...
...,...,...,...,...
97450,0.016111,0.385278,0,for tell my sister she be be a spoiled bratm...
97451,0.068461,0.474614,0,for tell my husband to f off after he relent...
97452,-0.076333,0.493467,0,for attempt to keep my student out of adult ...
97453,0.067130,0.426132,0,if I leave my brother fate up to the statea ...


## 1 - train test split

In [3]:
X = aita['title_body_combo']
y = aita['is_asshole']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=21)
print(X_train.shape, X_test.shape)

(96480,) (975,)


In [4]:
Counter(y)

Counter({1: 26446, 0: 71009})

In [None]:
lr = pickle.load(open('models/lr.sav', 'rb'))
cv = pickle.load(open('models/cv_fit_train.sav', 'rb'))
pca = pickle.load(open('models/pca_combo.sav', 'rb'))

In [None]:
Proctologist()
X_test

## 2 - GridSearchCV

Basic Naive Bayes model scores for comparison:

#### Train set:
F1 Score = 0.5855892851502037<br>
Accuracy Score = 0.728047263681592
#### Test set
F1 Score = 0.5855892851502037<br>
Accuracy Score = 0.728047263681592

In [6]:
add_stop_words = ['like', 'get', 'go', 'say', 'tell', 'thats', 'want', 'time',
                  'youre', 'got', 'gonna', 'time', 'also', 'yeah', 'said']
stop = set(stopwords.words('english')).union(add_stop_words)

### 2.1 Support Vector Machines (SVM)

In [7]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stop, ngram_range=(1, 2), min_df=0.005, max_df=0.8)),
                     ('pca', PCA()),
                     ('sampling', SMOTE()),
                     ('clf', SGDClassifier())
                    ])
params = {'clf__alpha': [(1e-2)],
          'pca__n_components' : [800, 1000]
         }
gs_clf = GridSearchCV(text_clf, params, n_jobs=-1, scoring='f1')

In [8]:
start = time.time()
gs_clf.fit(X_train, y_train)
end = time.time()
print(f'run time = {(end-start)/60} mins')



TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

In [None]:
sum(gs_clf.best_estimator_['pca'].explained_variance_ratio_)

In [None]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
y_pred_train = gs_clf.predict(X_train)
print(confusion_matrix(y_train,y_pred_train))
print(f'F1 Score = {f1_score(y_train,y_pred_train)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train)}')

In [None]:
y_pred_test = gs_clf.predict(X_test)
print(confusion_matrix(y_test,y_pred_test))
print(f'F1 Score = {f1_score(y_test,y_pred_test)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test)}')

In [24]:
y_pred_train = text_clf.predict(X_train)
print(confusion_matrix(y_train,y_pred_train))
print(f'F1 Score = {f1_score(y_train,y_pred_train)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train)}')

[[51704 18595]
 [ 7643 18538]]
F1 Score = 0.5855892851502037
Accuracy Score = 0.728047263681592


In [25]:
y_pred_test = text_clf.predict(X_test)
print(confusion_matrix(y_test,y_pred_test))
print(f'F1 Score = {f1_score(y_test,y_pred_test)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test)}')

[[458 252]
 [142 123]]
F1 Score = 0.3843750000000001
Accuracy Score = 0.5958974358974359
