In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import pickle 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, IncrementalPCA

In [8]:
aita_2 = pd.read_csv('data/aita_save2.csv')
aita = aita_2.loc[:,['title_polarity','title_subjectivity','title_clean','is_asshole']]
aita.dropna(inplace=True)
aita.reset_index(drop=True)

Unnamed: 0,title_polarity,title_subjectivity,title_clean,is_asshole
0,0.0,0.0,I write an explanation in til and come off a...,1
1,0.0,0.0,throw my parent donut away,1
2,0.0,0.0,I tell a goth girl she look like a clown,0
3,0.0,0.0,argument I have with another redditor in rhimym,1
4,0.0,0.0,have a disagreement about le miserable with ...,1
...,...,...,...,...
97493,0.0,0.0,for tell my sister she be be a spoiled brat,0
97494,0.0,0.0,for tell my husband to f off after he relent...,0
97495,0.1,0.3,for attempt to keep my student out of adult ...,0
97496,0.0,0.0,if I leave my brother fate up to the state,0


## 1 - Save features X and labels y, train test split

In [9]:
X = aita.drop('is_asshole', axis=1)
y = aita['is_asshole']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=11)

In [10]:
X_train.shape, X_test.shape

((96523, 3), (975, 3))

## 2 - Instantiate CountVectorizer 

In [None]:
stop = set(stopwords.words('english'))
vec = CountVectorizer(stop_words=stop, min_df=25, max_df=0.8, ngram_range=(1, 2))
cv = vec.fit(X_train['title_clean'])

In [None]:
len(vec.get_feature_names())

#### Save fitted CountVectorizer to use later

In [11]:
# pickle.dump(cv, open('models/cv_fit_train.sav', 'wb'))

#### Load fitted CountVectorizer

In [6]:
# cv = pickle.load(open('models/cv_fit_train.sav', 'rb'))

### 2.a Transform train and test data into document-term-matrix with CountVectorizer

In [16]:
cv_train = cv.transform(X_train['title_clean'])
cv_test = cv.transform(X_test['title_clean'])

In [17]:
cv_cols = cv.get_feature_names()
dtm_train = pd.DataFrame(cv_train.toarray(), columns=cv_cols)
dtm_test = pd.DataFrame(cv_test.toarray(), columns=cv_cols)

In [18]:
train_word_count = dtm_train.sum(axis=0)
train_word_count.sort_values(ascending=False)[:10]

friend        16032
want          12914
tell          11576
get            7805
girlfriend     6115
ask            5764
go             5602
boyfriend      5065
sister         4131
mom            3904
dtype: int64

In [19]:
test_word_count = dtm_test.sum(axis=0)
test_word_count.sort_values(ascending=False)[:10]

friend        167
want          144
tell          103
get            78
girlfriend     69
ask            56
go             53
sister         49
boyfriend      46
give           44
dtype: int64

In [20]:
# ((96480, 44051), (975, 44051))
dtm_train.shape, dtm_test.shape

((96523, 7870), (975, 7870))

### Save / Load Document Term Matrix to / from csv

In [25]:
# dtm_train.to_csv('data/dtm_train.csv', index=False)
# dtm_test.to_csv('data/dtm_test.csv', index=False)

# dtm_train = pd.read_csv('data/dtm_train.csv')
# dtm_test = pd.read_csv('data/dtm_test.csv')

## 3 - PCA

In [None]:
pca = IncrementalPCA(n_components=800, batch_size=850)
pca.fit(dtm_train)
print(pca.explained_variance_ratio_.sum())

### 3.a Scree plot, check number of components is appropriate

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('PCA Scree Plot')
# plt.axvline(linewidth=4, color='r', linestyle = '--', x=10, ymin=0, ymax=1)
evr = pca.explained_variance_ratio_
cvr = np.cumsum(pca.explained_variance_ratio_)
pca_df = pd.DataFrame()
pca_df['Cumulative Variance Ratio'] = cvr
pca_df['Explained Variance Ratio'] = evr
# display(pca_df.head(10))

#### Save / Load fitted PCA Model

In [41]:
pickle.dump(pca, open('models/pca_title.sav', 'wb'))
# pca = pickle.load(open('models/pca.sav', 'rb'))
# pca

### 3.b - Transform train/test Document-Term-matrix with PCA

In [43]:
dtm_train_pca = pca.transform(dtm_train)
dtm_test_pca = pca.transform(dtm_test)

Build DataFrame of PCA Components

In [44]:
col_names = ['PC_' + str(i) for i in range(1, pca.get_params()['n_components']+1)]
dtm_train_pca_df = pd.DataFrame(dtm_train_pca, columns=col_names)
dtm_test_pca_df = pd.DataFrame(dtm_test_pca, columns=col_names)

Combine PCA components and sentiment analysis scores into one DataFrame

In [45]:
X_train_post_dtm_pca = pd.concat([X_train[['body_polarity','body_subjectivity']].reset_index(drop=True), dtm_train_pca_df], axis=1)
X_test_post_dtm_pca = pd.concat([X_test[['body_polarity','body_subjectivity']].reset_index(drop=True),dtm_test_pca_df], axis=1)

In [46]:
X_train_post_dtm_pca.head()

Unnamed: 0,body_polarity,body_subjectivity,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,...,PC_791,PC_792,PC_793,PC_794,PC_795,PC_796,PC_797,PC_798,PC_799,PC_800
0,0.089698,0.382755,6.721062,3.945255,-3.471649,-2.540817,-1.75577,-1.530562,-4.131972,3.995945,...,0.090375,-0.634156,-0.356446,0.23528,0.338331,-0.50631,0.407767,-0.156681,0.328248,-0.305217
1,-0.14,0.601585,5.761561,0.596863,-2.237171,-3.185759,-1.131884,-2.2626,-2.888938,0.091418,...,-0.521899,-0.763363,0.620522,0.318312,0.675178,0.760419,0.822378,-0.192563,0.465362,1.029007
2,0.034758,0.575265,1.966241,-1.712645,-0.909865,0.264539,0.921549,5.895036,1.228376,-1.104059,...,-0.016981,-0.175593,0.031395,0.089073,-0.180081,0.025352,0.025359,-0.034695,0.128525,0.134961
3,-0.00947,0.545676,-0.266427,0.267081,2.156305,-2.30499,-2.70524,-2.705795,-0.498179,0.245619,...,-0.440284,-0.017129,0.046089,-0.495012,-0.651864,0.408607,0.301299,0.130943,-0.954164,0.605056
4,0.196644,0.494792,2.39838,-0.615288,-0.211554,-3.215119,-1.241756,1.164878,4.107588,2.69878,...,-0.076834,0.109758,-0.102025,-0.278899,0.029033,-0.506665,0.143349,-0.365991,-0.151719,0.129302


In [47]:
print(X_train_post_dtm_pca.shape, y_train.shape)
print(X_test_post_dtm_pca.shape, y_test.shape)

(96480, 802) (96480,)
(975, 802) (975,)


# Models

## 1 - Random Forest Models

### 1.a Vanilla Random Forest, no tuning hyperparameters 

In [49]:
rfc = RandomForestClassifier()
rfc.fit(X_train_post_dtm_pca, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [53]:
cross_val_score(rfc, X_train_post_dtm_pca, y_train, cv=5, scoring='f1')

array([0.01932008, 0.01416853, 0.00824279, 0.01418704, 0.0134882 ])

In [54]:
print(f'Testing Accuracy = {rfc.score(X_test_post_dtm_pca, y_test)}')

Testing Accuracy = 0.7271794871794872


In [97]:
y_pred_train = rfc.predict(X_train_post_dtm_pca)
print(confusion_matrix(y_train,y_pred_train))
# print(classification_report(y_train,y_pred_train))
print(f'F1 Score = {f1_score(y_train,y_pred_train)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train)}')

[[70299     0]
 [    0 26181]]
F1 Score = 1.0
Accuracy Score = 1.0


In [96]:
y_pred_test = rfc.predict(X_test_post_dtm_pca)
print(confusion_matrix(y_test,y_pred_test))
# print(classification_report(y_test,y_pred_test))
print(f'F1 Score = {f1_score(y_test,y_pred_test)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test)}')

[[707   3]
 [263   2]]
F1 Score = 0.014814814814814815
Accuracy Score = 0.7271794871794872


### 1.b weighted random forest

In [61]:
rfc2 = RandomForestClassifier(class_weight="balanced")
rfc2.fit(X_train_post_dtm_pca, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [94]:
y_pred_train_2 = rfc2.predict(X_train_post_dtm_pca)
print(confusion_matrix(y_train,y_pred_train_2))
# print(classification_report(y_train,y_pred_train_2))
print(f'F1 Score = {f1_score(y_train,y_pred_train_2)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train_2)}')

[[70299     0]
 [    0 26181]]
F1 Score = 1.0
Accuracy Score = 1.0


In [93]:
y_pred_test_2 = rfc2.predict(X_test_post_dtm_pca)
print(confusion_matrix(y_test,y_pred_test_2))
# print(classification_report(y_test,y_pred_test_2))
print(f'F1 Score = {f1_score(y_test,y_pred_test_2)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test_2)}')

[[710   0]
 [264   1]]
F1 Score = 0.0075187969924812035
Accuracy Score = 0.7292307692307692


## 2 - Logistic Regression

In [68]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train_post_dtm_pca, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
y_pred_train_lr = lr.predict(X_train_post_dtm_pca)
print(confusion_matrix(y_train,y_pred_train_lr))
# print(classification_report(y_train,y_pred_train_lr))
print(f'F1 Score = {f1_score(y_train,y_pred_train_lr)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train_lr)}')

[[41623 28676]
 [ 9777 16404]]
F1 Score = 0.460392079819256
Accuracy Score = 0.6014407131011609


In [99]:
y_pred_test_lr = lr.predict(X_test_post_dtm_pca)
print(confusion_matrix(y_test,y_pred_test_lr))
# print(classification_report(y_test,y_pred_test_lr))
print(f'F1 Score = {f1_score(y_test,y_pred_test_lr)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test_lr)}')

[[415 295]
 [104 161]]
F1 Score = 0.4466019417475728
Accuracy Score = 0.5907692307692308


## 3. Naive Bayes

In [82]:
nb = MultinomialNB()
nb.fit(dtm_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [100]:
y_pred_train_nb = nb.predict(dtm_train)
print(confusion_matrix(y_train,y_pred_train_nb))
# print(classification_report(y_train,y_pred_train_nb))
print(f'F1 Score = {f1_score(y_train,y_pred_train_nb)}')
print(f'Accuracy Score = {accuracy_score(y_train,y_pred_train_nb)}')

[[46942 23357]
 [12704 13477]]
F1 Score = 0.4277394271205268
Accuracy Score = 0.626233416252073


In [101]:
y_pred_test_nb = nb.predict(dtm_test)
print(confusion_matrix(y_test,y_pred_test_nb))
# print(classification_report(y_test,y_pred_test_nb))
print(f'F1 Score = {f1_score(y_test,y_pred_test_nb)}')
print(f'Accuracy Score = {accuracy_score(y_test,y_pred_test_nb)}')

[[462 248]
 [139 126]]
F1 Score = 0.39436619718309857
Accuracy Score = 0.6030769230769231
