In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
from nltk.corpus import stopwords
import pickle 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, IncrementalPCA

In [2]:
aita_2 = pd.read_csv('data/aita_save2.csv')
aita_2.dropna(subset=['body_clean', 'title_clean'], inplace=True)

aita = aita_2[['body_polarity','body_subjectivity','is_asshole']]
aita['title_body_combo'] =  aita_2['title_clean'] + aita_2['body_clean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aita['title_body_combo'] =  aita_2['title_clean'] + aita_2['body_clean']


In [3]:
aita

Unnamed: 0,body_polarity,body_subjectivity,is_asshole,title_body_combo
0,-0.156818,0.656818,1,I write an explanation in til and come off a...
1,0.034848,0.449242,1,throw my parent donut awaymy parent be diabe...
2,0.000000,0.000000,0,I tell a goth girl she look like a clownI be four
3,0.000000,0.000000,1,argument I have with another redditor in rhi...
4,0.040104,0.369792,1,have a disagreement about le miserable with ...
...,...,...,...,...
97536,0.016111,0.385278,0,for tell my sister she be be a spoiled bratm...
97537,0.068461,0.474614,0,for tell my husband to f off after he relent...
97538,-0.076333,0.493467,0,for attempt to keep my student out of adult ...
97539,0.067130,0.426132,0,if I leave my brother fate up to the statea ...


In [4]:
X = aita.drop('is_asshole', axis=1)
y = aita['is_asshole']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=11)

In [5]:
X_train.shape, X_test.shape

((96480, 3), (975, 3))

## Instantiate CountVectorizer 

In [5]:
# stop = set(stopwords.words('english'))
# vec = CountVectorizer(stop_words=stop, min_df=50, max_df=0.8, ngram_range=(1, 3))
# cv = vec.fit(X_train['title_body_combo'])

### Save fitted CountVectorizer to use later

In [102]:
# pickle.dump(cv, open('models/cv_fit_train.sav', 'wb'))
# len(vec.get_feature_names())

44051

### Load fitted CountVectorizer

In [6]:
cv = pickle.load(open('models/cv_fit_train.sav', 'rb'))

In [7]:
cv_train = cv.transform(X_train['title_body_combo'])
cv_test = cv.transform(X_test['title_body_combo'])

In [11]:
dtm_train = pd.DataFrame(cv_train.toarray(), columns=cv.get_feature_names())
train_word_count = dtm_train.sum(axis=0)
train_word_count.sort_values(ascending=False)[:10]

get       228922
go        198113
say       197408
tell      169258
would     165609
want      159291
like      139417
friend    132208
time      122214
know      108087
dtype: int64

In [12]:
dtm_test = pd.DataFrame(cv_test.toarray(), columns=cv.get_feature_names())
test_word_count = dtm_test.sum(axis=0)
test_word_count.sort_values(ascending=False)[:10]

get       2354
say       1991
go        1967
tell      1704
want      1680
would     1608
like      1352
friend    1291
time      1262
know      1147
dtype: int64

In [13]:
dtm_train.shape, dtm_test.shape

((96480, 44051), (975, 44051))

### Save / Load Document Term Matrix to / from csv

In [109]:
# dtm_train.to_csv('data/dtm_train.csv', index=False)
# dtm_test.to_csv('data/dtm_test.csv', index=False)

dtm_train = pd.read_csv('data/dtm_train.csv')
dtm_test = pd.read_csv('data/dtm_test.csv')

## PCA

In [None]:
pca = IncrementalPCA(n_components=800, batch_size=850)
pca.fit(dtm_train)
print(pca.explained_variance_ratio_.sum())

### Save / Load fitted PCA Model

In [None]:
pickle.dump(pca, open('models/pca.sav', 'wb'))

# pca = pickle.load(open('models/pca.sav', 'rb'))
# pca

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('PCA Scree Plot')
# plt.axvline(linewidth=4, color='r', linestyle = '--', x=10, ymin=0, ymax=1)
evr = pca.explained_variance_ratio_
cvr = np.cumsum(pca.explained_variance_ratio_)
pca_df = pd.DataFrame()
pca_df['Cumulative Variance Ratio'] = cvr
pca_df['Explained Variance Ratio'] = evr
# display(pca_df.head(10))

### Transform train and test Document-Term-matrix with PCA

In [44]:
dtm_train_pca = pca.transform(dtm_train)
dtm_test_pca = pca.transform(dtm_test)

Build DataFrame of PCA Components

In [64]:
col_names = ['PC_' + str(i) for i in range(1, pca.get_params()['n_components']+1)]
dtm_train_pca_df = pd.DataFrame(dtm_train_pca, columns=col_names)
dtm_test_pca_df = pd.DataFrame(dtm_test_pca, columns=col_names)

Combine PCA components and sentiment analysis scores into one DataFrame

In [69]:
X_train_post_dtm_pca = pd.concat([X_train[['body_polarity','body_subjectivity']].reset_index(drop=True), dtm_train_pca_df], axis=1)
X_test_post_dtm_pca = pd.concat([X_test[['body_polarity','body_subjectivity']].reset_index(drop=True),dtm_test_pca_df], axis=1)

In [72]:
X_train_post_dtm_pca.head()

Unnamed: 0,body_polarity,body_subjectivity,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,...,PC_791,PC_792,PC_793,PC_794,PC_795,PC_796,PC_797,PC_798,PC_799,PC_800
0,0.045523,0.447194,-3.2301,-0.506262,-0.139275,1.216879,-0.989355,-0.290268,-0.460918,0.695942,...,-0.305248,-0.081111,-0.22889,0.225672,-0.131362,-0.158019,-0.091304,0.118803,-0.142652,0.020049
1,0.123488,0.48796,-1.69964,2.655335,-3.511593,0.71025,-1.468153,-0.420901,-0.444954,-0.457874,...,0.000567,-0.002906,-0.033091,-0.162508,-0.052741,0.32119,-0.363125,-0.091853,0.271374,0.200574
2,0.366667,0.689286,-4.890311,0.851776,0.387777,-0.448441,0.932522,0.722537,0.917917,0.125428,...,0.01683,0.018513,-0.056724,-0.201364,0.031839,-0.126647,0.095873,0.089574,-0.006338,0.071553
3,0.111569,0.346238,2.095763,-4.452893,-1.538845,2.611905,-3.823879,2.705971,-1.287494,-1.660388,...,0.128382,-0.090228,-0.156075,0.595058,-0.078841,-0.040142,0.214096,0.097223,-0.047814,0.137292
4,-0.013474,0.466071,-2.871271,0.770379,3.248285,0.054712,-1.480608,-0.512396,-0.607933,-0.719146,...,-0.134697,-0.139258,0.085203,0.153525,-0.224926,0.189807,0.074606,0.283557,0.327423,0.715444


In [75]:
print(X_train_post_dtm_pca.shape, y_train.shape)
print(X_test_post_dtm_pca.shape, y_test.shape)

(68218, 802) (68218,)
(29237, 802) (29237,)


## Random Forest Models

### 1. Vanilla Random Forest, no tuning hyperparameters 

In [76]:
rfc = RandomForestClassifier()
rfc.fit(X_train_post_dtm_pca, y_train)

RandomForestClassifier()

In [78]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [79]:
print(f'Training Accuracy = {rfc.score(X_train_post_dtm_pca, y_train)}')

Training Accuracy = 1.0


In [80]:
print(f'Testing Accuracy = {rfc.score(X_test_post_dtm_pca, y_test)}')

Testing Accuracy = 0.7267161473475391


In [83]:
cross_val_score(rfc, X_test_post_dtm_pca, y_test, cv=5, scoring='accuracy')

array([0.72708618, 0.72435021, 0.72601334, 0.72721054, 0.72618437])