Here we're playing with time!!! 

In [1]:
import numpy as np
import time
import pandas as pd
import nltk

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [29]:
poli_dis_2010_df = pd.read_csv('./data/poli_dis_2010.csv')
poli_dis_2020_df = pd.read_csv('./data/poli_dis_2020.csv')

In [30]:
poli_dis_2010_df['year'] = 0
poli_dis_2020_df['year'] = 1

In [31]:
poli_dis_2020_df['year']

0      1
1      1
2      1
3      1
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: year, Length: 1000, dtype: int64

In [32]:
poli_dis_2010_df.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'created_utc',
       'domain', 'full_link', 'id', 'is_self', 'media_embed', 'num_comments',
       'over_18', 'permalink', 'score', 'selftext', 'subreddit',
       'subreddit_id', 'thumbnail', 'title', 'url', 'author_created_utc',
       'author_fullname', 'edited', 'year'],
      dtype='object')

In [33]:
poli_dis_2010_df['id'].nunique()

1000

In [34]:
poli_dis_2010_df.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'created_utc',
       'domain', 'full_link', 'id', 'is_self', 'media_embed', 'num_comments',
       'over_18', 'permalink', 'score', 'selftext', 'subreddit',
       'subreddit_id', 'thumbnail', 'title', 'url', 'author_created_utc',
       'author_fullname', 'edited', 'year'],
      dtype='object')

In [35]:
poli_dis_2010_df['selftext']

0      Excellent history lesson on the economy for al...
1                                                    NaN
2       I always hear Communist Muslim Pinko not born...
3      The task of this branch is to hold the other 3...
4      Hey reddit, why do we care so much about SOPA ...
                             ...                        
995    What are you thoughts about it? Is it a good t...
996    I am not concerned with whether the law is con...
997    Does the electoral college work?   \nIf you ar...
998    This is not meant to be divisive or even philo...
999    I am not American, but I'm following the news ...
Name: selftext, Length: 1000, dtype: object

In [36]:
poli_dis_2020_df['id'].nunique()

1000

In [37]:
frames = [poli_dis_2010_df, poli_dis_2020_df]
master_df = pd.concat(frames)

In [38]:
master_df

Unnamed: 0,author,author_flair_css_class,author_flair_text,created_utc,domain,full_link,id,is_self,media_embed,num_comments,...,suggested_sort,total_awards_received,whitelist_status,wls,author_cakeday,author_flair_template_id,author_flair_text_color,author_flair_background_color,banned_by,distinguished
0,[deleted],,,1326374124,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,odv2p,True,{},5,...,,,,,,,,,,
1,magister0,,,1326366955,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,odsmk,True,{},0,...,,,,,,,,,,
2,bigdog6286,,,1326344639,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,odiiu,True,{},40,...,,,,,,,,,,
3,anonoman925,,,1326339753,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,odet1,True,{},20,...,,,,,,,,,,
4,CallMeBrimstone,,,1326332063,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,od8jy,True,{},2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,WaterzGrace,,,1585331139,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,fq1n07,True,,3,...,confidence,0.0,all_ads,6.0,,,,,,
996,curious_thoughts_uk,,,1585330878,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,fq1jqd,True,,3,...,confidence,0.0,all_ads,6.0,,,,,,
997,patrick-thegamerdad,,,1585330173,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,fq1azc,True,,0,...,confidence,0.0,all_ads,6.0,,,,,,
998,steelbaxton,,,1585329457,self.PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,fq1263,True,,1,...,confidence,0.0,all_ads,6.0,,,,,,


In [39]:
master_df['year']

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    1
999    1
Name: year, Length: 2000, dtype: int64

In [40]:
text_df = master_df[['year', 'selftext']].copy()

In [41]:
text_df['selftext'].nunique()

842

In [42]:
text_df.isnull().sum()

year          0
selftext    151
dtype: int64

In [43]:
#https://stackoverflow.com/questions/44227748/removing-newlines-from-messy-strings-in-pandas-dataframe-cells
text_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True) 

In [44]:
master_df['year']

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    1
999    1
Name: year, Length: 2000, dtype: int64

In [45]:
#text_df['year'] = text_df['year'].map({2010:1,2020:0})

In [46]:
text_df['year'].value_counts(normalize=True)

0    0.5
1    0.5
Name: year, dtype: float64

In [47]:
text_df['selftext'][:5]

0    Excellent history lesson on the economy for al...
1                                                  NaN
2     I always hear Communist Muslim Pinko not born...
3    The task of this branch is to hold the other 3...
4    Hey reddit, why do we care so much about SOPA ...
Name: selftext, dtype: object

In [48]:
lemmatizer = WordNetLemmatizer()


In [49]:
#text_df['body'] = text_df['body'].map(lemmatizer.lemmatize())

In [None]:
#text_df['body'] = text_df['body'].apply(', '.join)

In [61]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [62]:
def stem_stuff(line):
    tokenizer = RegexpTokenizer(r'\w+')
    line_tokens = tokenizer.tokenize(line.lower())
    stemmy = [stemmer.stem(word) for word in line_tokens]
    return stemmy

In [64]:
text_df['selftext'] = text_df['selftext'].map(stem_stuff)

AttributeError: 'float' object has no attribute 'lower'

In [20]:
text_df['body'] = text_df['body'].apply(', '.join)

In [21]:
text_df['body'][:5]

0     it, depend, on, how, fox, news, take, thi, stori
1    can, we, just, start, call, it, the, white, te...
2    sarah, palin, donkey, punch, donkey, punch, do...
3    gt, the, extrem, right, is, truli, about, feel...
4    gt, i, d, much, rather, have, a, less, radic, ...
Name: body, dtype: object

In [50]:
text_df['selftext'][:5]

0    Excellent history lesson on the economy for al...
1                                                  NaN
2     I always hear Communist Muslim Pinko not born...
3    The task of this branch is to hold the other 3...
4    Hey reddit, why do we care so much about SOPA ...
Name: selftext, dtype: object

In [51]:
X = text_df['selftext']
y = text_df['year']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state = 42, 
                                                    stratify = y)

In [53]:
X_train

538    I'm just curious, because this is the only pol...
879    And phase it in over 6-10 years to allow time ...
58                                             [removed]
9                                              [removed]
268    Native American Voting is making an impact and...
                             ...                        
836                                            [removed]
452    Does anyone else here think that copyright law...
387                                            [removed]
266                                            [removed]
537                                            [removed]
Name: selftext, Length: 1500, dtype: object

### Combine GridSearchCV and Bernoulli Naive Bayes model

In [54]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', BernoulliNB())
])

In [55]:
pipe_params = {
    'cvec__max_features':[2000, 3000, 4000, 5000],
    'cvec__min_df':[2, 3],
    'cvec__max_df':[.9, .95],
    'cvec__ngram_range':[(1, 1), (1, 2)]
}

In [56]:
gs = GridSearchCV(pipe, param_grid = pipe_params, cv = 5)

In [60]:
X_train

538    I'm just curious, because this is the only pol...
879    And phase it in over 6-10 years to allow time ...
58                                             [removed]
9                                              [removed]
268    Native American Voting is making an impact and...
                             ...                        
836                                            [removed]
452    Does anyone else here think that copyright law...
387                                            [removed]
266                                            [removed]
537                                            [removed]
Name: selftext, Length: 1500, dtype: object

In [57]:
gs.fit(X_train, y_train)

print(gs.best_score_)

print('train accuracy:', gs.score(X_train, y_train))

print('test accuracy:', gs.score(X_test, y_test))

Traceback (most recent call last):
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages\sklearn\pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\users\derya\appdata\local\programs\python\python39\lib\site-packages

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [58]:
type(X_test)

pandas.core.series.Series

In [59]:
preds = gs.predict(X_test)

NotFittedError: Vocabulary not fitted or provided

In [None]:
cm = confusion_matrix(y_test, preds)
tn, fp, fn, tp = cm.ravel()

ConfusionMatrixDisplay(cm).plot();

In [None]:
text_df

In [None]:
specificity = tn / (tn + fp)
print('specificity:', specificity)

### TFIDF Vectorizer

In [None]:
tvec = TfidfVectorizer()

In [None]:
X_train_df = pd.DataFrame(tvec.fit_transform(X_train).todense(),
                         columns = tvec.get_feature_names())

In [None]:
X_train_df.sum().sort_values(ascending=False).head(10).plot(kind='barh')

In [None]:
X_train_df.sum().sort_values(ascending=True).head(10).plot(kind='barh')

In [None]:
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
pipe_tvec_params = {
    'tvec__max_features': [2000, 3000, 4000, 5000],
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1, 1), (1, 2)]
}

In [None]:
gs_tvec = GridSearchCV(pipe_tvec, param_grid = pipe_tvec_params, cv=5)

In [None]:
gs_tvec.fit(X_train, y_train)

In [None]:
gs_tvec.score(X_train, y_train)

In [None]:
gs_tvec.score(X_test, y_test)

In [None]:
preds_tvec = gs_tvec.predict(X_test)

cm = confusion_matrix(y_test, preds_tvec)
tn, fp, fn, tp = cm.ravel()

specificity = tn / (tn + fp)
specificity

In [None]:
ConfusionMatrixDisplay(cm).plot();