In [98]:
import pandas as pd

In [99]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [100]:
from collections import Counter

In [101]:
from imblearn.under_sampling import RandomUnderSampler

In [158]:
# Function to cut the boilerplate off both ends of a Beautiful Soup scrape.

def trim_fat(string, head=35, tail=115):
    """Return ``string`` without its first ``head`` and last ``tail`` characters.

    The default ``head=35`` is the length of the
    ``['Advertisement', 'Supported by', '`` prefix that opens the scraped
    article strings displayed in this notebook. ``tail=115`` is presumably the
    length of the matching footer boilerplate (TODO confirm against raw data).

    Parameters
    ----------
    string : str
        Raw scraped text.
    head : int, default 35
        Number of leading characters to drop.
    tail : int, default 115
        Number of trailing characters to drop.

    Returns
    -------
    str
        The trimmed text; empty when ``string`` is too short to have anything
        left between the two cuts.
    """
    # Slice to an explicit end index: ``string[head:-tail]`` would return ''
    # for tail == 0 instead of keeping the whole tail.
    end = max(len(string) - tail, 0)
    return string[head:end]

In [159]:
# Encode the type_of_material labels as a numeric y target vector.

def vectorize_type(ser):
    """Return a new Series with 'Op-Ed' replaced by 1 and 'News' replaced by 0.

    Any other value passes through unchanged; the input Series is not modified.
    """
    label_codes = {'Op-Ed': 1, 'News': 0}
    return ser.replace(label_codes)

In [304]:
# Print accuracy, recall and precision computed from confusion-matrix counts.
def metrics_(tn, fp, fn, tp):
    """Print accuracy, recall and precision for a binary confusion matrix.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True-negative, false-positive, false-negative and true-positive
        counts, in the order produced by ``confusion_matrix(...).ravel()``.

    Notes
    -----
    A metric whose denominator is zero (e.g. precision when nothing was
    predicted positive) is printed as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio becomes nan so the remaining metrics still print.
        return numerator / denominator if denominator else float('nan')

    # Share of all predictions that were correct.
    accuracy = _ratio(tp + tn, tn + fn + tp + fp)
    print(f'accuracy = {accuracy}')
    # Share of the actual positives that were predicted positive.
    recall = _ratio(tp, tp + fn)
    print(f'recall = {recall}')
    # Share of the predicted positives that are actually positive.
    precision = _ratio(tp, tp + fp)
    print(f'precision = {precision}')

In [160]:
%%time
# Load data/2019_text_type.csv; the CSV's 'Unnamed: 0' column becomes the index.
_2019 = pd.read_csv('data/2019_text_type.csv', index_col='Unnamed: 0')

CPU times: user 1.74 s, sys: 342 ms, total: 2.08 s
Wall time: 2.1 s


In [161]:
%%time
# Work on a copy so the frame loaded from disk stays untouched.
_2019_df = _2019.copy()

CPU times: user 807 µs, sys: 20 µs, total: 827 µs
Wall time: 797 µs


In [165]:
# X starts out as the raw 'text' column: a pandas Series of unsplit strings.

X = _2019_df['text']

In [166]:
# Peek at the raw text: the displayed entries are stringified lists that open
# with 'Advertisement', 'Supported by' boilerplate.
X

0        ['Advertisement', 'Supported by', 'From the Tr...
1        ['Advertisement', 'Supported by', 'Imagine wha...
2        ['Advertisement', 'Supported by', 'Can the Con...
3        ['Advertisement', 'Supported by', 'The Christi...
4        ['Advertisement', 'Supported by', 'The United ...
                               ...                        
41743    ['Advertisement', 'Supported by', 'Chief Justi...
41744    ['Advertisement', 'Supported by', 'Zaosong Zhe...
41745    ['Advertisement', 'Supported by', 'The 2010s, ...
41746    ['Advertisement', 'Supported by', 'After recei...
41747    ['Advertisement', 'Supported by', 'By Coral Da...
Name: text, Length: 41748, dtype: object

In [168]:
# Trim the scrape boilerplate from every article. Reading from the source
# column (instead of X = f(X)) keeps this cell idempotent: re-running it never
# trims twice, and it still works after X is later rebound to the TF-IDF matrix.
X = _2019_df.text.apply(trim_fat)

In [171]:
type(X)

pandas.core.series.Series

In [172]:
# y holds the article type of each row (op-ed or news).

y = _2019_df['type_of_material']

In [173]:
%%time
# Encode y as integers via vectorize_type: 1 for 'Op-Ed', 0 for 'News'.

y = vectorize_type(y)

CPU times: user 12.6 ms, sys: 1.99 ms, total: 14.6 ms
Wall time: 13.5 ms


In [174]:
# %%time
# #split X for vectorization, lolz

# corpus = X.apply(lambda x: split_trim(x))

In [175]:
# Materialise the Series as a plain Python list of documents for the vectorizer.

corpus = X.tolist()

# vectorize X 

In [262]:
%%time
# Create the TF-IDF vectorizer (English stop words removed, accents stripped to
# ASCII), then learn the vocabulary from the corpus and transform it.
# Note: X is rebound here from the text Series to the document-term matrix.
# NOTE(review): the vectorizer is fit on the whole corpus before the train/test
# split further down, so test documents influence the learned features.

vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
X = vectorizer.fit_transform(corpus)

CPU times: user 28.8 s, sys: 754 ms, total: 29.6 s
Wall time: 29.8 s


In [263]:
X.shape

(41748, 237438)

In [264]:
%%time
feature_names = vectorizer.get_feature_names()

CPU times: user 225 ms, sys: 5.71 ms, total: 231 ms
Wall time: 232 ms


In [265]:
%%time
# Retrieve the effective stop-word list the vectorizer built from stop_words='english'.
stop_words = vectorizer.get_stop_words()

CPU times: user 12 µs, sys: 5 µs, total: 17 µs
Wall time: 30 µs


In [266]:
feature_names;

In [267]:
len(feature_names)

237438

In [268]:
len(stop_words)

318

# resample class size w/ imbalanced learn

In [269]:
y.shape

(41748,)

In [270]:
X.shape

(41748, 237438)

In [271]:
%%time
#balance the classes

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

CPU times: user 150 µs, sys: 5.96 ms, total: 6.11 ms
Wall time: 6.08 ms


In [272]:
%%time
# X, y --> X_resampled, y_resampled
# NOTE(review): resampling happens before the train/test split, so the held-out
# set is class-balanced too rather than reflecting the original class mix.
X_resampled, y_resampled = rus.fit_resample(X, y)


# Print the sorted (label, count) pairs to confirm both classes are the same
# size (4139 each in the recorded run).
print(sorted(Counter(y_resampled).items()))

[(0, 4139), (1, 4139)]
CPU times: user 31.3 ms, sys: 42.5 ms, total: 73.9 ms
Wall time: 73 ms


In [273]:
y_resampled.shape

(8278,)

In [274]:
X_resampled.shape

(8278, 237438)

In [277]:
%%time
# Train/test split of the balanced data using train_test_split's default test
# size; random_state pins the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=50)

CPU times: user 13.8 ms, sys: 15.5 ms, total: 29.3 ms
Wall time: 29 ms


# Multinomial Naive Bayes/imbalanced learn/TFIDF vectorizer

In [278]:
# %%time

# multinm_clf = MultinomialNB()
# multinm_clf.fit(X_train, y_train)

In [279]:
#multinm_clf.class_count_

In [280]:
#y_pred = multinm_clf.predict(X_test)

In [281]:
# #clf.score = accuracy = 'true'(pos/neg) / total

# multinm_clf.score(X_test, y_test)

In [282]:
#confusion_matrix(y_true = y_test, y_pred = y_pred)

In [283]:
# tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
# (tn, fp, fn, tp)

In [284]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [285]:
# # rate of actual op-ed articles, out of all the actual od-ed articles

# recall = (tp) / (tp + fn)
# recall

In [286]:
# # rate of correct predictions of op-ed articles out of all predictions

# precision = (tp) / (tp + fp)
# precision

In [287]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# print(f'accuracy = {accuracy}')
# recall = (tp) / (tp + fn)
# print(f'recall = {recall}')
# precision = (tp) / (tp + fp)
# print(f'precision = {precision}')

# random forest classifier

In [288]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [289]:
%%time
# Random forest restricted to very shallow trees (max_depth=2); random_state
# fixes the forest's randomness so results are repeatable.

rf_clf = RandomForestClassifier(max_depth=2, random_state=0)

CPU times: user 53 µs, sys: 0 ns, total: 53 µs
Wall time: 55.8 µs


In [290]:
%%time
# Fit the forest on the training split only.

rf_clf.fit(X_train, y_train)

CPU times: user 414 ms, sys: 23.8 ms, total: 438 ms
Wall time: 442 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [291]:
# Predicted labels (0 = news, 1 = op-ed) for the held-out test split.
y_pred = rf_clf.predict(X_test)

In [292]:
# clf.score = accuracy = correct predictions (true pos + true neg) / total

rf_clf.score(X_test, y_test)

0.9241545893719807

In [293]:
# 2x2 confusion matrix: rows are true labels, columns are predicted labels,
# both ordered news (0) then op-ed (1).
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([[991,  29],
       [128, 922]])

In [294]:
# Flatten the 2x2 matrix row by row into its four counts, then display them.
tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
(tn, fp, fn, tp)

(991, 29, 128, 922)

In [295]:
# accuracy: share of all predictions that were correct

accuracy = (tp + tn) / (tn + fn + tp + fp)
accuracy

0.9241545893719807

In [296]:
# recall: share of the actual op-ed articles that the model labelled op-ed

recall = (tp) / (tp + fn)
recall

0.878095238095238

In [297]:
# precision: share of the articles predicted op-ed that really are op-eds

precision = (tp) / (tp + fp)
precision

0.9695057833859095

In [298]:
# Accuracy, recall and precision printed together. This calls the metrics_
# helper defined near the top of the notebook instead of repeating its body.

metrics_(tn, fp, fn, tp)

accuracy = 0.9241545893719807
recall = 0.878095238095238
precision = 0.9695057833859095


In [299]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [300]:
# NOTE(review): this rebinds rf_clf to a fresh, unfitted forest with the same
# settings; the model fitted earlier is no longer reachable through this name.
rf_clf = RandomForestClassifier(max_depth=2, random_state=0)

In [252]:
# X_resampled is the resampled TF-IDF document-term matrix (a sparse matrix,
# not a "space" matrix): one row per article, one column per vocabulary term.

X_resampled.shape

(8278, 221213)

In [253]:
y_resampled.shape

(8278,)

# what is this random forest doing?
1. Takes all of X and y, i.e. my texts as TF-IDF vectors and their class labels.
2. Draws a random (bootstrap) sample of the 8278 instances (TF-IDF vectors) and considers a random subset of the 219112 features to make the best decision at each split.
3. Bags/bootstraps that model.
4. Does it again a bunch of times.

# reuben test

In [215]:
# rf_clf_test = RandomForestClassifier()

In [216]:
# %%time
# rf_clf.fit(X_train, y_train)

In [217]:
# y_pred = rf_clf.predict(X_test)

In [218]:
# confusion_matrix(y_true = y_test, y_pred = y_pred)

# accuracy, recall, precision

In [305]:
# #rate of correct predictions out of total predictions
# def metrics_(tn, fp, fn, tp):
#     accuracy = (tp + tn) / (tn + fn + tp + fp)
#     print(f'accuracy = {accuracy}')
#     recall = (tp) / (tp + fn)
#     print(f'recall = {recall}')
#     precision = (tp) / (tp + fp)
#     print(f'precision = {precision}')

In [306]:
#metrics_(tn, fp, fn, tp)

# feature engineering

In [220]:
# get bag of words
# get sparse matrix
# overlay bag of words onto sparse matrix
# argsort to find the most important (highest-scoring) words, i.e. the ones the forest splits on with the most information gain / least entropy

In [221]:
len(feature_names)

221213

In [222]:
# Mapping from each term to its column index in the TF-IDF matrix.
# NOTE(review): displaying the whole dict dumps the full vocabulary; consider
# showing only a small sample.
vectorizer.vocabulary_

{'treaty': 199342,
 'versailles': 208320,
 'prohibition': 155231,
 'events': 65915,
 'year': 217896,
 'shaped': 176498,
 'america': 10895,
 'world': 215513,
 'century': 36296,
 'come': 42369,
 'ted': 193427,
 'widmer': 213752,
 'mr': 130435,
 'distinguished': 56295,
 'lecturer': 111549,
 'macaulay': 117136,
 'honors': 89953,
 'college': 41988,
 'city': 40011,
 'university': 204772,
 'new': 134950,
 'york': 218521,
 '2019': 1602,
 'times': 196508,
 'opinion': 140971,
 'section': 174157,
 'publish': 156270,
 'occasional': 139171,
 'series': 175441,
 'essays': 65123,
 'ways': 212048,
 '1919': 1398,
 'following': 72075,
 'essay': 65116,
 'crack': 46428,
 'scott': 173402,
 'fitzgerald': 70796,
 'wrote': 215864,
 'test': 194485,
 'rate': 159813,
 'intelligence': 95891,
 'ability': 5373,
 'hold': 89382,
 'opposed': 141036,
 'ideas': 92563,
 'mind': 126483,
 'time': 196472,
 'retain': 164195,
 'function': 74604,
 'day': 50482,
 'headlines': 86481,
 'gave': 76524,
 'hint': 88663,
 'difficult': 

In [223]:
# NOTE(review): this displays every feature name; a slice such as
# feature_names[:20] would keep the output readable.
feature_names

['00',
 '000',
 '0000',
 '0000001',
 '000001',
 '0000044',
 '00001',
 '0000797113',
 '0001',
 '0002',
 '00025',
 '0003',
 '0004',
 '0005',
 '0008',
 '000ers',
 '000s',
 '000th',
 '001',
 '0010',
 '0012',
 '0014',
 '0017',
 '001st',
 '002',
 '0020',
 '0025',
 '0026',
 '0028',
 '003',
 '0033',
 '0039',
 '004',
 '0042',
 '0046',
 '005',
 '0051',
 '006',
 '0064',
 '0065',
 '0069',
 '007',
 '0077',
 '007s',
 '008',
 '0083',
 '0086',
 '009',
 '00s',
 '01',
 '010',
 '0100',
 '0102',
 '011',
 '0115',
 '0116',
 '012',
 '013',
 '0139',
 '014',
 '015',
 '0153',
 '016',
 '0160',
 '017',
 '01740964',
 '018',
 '0180',
 '0186',
 '019',
 '01am',
 '01kylb8mda',
 '02',
 '020',
 '0200',
 '0202',
 '0203',
 '021',
 '0211',
 '0216',
 '021672',
 '022',
 '0221',
 '0222',
 '023',
 '0230',
 '0236',
 '024',
 '0248',
 '025',
 '0251',
 '0255',
 '026',
 '027',
 '0271',
 '0277',
 '028',
 '0280',
 '029',
 '0292',
 '0299',
 '03',
 '030',
 '0302',
 '0303',
 '031',
 '031019',
 '0312',
 '0313',
 '0318',
 '032',
 '032c',
