In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
from collections import Counter

In [4]:
from imblearn.under_sampling import RandomUnderSampler

In [5]:
# Function to cut off the extra intro/outro boilerplate left by the Beautiful Soup scrape.

def trim_fat(string, head=35, tail=115):
    """Strip a fixed number of characters from both ends of ``string``.

    Parameters
    ----------
    string : str
        Raw scraped text.
    head : int, default 35
        Number of leading characters to drop.
    tail : int, default 115
        Number of trailing characters to drop.

    Returns
    -------
    str
        ``string`` without its first ``head`` and last ``tail`` characters
        (empty when nothing is left between the two cuts).
    """
    # Compute the end index explicitly: a plain ``string[head:-tail]`` would
    # return '' for tail == 0, because ``-0`` is just ``0``.
    stop = max(len(string) - tail, 0)
    return string[head:stop]

In [6]:
# Function to encode the type_of_material series as a y target vector.

def vectorize_type(ser):
    """Return a new series with article-type labels replaced by class codes.

    'Op-Ed' is replaced by 1 and 'News' by 0; values that match neither
    label are left as they are. The input series itself is not modified.
    """
    label_codes = {'Op-Ed': 1, 'News': 0}
    # replace() without inplace=True already hands back a fresh Series.
    return ser.replace(label_codes)

In [7]:
# Accuracy, recall and precision from the four confusion-matrix counts.
def metrics_(tn, fp, fn, tp):
    """Print and return accuracy, recall and precision.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True-negative, false-positive, false-negative and true-positive counts.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision)``. A metric whose denominator is zero
        (e.g. precision when nothing was predicted positive) is reported as
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio becomes nan rather than crashing the cell.
        return numerator / denominator if denominator else float('nan')

    # accuracy: correct predictions out of all predictions
    accuracy = _ratio(tp + tn, tn + fn + tp + fp)
    print(f'accuracy = {accuracy}')
    # recall: true positives out of all actual positives
    recall = _ratio(tp, tp + fn)
    print(f'recall = {recall}')
    # precision: true positives out of all predicted positives
    precision = _ratio(tp, tp + fp)
    print(f'precision = {precision}')
    return accuracy, recall, precision

In [8]:
%%time
# Load the 2019 text/type table; the 'Unnamed: 0' column is used as the row index.
_2019 = pd.read_csv(
    'data/2019_text_type.csv',
    index_col='Unnamed: 0',
)

CPU times: user 1.7 s, sys: 281 ms, total: 1.98 s
Wall time: 2 s


In [9]:
%%time
# Work on a copy so that _2019, as read from disk, is left unmodified.
_2019_df = _2019.copy()

CPU times: user 763 µs, sys: 67 µs, total: 830 µs
Wall time: 782 µs


In [62]:
# Define X: a pandas Series holding one unsplit text string per article.

X = _2019_df['text']

In [63]:
# Trim the scrape boilerplate from every article. Building X from the source
# column (rather than from X itself) keeps this cell idempotent: re-running it
# does not trim already-trimmed text a second time.
X = _2019_df.text.apply(trim_fat)

In [66]:
X.head()

0    From the Treaty of Versailles to Prohibition, ...
1    Imagine what we could do with our money, and h...
2    Can the Constitution withstand the partisans?'...
3    The Christian right doesn’t like the president...
4    The United States is spending beyond its means...
Name: text, dtype: object

In [64]:
type(X)

pandas.core.series.Series

In [14]:
# Define y: the Series of article types (op-ed or news).

y = _2019_df['type_of_material']

In [15]:
%%time
# Encode y as class codes via vectorize_type: 1 = op-ed, 0 = news.

y = vectorize_type(y)

CPU times: user 12.4 ms, sys: 1.58 ms, total: 14 ms
Wall time: 13.4 ms


In [16]:
# %%time
# #split X for vectorization, lolz

# corpus = X.apply(lambda x: split_trim(x))

In [17]:
# Convert the text Series into a plain Python list of documents.

corpus = X.tolist()

# vectorize X 

In [18]:
%%time
# Create a TF-IDF vectorizer (English stop words removed, accents stripped to
# ASCII) and fit it on the corpus; X is rebound to the resulting TF-IDF matrix.

vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
)
X = vectorizer.fit_transform(corpus)

CPU times: user 29.5 s, sys: 685 ms, total: 30.2 s
Wall time: 30.5 s


In [19]:
X.shape

(41748, 237438)

In [20]:
%%time
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# list() keeps feature_names a plain list, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

CPU times: user 221 ms, sys: 5.26 ms, total: 227 ms
Wall time: 228 ms


In [21]:
%%time
# Stop-word list the vectorizer was configured with (stop_words='english').
stop_words = vectorizer.get_stop_words()

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 11 µs


In [22]:
feature_names;

In [23]:
len(feature_names)

237438

In [24]:
len(stop_words)

318

# resample class size w/ imbalanced learn

In [25]:
y.shape

(41748,)

In [26]:
X.shape

(41748, 237438)

In [27]:
%%time
# Balance the classes with a seeded random under-sampler.
# (RandomUnderSampler is already imported in the import cells at the top of
# the notebook, so it is not re-imported here.)

rus = RandomUnderSampler(random_state=0)

CPU times: user 28 µs, sys: 0 ns, total: 28 µs
Wall time: 32.2 µs


In [28]:
%%time
# Under-sample: X, y --> X_resampled, y_resampled
X_resampled, y_resampled = rus.fit_resample(X, y)


# Print the sorted (class label, count) pairs of the resampled target to check
# that the classes are balanced (4139 each in this run).
print(sorted(Counter(y_resampled).items()))

[(0, 4139), (1, 4139)]
CPU times: user 31.5 ms, sys: 11.3 ms, total: 42.8 ms
Wall time: 42.1 ms


In [29]:
y_resampled.shape

(8278,)

In [30]:
X_resampled.shape

(8278, 237438)

In [31]:
%%time
# Split the resampled data into train and test sets (default split size, fixed seed).

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=50,
)

CPU times: user 14.6 ms, sys: 9.81 ms, total: 24.4 ms
Wall time: 24.2 ms


# Multinomial Naive Bayes/imbalanced learn/TFIDF vectorizer

In [32]:
# %%time

# multinm_clf = MultinomialNB()
# multinm_clf.fit(X_train, y_train)

In [33]:
#multinm_clf.class_count_

In [34]:
#y_pred = multinm_clf.predict(X_test)

In [35]:
# #clf.score = accuracy = 'true'(pos/neg) / total

# multinm_clf.score(X_test, y_test)

In [36]:
#confusion_matrix(y_true = y_test, y_pred = y_pred)

In [37]:
# tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
# (tn, fp, fn, tp)

In [38]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [39]:
# # rate of correctly predicted op-ed articles, out of all the actual op-ed articles

# recall = (tp) / (tp + fn)
# recall

In [40]:
# # rate of correct op-ed predictions, out of all op-ed predictions

# precision = (tp) / (tp + fp)
# precision

In [41]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# print(f'accuracy = {accuracy}')
# recall = (tp) / (tp + fn)
# print(f'recall = {recall}')
# precision = (tp) / (tp + fp)
# print(f'precision = {precision}')

# random forest classifier

In [42]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [43]:
%%time

# Random forest limited to depth-2 trees, with a fixed seed.
rf_clf = RandomForestClassifier(
    max_depth=2,
    random_state=0,
)

CPU times: user 54 µs, sys: 6 µs, total: 60 µs
Wall time: 63.9 µs


In [44]:
%%time

# Train the forest on the training split.
rf_clf.fit(X_train, y_train)

CPU times: user 429 ms, sys: 35.7 ms, total: 465 ms
Wall time: 469 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [45]:
# Predict class labels for the test split.
y_pred = rf_clf.predict(X_test)

In [46]:
# clf.score = accuracy on the test split = correct predictions (true pos + true neg) / total

rf_clf.score(X_test, y_test)

0.9241545893719807

In [47]:
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([[991,  29],
       [128, 922]])

In [48]:
# Flatten the 2x2 confusion matrix into its four counts and show them.
tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
(tn, fp, fn, tp)

(991, 29, 128, 922)

In [49]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [50]:
# # rate of correctly predicted op-ed articles, out of all the actual op-ed articles

# recall = (tp) / (tp + fn)
# recall

In [51]:
# # rate of correct op-ed predictions, out of all op-ed predictions

# precision = (tp) / (tp + fp)
# precision

In [52]:
# Accuracy, recall and precision of the random forest, computed from the
# confusion-matrix counts tn, fp, fn, tp.
# NOTE(review): this repeats the body of metrics_() defined earlier; consider calling it instead.

accuracy = (tp + tn) / (tn + fn + tp + fp)
print(f'accuracy = {accuracy}')
recall = (tp) / (tp + fn)
print(f'recall = {recall}')
precision = (tp) / (tp + fp)
print(f'precision = {precision}')

accuracy = 0.9241545893719807
recall = 0.878095238095238
precision = 0.9695057833859095


In [53]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

# what is this random forest doing?
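1. Takes X (the TF-IDF vectors of my article text) and y (the 0/1 news/op-ed labels).
2. For each tree, draws a random bootstrap sample of rows from the training split of the 8278 balanced articles, and at each split considers only a random subset of the 237438 TF-IDF features to find the best split.
3. Fits a shallow decision tree (max_depth=2) on that bootstrap sample; this is the bagging/bootstrapping step.
4. Repeats this for every tree in the forest (n_estimators defaults to 100) and combines the trees' predictions.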

# accuracy, recall, precision

In [54]:
# #rate of correct predictions out of total predictions
# def metrics_(tn, fp, fn, tp):
#     accuracy = (tp + tn) / (tn + fn + tp + fp)
#     print(f'accuracy = {accuracy}')
#     recall = (tp) / (tp + fn)
#     print(f'recall = {recall}')
#     precision = (tp) / (tp + fp)
#     print(f'precision = {precision}')

In [55]:
#metrics_(tn, fp, fn, tp)

# feature engineering

In [56]:
# get bag of words
# get sparse matrix
# overlay bag of words onto sparse matrix
# argsort to find the most important words (highest scores), i.e. the ones the forest splits on with the most information gain / least entropy

In [57]:
len(feature_names)

237438

In [58]:
vectorizer.vocabulary_

{'treaty': 214387,
 'versailles': 223831,
 'prohibition': 167331,
 'events': 70614,
 'year': 234159,
 'shaped': 189970,
 'america': 11509,
 'world': 231642,
 'century': 38941,
 'come': 45437,
 'ted': 208042,
 'widmer': 229706,
 'mr': 140847,
 'distinguished': 60370,
 'lecturer': 120071,
 'macaulay': 126248,
 'honors': 96692,
 'college': 45026,
 'city': 42916,
 'university': 220087,
 'new': 145604,
 'york': 234833,
 '2019': 1712,
 'times': 211372,
 'opinion': 152167,
 'section': 187439,
 'publish': 168379,
 'occasional': 150115,
 'series': 188827,
 'essays': 69784,
 'ways': 227822,
 '1919': 1419,
 'following': 77209,
 'essay': 69777,
 'crack': 49701,
 'scott': 186633,
 'fitzgerald': 75855,
 'wrote': 232017,
 'test': 209169,
 'rate': 172120,
 'intelligence': 102915,
 'ability': 5604,
 'hold': 96082,
 'opposed': 152234,
 'ideas': 99459,
 'mind': 136571,
 'time': 211336,
 'retain': 176684,
 'function': 79924,
 'years': 234177,
 'day': 54088,
 'headlines': 92868,
 'gave': 81982,
 'hint': 95

In [59]:
feature_names;

In [60]:
_2019_df.text.head(500)

0      ['Advertisement', 'Supported by', 'From the Tr...
1      ['Advertisement', 'Supported by', 'Imagine wha...
2      ['Advertisement', 'Supported by', 'Can the Con...
3      ['Advertisement', 'Supported by', 'The Christi...
4      ['Advertisement', 'Supported by', 'The United ...
                             ...                        
495    ['Advertisement', 'Supported by', 'By Zach Sch...
496    ['Advertisement', 'Supported by', 'transcript'...
497    ['Advertisement', 'Supported by', 'By Matt Ste...
498    ['Advertisement', 'Supported by', 'By Niki Kit...
499    ['Advertisement', 'Supported by', 'By Steven L...
Name: text, Length: 500, dtype: object

In [67]:
X

0        From the Treaty of Versailles to Prohibition, ...
1        Imagine what we could do with our money, and h...
2        Can the Constitution withstand the partisans?'...
3        The Christian right doesn’t like the president...
4        The United States is spending beyond its means...
                               ...                        
41743    Chief Justice John Roberts’s year-end report o...
41744    Zaosong Zheng, a promising cancer researcher, ...
41745    The 2010s, reviewed.', 'By Spencer Bokat-Linde...
41746    After receiving presidential clemency, Edward ...
41747    By Coral Davenport and Lisa Friedman', 'WASHIN...
Name: text, Length: 41748, dtype: object

In [69]:
corpus[:10]

["From the Treaty of Versailles to Prohibition, the events of that year shaped America, and the world, for a century to come. ', 'By Ted Widmer', 'Mr. Widmer is a distinguished lecturer at the Macaulay Honors College of the City University of New York.', 'Throughout 2019, The New York Times Opinion section will publish an occasional series of essays on the ways in which the events of 1919 shaped the following century. ', 'In his essay “The Crack-Up,” F. Scott Fitzgerald wrote, “The test of a first-rate intelligence is the ability to hold two opposed ideas in the mind at the same time, and still retain the ability to function.”', 'On New Year’s Day 1919, the headlines in The New York Times gave a hint of how difficult that would be for Americans, struggling to live up to the shimmering promises they had made to the world during the Great War, which had ended just over a month before.', 'Poles, newly independent but already threatened by their neighbors, were calling on Americans to prot

In [None]:
'Imagine what we could do with our money, and hours, if we set our phones aside for a year.\', \'By Paul Greenberg\', \'Mr. Greenberg is a fellow at the Safina Center.\', 