In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
from collections import Counter

In [4]:
from imblearn.under_sampling import RandomUnderSampler

In [5]:
# # # function to cut off extra paragraphs from the Beautiful Soup scrape

# def split_trim(string):
#     text = string.split(" ")
#     text = text[3:-14]
#     return text

In [6]:
# function to vectorize the type_of_material series into a y target vector.

def vectorize_type(ser):
    """Encode article-type labels as a binary integer target.

    'Op-Ed' becomes 1 and 'News' becomes 0. Values that are already 0 or 1
    are kept as they are, so calling the function on its own output is a
    no-op.

    Parameters
    ----------
    ser : pandas.Series
        Labels to encode. The input Series is not modified.

    Returns
    -------
    pandas.Series
        An int64 Series with the same index and name as ``ser``.

    Raises
    ------
    ValueError
        If ``ser`` holds anything other than 'Op-Ed', 'News', 0 or 1
        (missing values included).
    """
    label_codes = {'Op-Ed': 1, 'News': 0}
    # Already-encoded values map to themselves.
    lookup = {**label_codes, **{code: code for code in label_codes.values()}}

    # map() turns every value that is not a key of `lookup` into NaN, which
    # makes unexpected labels detectable instead of silently passing through.
    y = ser.map(lookup)
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted({repr(label) for label in ser[unmapped]})
        raise ValueError(
            "unexpected type_of_material label(s): " + ", ".join(unexpected)
        )

    # Cast explicitly so the target is always an integer dtype, no matter how
    # pandas infers the dtype of the mapped result.
    return y.astype('int64')

In [7]:
%%time
# Load the 2019 articles; the CSV's 'Unnamed: 0' column becomes the index.
_2019 = pd.read_csv(
    'data/2019_text_type.csv',
    index_col='Unnamed: 0',
)

CPU times: user 1.65 s, sys: 234 ms, total: 1.89 s
Wall time: 1.89 s


In [8]:
%%time
# Work on a copy so the frame loaded from disk stays untouched.
_2019_df = _2019.copy(deep=True)

CPU times: user 862 µs, sys: 177 µs, total: 1.04 ms
Wall time: 1.03 ms


In [9]:
# X: the article text column, still a pandas Series of unsplit strings

X = _2019_df['text']

In [10]:
# y: the article-type column (op-ed or news), not yet encoded

y = _2019_df['type_of_material']

In [11]:
%%time
# vectorize y into (1, 0) (op-ed, news)
# Encode straight from the source column rather than from `y` itself, so
# this cell gives the same result however many times it is run.

y = vectorize_type(_2019_df.type_of_material)

CPU times: user 13.1 ms, sys: 1.85 ms, total: 14.9 ms
Wall time: 14.1 ms


In [12]:
# %%time
# #split X for vectorization, lolz

# corpus = X.apply(lambda x: split_trim(x))

In [13]:
# The vectorizer is fed a plain Python list of documents.

corpus = X.tolist()

# vectorize X 

In [14]:
%%time
#create vectorizer
# TF-IDF features over the corpus, with scikit-learn's built-in English
# stop-word list removed.
# NOTE(review): the vectorizer is fit on the whole corpus before the
# train/test split further down, so the vocabulary and IDF weights are
# learned from rows that later land in the test set. Consider splitting
# first and fitting on the training rows only.
# NOTE(review): this rebinds X from the text Series to the TF-IDF matrix,
# so re-running the `corpus = list(X)` cell afterwards no longer yields text.

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

CPU times: user 26.8 s, sys: 413 ms, total: 27.3 s
Wall time: 27.3 s


In [15]:
%%time
# get_feature_names() was deprecated in scikit-learn 1.0 and later removed;
# use its replacement when available and fall back only on older releases.
# Note: the replacement returns a NumPy array rather than a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

CPU times: user 180 ms, sys: 3.26 ms, total: 184 ms
Wall time: 183 ms


In [16]:
# number of terms in the fitted vocabulary
len(feature_names)

219112

# resample class size w/ imbalanced learn

In [17]:
# sanity check: one label per article
y.shape

(41748,)

In [18]:
# sanity check: (articles, vocabulary terms); row count should match y
X.shape

(41748, 219112)

In [19]:
%%time
#balance the classes
# RandomUnderSampler is already imported in the import cells at the top of
# the notebook, so it is not imported again here. random_state pins which
# rows are kept.

rus = RandomUnderSampler(random_state=0)

CPU times: user 30 µs, sys: 0 ns, total: 30 µs
Wall time: 35 µs


In [20]:
%%time
# Undersample: X, y --> X_resampled, y_resampled
X_resampled, y_resampled = rus.fit_resample(X, y)


# Print (label, count) pairs for the resampled target; here 4139 per class.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0, 4139), (1, 4139)]
CPU times: user 36.1 ms, sys: 10 ms, total: 46.1 ms
Wall time: 45.2 ms


In [21]:
# sanity check: total rows kept after undersampling
y_resampled.shape

(8278,)

In [22]:
# sanity check: same row count as y_resampled, feature count unchanged
X_resampled.shape

(8278, 219112)

In [23]:
%%time
# Hold out a test set from the balanced data. No test_size is passed, so
# scikit-learn's default split applies; random_state pins the shuffle.

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=50,
)

CPU times: user 13.1 ms, sys: 9.05 ms, total: 22.1 ms
Wall time: 21.3 ms


# Multinomial Naive Bayes/imbalanced learn/TFIDF vectorizer

In [24]:
%%time

# Fit a multinomial Naive Bayes model with default hyperparameters on the
# TF-IDF training features.
multinm_clf = MultinomialNB()
multinm_clf.fit(X_train, y_train)

CPU times: user 24.6 ms, sys: 14.4 ms, total: 39 ms
Wall time: 37.9 ms


MultinomialNB()

In [25]:
# number of training samples seen for each class
multinm_clf.class_count_

array([3119., 3089.])

In [26]:
# Naive Bayes predictions for the held-out test rows.
# NOTE(review): the random-forest section further down reassigns y_pred.
y_pred = multinm_clf.predict(X_test)

### clf.score = accuracy = 'true'(pos/neg) / total

In [27]:
# clf.score = accuracy = (true positives + true negatives) / total predictions

multinm_clf.score(X_test, y_test)

0.7840579710144927

In [28]:
# rows: actual class, columns: predicted class (0 = news, 1 = op-ed)
confusion_matrix(y_test, y_pred)

array([[ 590,  430],
       [  17, 1033]])

In [29]:
# Flatten the 2x2 matrix into its four cells; op-ed (1) is the positive class.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(590, 430, 17, 1033)

In [30]:
# accuracy: share of all test predictions that are correct

accuracy = (tn + tp) / (tn + fp + fn + tp)
accuracy

0.7840579710144927

In [31]:
# recall: share of the actual op-ed articles that were predicted as op-ed

recall = tp / (fn + tp)
recall

0.9838095238095238

In [32]:
# precision: share of the articles predicted as op-ed that really are op-ed

precision = tp / (fp + tp)
precision

0.7060833902939166

# random forest classifier

In [33]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [34]:
%%time
# Shallow random forest: every tree is capped at depth 2; random_state makes
# the run repeatable.
rf_clf = RandomForestClassifier(
    max_depth=2,
    random_state=0,
)

CPU times: user 61 µs, sys: 10 µs, total: 71 µs
Wall time: 76.1 µs


In [35]:
%%time
# Train the forest on the same split used for the Naive Bayes model.
rf_clf.fit(X_train, y_train)

CPU times: user 398 ms, sys: 20.7 ms, total: 419 ms
Wall time: 419 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [36]:
# Random-forest predictions for the held-out test rows.
# NOTE(review): this replaces the Naive Bayes predictions held in y_pred.
y_pred = rf_clf.predict(X_test)

In [37]:
# clf.score = accuracy = (true positives + true negatives) / total predictions

rf_clf.score(X_test, y_test)

0.9478260869565217

In [38]:
# rows: actual class, columns: predicted class (0 = news, 1 = op-ed)
confusion_matrix(y_test, y_pred)

array([[990,  30],
       [ 78, 972]])

In [39]:
# Flatten the 2x2 matrix into its four cells; op-ed (1) is the positive class.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(990, 30, 78, 972)

In [40]:
# accuracy: share of all test predictions that are correct

accuracy = (tn + tp) / (tn + fp + fn + tp)
accuracy

0.9478260869565217

In [41]:
# recall: share of the actual op-ed articles that were predicted as op-ed

recall = tp / (fn + tp)
recall

0.9257142857142857

In [42]:
# precision: share of the articles predicted as op-ed that really are op-ed

precision = tp / (fp + tp)
precision

0.9700598802395209