In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
from collections import Counter

In [4]:
from imblearn.under_sampling import RandomUnderSampler

# NLP Pipeline 

![](images/pipeline-walkthrough1.png)

Below is a to-do list for converting text into vector form:

**Clean text and Create a Bag of Words (BoW)**
> 1. Lowercase the text
> 2. Tokenize
> 3. Strip out punctuation or undesirable text
> 4. Remove stopwords
> 5. Stemming or lemmatizing
> 6. Compute n-grams
> 7. Use this to create the BoW

**Vectorize BoW**
> 8. Term frequencies
> 9. Document frequencies
> 10. TF-IDF
> 11. Normalize vectors

Let's go through both what each of these steps is and how to do it in Python, using the following corpus of articles labeled as op-ed or news...
 

In [5]:
#function to cut off extra intro paragraphs from beautiful soup scrape

# def trim_fat(string):
#     return string[35:-115]

In [6]:
# function to vectorize the type_of_material series into a y target vector.
def vectorize_type(ser, mapping=None):
    """Encode article-type labels as integers for use as a target vector.

    Parameters
    ----------
    ser : pandas.Series
        Series of article-type labels (e.g. ``'Op-Ed'`` / ``'News'``).
    mapping : dict, optional
        Label -> integer code. Defaults to ``{'Op-Ed': 1, 'News': 0}``.

    Returns
    -------
    pandas.Series
        A new Series in which every label found in ``mapping`` is replaced by
        its code. Labels that are not in ``mapping`` pass through unchanged,
        and ``ser`` itself is never modified.
    """
    if mapping is None:
        mapping = {'Op-Ed': 1, 'News': 0}
    # ``replace`` already returns a new Series, so neither an explicit copy
    # nor an ``inplace=True`` mutation is needed.
    return ser.replace(mapping)

In [7]:
#rate of correct predictions out of total predictions
def metrics_(tn, fp, fn, tp):
    """Print accuracy, recall and precision for a binary confusion matrix.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True-negative, false-positive, false-negative and true-positive
        counts, in the order produced by ``confusion_matrix(...).ravel()``.

    Notes
    -----
    A metric whose denominator is zero (for example precision when nothing
    was predicted positive) is printed as ``nan`` instead of raising
    ``ZeroDivisionError``. Nothing is returned; the metrics are only printed.
    """
    def _ratio(numerator, denominator):
        # Guard the division so an empty class or an empty set of positive
        # predictions does not crash the report.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(tp + tn, tn + fn + tp + fp)
    print(f'accuracy = {accuracy}')
    recall = _ratio(tp, tp + fn)
    print(f'recall = {recall}')
    precision = _ratio(tp, tp + fp)
    print(f'precision = {precision}')

In [8]:
%%time
_2019 = pd.read_csv('data/trim2019_text_type.csv', index_col='Unnamed: 0')

CPU times: user 1.56 s, sys: 268 ms, total: 1.83 s
Wall time: 1.84 s


In [9]:
%%time
_2019_df = _2019.copy()

CPU times: user 831 µs, sys: 177 µs, total: 1.01 ms
Wall time: 995 µs


In [11]:
#define X. X is currently pandas series of unsplit strings

X = _2019_df.text

In [12]:
type(X[0])

str

In [13]:
type(X)

pandas.core.series.Series

In [14]:
# define y as a series of op-ed or news

y = _2019_df.type_of_material

In [15]:
%%time
# vectorize y in to (1, 0) (op-ed, news)

y = vectorize_type(y)

CPU times: user 12.9 ms, sys: 2.44 ms, total: 15.4 ms
Wall time: 14.8 ms


In [17]:
#turn series into list...

corpus = list(X)

# sklearn TfidfVectorizer(stop_words='english', strip_accents=None)

In [18]:
%%time
# Build the TF-IDF vectorizer. Only the options below are spelled out; every
# other TfidfVectorizer parameter is left at its library default.
vectorizer = TfidfVectorizer(
    strip_accents=None,
    lowercase=True,
    stop_words='english',
    max_features=None,
)

# Fit on the corpus and rebind X to the resulting document-term matrix
# (until this point X held the raw text Series).
X = vectorizer.fit_transform(corpus)

CPU times: user 24.9 s, sys: 457 ms, total: 25.4 s
Wall time: 25.4 s


In [19]:
%%time
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version provides it, and
# convert its array to a plain list so later cells see the same type as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

CPU times: user 176 ms, sys: 3.7 ms, total: 180 ms
Wall time: 179 ms


In [20]:
%%time
stop_words = vectorizer.get_stop_words()

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 9.06 µs


In [21]:
feature_names;

In [22]:
len(feature_names)

215840

In [23]:
len(stop_words)

318

# resample class size w/ imbalanced learn

In [24]:
y.shape

(41748,)

In [25]:
X.shape

(41748, 215840)

In [26]:
%%time
# Balance the classes with a seeded random undersampler.
# RandomUnderSampler is already imported with the other imports at the top of
# the notebook, so it is not re-imported here.
rus = RandomUnderSampler(random_state=0)

CPU times: user 43 µs, sys: 14 µs, total: 57 µs
Wall time: 201 µs


In [27]:
%%time
# Undersample: X, y --> X_resampled, y_resampled
X_resampled, y_resampled = rus.fit_resample(X, y)

# Print the per-class row counts as sorted (label, count) tuples;
# in this case each class has 4139 rows.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0, 4139), (1, 4139)]
CPU times: user 28.6 ms, sys: 10.9 ms, total: 39.5 ms
Wall time: 38.4 ms


In [28]:
y_resampled.shape

(8278,)

In [29]:
X_resampled.shape

(8278, 215840)

In [30]:
%%time
#test, train, split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=50)

CPU times: user 12.5 ms, sys: 10.2 ms, total: 22.7 ms
Wall time: 22 ms


# Multinomial Naive Bayes/imbalanced learn/TFIDF vectorizer

In [31]:
# %%time

# multinm_clf = MultinomialNB()
# multinm_clf.fit(X_train, y_train)

In [32]:
#multinm_clf.class_count_

In [33]:
#y_pred = multinm_clf.predict(X_test)

In [34]:
# #clf.score = accuracy = 'true'(pos/neg) / total

# multinm_clf.score(X_test, y_test)

In [35]:
#confusion_matrix(y_true = y_test, y_pred = y_pred)

In [36]:
# tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
# (tn, fp, fn, tp)

In [37]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [38]:
# # rate of correctly predicted op-ed articles, out of all the actual op-ed articles

# recall = (tp) / (tp + fn)
# recall

In [39]:
# # rate of correct op-ed predictions out of all op-ed predictions

# precision = (tp) / (tp + fp)
# precision

In [40]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# print(f'accuracy = {accuracy}')
# recall = (tp) / (tp + fn)
# print(f'recall = {recall}')
# precision = (tp) / (tp + fp)
# print(f'precision = {precision}')

# random forest classifier

In [41]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [42]:
%%time

rf_clf = RandomForestClassifier(max_depth=2, random_state=0)

CPU times: user 43 µs, sys: 0 ns, total: 43 µs
Wall time: 46 µs


In [43]:
%%time

rf_clf.fit(X_train, y_train)

CPU times: user 372 ms, sys: 30.2 ms, total: 402 ms
Wall time: 402 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [44]:
y_pred = rf_clf.predict(X_test)

In [45]:
#clf.score = accuracy = 'true'(pos/neg) / total

rf_clf.score(X_test, y_test)

0.9415458937198068

In [46]:
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([[1003,   17],
       [ 104,  946]])

In [47]:
tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
(tn, fp, fn, tp)

(1003, 17, 104, 946)

In [48]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [49]:
# # rate of correctly predicted op-ed articles, out of all the actual op-ed articles

# recall = (tp) / (tp + fn)
# recall

In [50]:
# # rate of correct op-ed predictions out of all op-ed predictions

# precision = (tp) / (tp + fp)
# precision

In [51]:
# accuracy: share of all predictions that were correct
accuracy = (tn + tp) / (tn + fp + fn + tp)
# recall: share of the actual op-ed articles (label 1) that were predicted as op-ed
recall = tp / (fn + tp)
# precision: share of the op-ed predictions that really were op-ed
precision = tp / (fp + tp)

print(f'accuracy = {accuracy}')
print(f'recall = {recall}')
print(f'precision = {precision}')

accuracy = 0.9415458937198068
recall = 0.900952380952381
precision = 0.9823468328141225


In [52]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

# what is this random forest doing?
1. Takes X and y: my text as TF-IDF vectors, and the class labels (op-ed = 1, news = 0).
2. Takes a random sample of the training instances (TF-IDF vectors; the train split of the 8278 resampled rows) and uses a random subset of the 215840 features to make the best decision.
3. Bags/bootstraps that model.
4. Does it again a bunch of times (once per tree; `n_estimators=100` by default).

# accuracy, recall, precision

In [53]:
# #rate of correct predictions out of total predictions
# def metrics_(tn, fp, fn, tp):
#     accuracy = (tp + tn) / (tn + fn + tp + fp)
#     print(f'accuracy = {accuracy}')
#     recall = (tp) / (tp + fn)
#     print(f'recall = {recall}')
#     precision = (tp) / (tp + fp)
#     print(f'precision = {precision}')

In [54]:
#metrics_(tn, fp, fn, tp)

# feature engineering

In [55]:
# get bag of words
# get sparse matrix
# overlay bag of words onto sparse matrix
# argsort to find the most important (highest number) word that it's splitting on w/ most information gain / least entropy

In [56]:
len(feature_names)

215840

In [57]:
vocab = vectorizer.vocabulary_

In [58]:
feature_names

['00',
 '000',
 '0000',
 '0000001',
 '000001',
 '0000044',
 '00001',
 '0000797113',
 '0001',
 '0002',
 '00025',
 '0003',
 '0004',
 '0005',
 '0008',
 '000ers',
 '000s',
 '000th',
 '001',
 '0010',
 '0012',
 '0014',
 '0017',
 '001st',
 '002',
 '0020',
 '0025',
 '0026',
 '0028',
 '003',
 '0033',
 '0039',
 '004',
 '0042',
 '0046',
 '005',
 '0051',
 '006',
 '0064',
 '0065',
 '0069',
 '007',
 '0077',
 '007s',
 '008',
 '0083',
 '0086',
 '009',
 '00s',
 '01',
 '010',
 '0100',
 '0102',
 '011',
 '0115',
 '0116',
 '012',
 '013',
 '0139',
 '014',
 '015',
 '0150',
 '0153',
 '016',
 '0160',
 '017',
 '018',
 '0180',
 '0186',
 '019',
 '01am',
 '01kylb8mda',
 '02',
 '020',
 '0200',
 '0202',
 '0203',
 '021',
 '0211',
 '0216',
 '021672',
 '022',
 '0221',
 '0222',
 '023',
 '0230',
 '0236',
 '024',
 '0248',
 '025',
 '0251',
 '0255',
 '026',
 '027',
 '0271',
 '0277',
 '028',
 '0280',
 '029',
 '0292',
 '0299',
 '03',
 '030',
 '0302',
 '0303',
 '031',
 '031019',
 '0312',
 '0313',
 '0318',
 '032',
 '032c',
 '03

In [59]:
# TfidfVectorizer(*, 
#                 input='content', 
#                 encoding='utf-8', 
#                 decode_error='strict', 
#                 strip_accents=None, 
#                 lowercase=True, 
#                 preprocessor=None, 
#                 tokenizer=None, 
#                 analyzer='word', 
#                 stop_words=None, 
#                 token_pattern='(?u)\b\w\w+\b', 
#                 ngram_range=(1, 1), 
#                 max_df=1.0, 
#                 min_df=1, 
#                 max_features=None, 
#                 vocabulary=None, 
#                 binary=False, 
#                 dtype=<class 'numpy.float64'>, 
#                 norm='l2', 
#                 use_idf=True, 
#                 smooth_idf=True, 
#                 sublinear_tf=False)

# NLTK tokenization

In [60]:
from nltk.tokenize import word_tokenize

In [62]:
nltk_2019_df = _2019.copy()

In [64]:
#define X. X is currently pandas series of unsplit strings

X_nltk = nltk_2019_df.text

In [65]:
# define y as a series of op-ed or news

y_nktk = nltk_2019_df.type_of_material

In [67]:
%%time
# vectorize y in to (1, 0) (op-ed, news)

y = vectorize_type(y_nktk)

CPU times: user 12.9 ms, sys: 3.32 ms, total: 16.2 ms
Wall time: 15 ms


In [68]:
#turn series into list...

corpus_nltk = list(X_nltk)

## tokenize w/ nltk

In [None]:
%%time
# Lowercase each article, then split it into word tokens with NLTK.
tokenized = [word_tokenize(doc.lower()) for doc in corpus_nltk]