In [105]:
import nltk

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
from collections import Counter

In [4]:
from imblearn.under_sampling import RandomUnderSampler

# NLP Pipeline 

![](images/pipeline-walkthrough1.png)

Below is a to do list when converting text into vector form: 

**Clean text and Create a Bag of Words (BoW)**
>1. Lowercase the text
2. Tokenize 
3. Strip out punctuation or undesirable text
4. Remove Stopwords 
5. Stemming or Lemmatizing
6. Compute N-Grams
7. Use this to create BoW

**Vectorize BoW**
>8. Term Frequencies
9. Document Frequencies
10. TF-IDF
11. Normalize vectors

Let's go through both what each of these steps are and how to do them in python with the following corpus of comments about data science...
 

In [5]:
#funtion to cut off extra intro paragraphs from beautiful soup scrape

# def trim_fat(string):
#     return string[35:-115]

In [6]:
# function to vectorize the type_of_material series into a y target vector.
def vectorize_type(ser):
    y = ser.copy()
    y.replace({'Op-Ed': 1,'News': 0}, inplace=True)
    return y

In [7]:
#rate of correct predictions out of total predictions
def metrics_(tn, fp, fn, tp):
    accuracy = (tp + tn) / (tn + fn + tp + fp)
    print(f'accuracy = {accuracy}')
    recall = (tp) / (tp + fn)
    print(f'recall = {recall}')
    precision = (tp) / (tp + fp)
    print(f'precision = {precision}')

In [8]:
%%time
_2019 = pd.read_csv('data/trim2019_text_type.csv', index_col='Unnamed: 0')

CPU times: user 1.51 s, sys: 254 ms, total: 1.77 s
Wall time: 1.77 s


In [9]:
%%time
_2019_df = _2019.copy()

CPU times: user 749 µs, sys: 171 µs, total: 920 µs
Wall time: 920 µs


In [10]:
#define X. X is currently pandas series of unsplit strings

X = _2019_df.text

In [11]:
type(X[0])

str

In [12]:
type(X)

pandas.core.series.Series

In [13]:
# define y as a series of op-ed or news

y = _2019_df.type_of_material

In [14]:
%%time
# vectorize y in to (1, 0) (op-ed, news)

y = vectorize_type(y)

CPU times: user 11.6 ms, sys: 1.95 ms, total: 13.5 ms
Wall time: 12.7 ms


In [15]:
#turn series into list...

corpus = list(X)

# sklearn TfidfVectorizer(stop_words='english', strip_accents='ascii')

In [16]:
%%time
#create vectorizer

vectorizer = TfidfVectorizer(#input='content', 
#                 encoding='utf-8', 
#                 decode_error='strict', 
                 strip_accents=None, 
                 lowercase=True, 
#                 preprocessor=None, 
#                 tokenizer=None, 
#                 analyzer='word', 
                 stop_words='english', 
#                 token_pattern='(?u)\b\w\w+\b', 
#                 ngram_range=(1, 1), 
#                 max_df=1.0, 
#                 min_df=1, 
                 max_features=None, 
#                 vocabulary=None, 
#                 binary=False, 
#                 dtype=<class 'numpy.float64'>, 
#                 norm='l2', 
#                 use_idf=True, 
#                 smooth_idf=True, 
#                 sublinear_tf=False
)
X = vectorizer.fit_transform(corpus)

CPU times: user 25 s, sys: 472 ms, total: 25.4 s
Wall time: 25.5 s


In [17]:
%%time
feature_names = vectorizer.get_feature_names()

CPU times: user 176 ms, sys: 2.95 ms, total: 179 ms
Wall time: 178 ms


In [18]:
%%time
stop_words = vectorizer.get_stop_words()

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 8.82 µs


In [118]:
feature_names;

In [20]:
len(feature_names)

215840

In [21]:
len(stop_words)

318

# resample class size w/ imbalanced learn

In [22]:
y.shape

(41748,)

In [23]:
X.shape

(41748, 215840)

In [24]:
%%time
#balance the classes

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

CPU times: user 23 µs, sys: 1e+03 ns, total: 24 µs
Wall time: 26 µs


In [25]:
%%time
#X, y --> X_resampled, y_resampled
X_resampled, y_resampled = rus.fit_resample(X, y)


#return a list of tuples for item, and count of item. in this case 4139 each
print(sorted(Counter(y_resampled).items()))

[(0, 4139), (1, 4139)]
CPU times: user 28.2 ms, sys: 11 ms, total: 39.2 ms
Wall time: 38.3 ms


In [26]:
y_resampled.shape

(8278,)

In [27]:
X_resampled.shape

(8278, 215840)

In [28]:
%%time
#test, train, split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=50)

CPU times: user 12.7 ms, sys: 10.1 ms, total: 22.9 ms
Wall time: 22 ms


# Multinomial Naive Bayes/imbalanced learn/TFIDF vectorizer

In [29]:
# %%time

# multinm_clf = MultinomialNB()
# multinm_clf.fit(X_train, y_train)

In [30]:
#multinm_clf.class_count_

In [31]:
#y_pred = multinm_clf.predict(X_test)

In [32]:
# #clf.score = accuracy = 'true'(pos/neg) / total

# multinm_clf.score(X_test, y_test)

In [33]:
#confusion_matrix(y_true = y_test, y_pred = y_pred)

In [34]:
# tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
# (tn, fp, fn, tp)

In [35]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [36]:
# # rate of actual op-ed articles, out of all the actual od-ed articles

# recall = (tp) / (tp + fn)
# recall

In [37]:
# # rate of correct predictions of op-ed articles out of all predictions

# precision = (tp) / (tp + fp)
# precision

In [38]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# print(f'accuracy = {accuracy}')
# recall = (tp) / (tp + fn)
# print(f'recall = {recall}')
# precision = (tp) / (tp + fp)
# print(f'precision = {precision}')

# random forest classifier

In [39]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

In [226]:
%%time

rf_clf = RandomForestClassifier(max_depth=2, random_state=0)

CPU times: user 41 µs, sys: 0 ns, total: 41 µs
Wall time: 44.1 µs


In [227]:
%%time

rf_clf.fit(X_train, y_train)

CPU times: user 448 ms, sys: 53.7 ms, total: 502 ms
Wall time: 504 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [228]:
y_pred = rf_clf.predict(X_test)

In [229]:
#clf.score = accuracy = 'true'(pos/neg) / total

rf_clf.score(X_test, y_test)

0.9289855072463769

In [230]:
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([[972,  48],
       [ 99, 951]])

In [231]:
tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
(tn, fp, fn, tp)

(972, 48, 99, 951)

In [232]:
# #rate of correct predictions out of total predictions

# accuracy = (tp + tn) / (tn + fn + tp + fp)
# accuracy

In [233]:
# # rate of actual op-ed articles, out of all the actual od-ed articles

# recall = (tp) / (tp + fn)
# recall

In [234]:
# # rate of correct predictions of op-ed articles out of all predictions

# precision = (tp) / (tp + fp)
# precision

In [235]:
#rate of correct predictions out of total predictions

accuracy = (tp + tn) / (tn + fn + tp + fp)
print(f'accuracy = {accuracy}')
recall = (tp) / (tp + fn)
print(f'recall = {recall}')
precision = (tp) / (tp + fp)
print(f'precision = {precision}')

accuracy = 0.9289855072463769
recall = 0.9057142857142857
precision = 0.9519519519519519


In [50]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, 
#                                               *, 
#                                               criterion='gini', 
#                                               max_depth=None, 
#                                               min_samples_split=2, 
#                                               min_samples_leaf=1, 
#                                               min_weight_fraction_leaf=0.0, 
#                                               max_features='auto', 
#                                               max_leaf_nodes=None, 
#                                               min_impurity_decrease=0.0, 
#                                               min_impurity_split=None, 
#                                               bootstrap=True, 
#                                               oob_score=False, 
#                                               n_jobs=None, 
#                                               random_state=None, 
#                                               verbose=0, 
#                                               warm_start=False, 
#                                               class_weight=None, 
#                                               ccp_alpha=0.0, 
#                                               max_samples=None)

# NLTK shit

In [59]:
nltk_2019_df = _2019.copy()

In [60]:
#define X. X is currently pandas series of unsplit strings

X_nltk = nltk_2019_df.text

In [61]:
# define y as a series of op-ed or news

y_nktk = nltk_2019_df.type_of_material

In [211]:
%%time
# vectorize y in to (1, 0) (op-ed, news)

y_nltk = vectorize_type(y_nktk)

CPU times: user 11.5 ms, sys: 4.58 ms, total: 16.1 ms
Wall time: 17.7 ms


In [112]:
#turn series into list...

corpus_nltk = list(X_nltk)

### tokenize w/ nltk

In [120]:
%%time
from nltk.tokenize import RegexpTokenizer
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")
tokenized_punc = [regex_tokenizer.tokenize(article.lower())for article in corpus_nltk]

CPU times: user 14.3 s, sys: 1.96 s, total: 16.2 s
Wall time: 16.4 s


In [126]:
(len(tokenized_punc[0])) #2218

2218

In [128]:
len(tokenized_punc)

41748

In [71]:
from nltk.tokenize import word_tokenize

In [114]:
type(word_tokenize)

function

In [129]:
# %%time
# tokenized = [word_tokenize(article.lower()) for article in corpus_nltk]

# CPU times: user 4min 46s, sys: 2.39 s, total: 4min 48s
# Wall time: 4min 50s

In [130]:
# #list of list of strings. one list of strings per documents. list are various lengths around 1000

# len(tokenized[0]) #2596

In [78]:
from nltk.corpus import stopwords

## take out stop work via ntlk. does this work against sklearn when i vectorize

In [157]:
%%time
stop = set(stopwords.words('english'))
tokenized_docs = [[word for word in words if word not in stop]
            for words in tokenized_punc]

CPU times: user 4.05 s, sys: 1.09 s, total: 5.14 s
Wall time: 5.28 s


In [158]:
#hopefully this reduced the number of strings / list

len(tokenized[1])

1132

In [159]:
#docs is new tokenized, but with stop words removed

len(tokenized_docs)

41748

# stemming/lemmatization

In [160]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [161]:
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

In [162]:
# %%time
# docs_porter = [[porter.stem(word) for word in words]
#                    for words in docs]
# docs_snowball = [[snowball.stem(word) for word in words]
#                      for words in docs]
# docs_wordnet = [[wordnet.lemmatize(word) for word in words]
#                     for words in docs]

# CPU times: user 14min 59s, sys: 18.4 s, total: 15min 18s
# Wall time: 15min 26s

In [163]:
# %%time
# docs_porter = [[porter.stem(word) for word in words]
#                    for words in docs]

# CPU times: user 7min 16s, sys: 5.21 s, total: 7min 21s
# Wall time: 7min 22s

In [169]:
%%time
snowball_stemm = [[snowball.stem(word) for word in words]
                     for words in tokenized_docs]

# CPU times: user 5min 5s, sys: 5.98 s, total: 5min 11s
# Wall time: 5min 13s

CPU times: user 5min, sys: 1.24 s, total: 5min 1s
Wall time: 5min 1s


In [170]:
# %%time
# docs_wordnet = [[wordnet.lemmatize(word) for word in words]
#                     for words in docs]

# CPU times: user 1min 24s, sys: 4.5 s, total: 1min 28s
# Wall time: 1min 30s

In [171]:
# %%time
# ## Print the stemmed and lemmatized words from the first document
# print("%16s %16s %16s %16s" % ("word", "porter", "snowball", "lemmatizer"))
# for i in range(min(len(docs_porter[0]), len(docs_snowball[0]), len(docs_wordnet[0]))):
#     p, s, w = docs_porter[0][i], docs_snowball[0][i], docs_wordnet[0][i]
#     if len(set((p, s, w))) != 1:
#         print("%16s %16s %16s %16s" % (docs[0][i], p, s, w))
#         print(docs[0][i], w)


In [172]:
#docs and lemmatizer are the same?

# I choose SNOWBALL!!!! to sklearn

In [183]:
type(docs_snowball_stemm[0])

list

In [198]:
type(corpus_nltk)

list

In [200]:
type(corpus_nltk[0])

str

In [202]:
# snowball = SnowballStemmer('english')
# snowball_tokenized = [snowball.stem(word) for word in word_tokenize(doc.lower())]

In [208]:
%%time
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

def snowball_tokenize(doc):
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in word_tokenize(doc.lower())]

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 16 µs


In [239]:
%%time
#create vectorizer

vectorizer = TfidfVectorizer(#input='content', 
#                 encoding='utf-8', 
#                 decode_error='strict', 
                 strip_accents='ascii', 
                 lowercase=True, 
#                 preprocessor=None, 
                 tokenizer=snowball_tokenize, 
#                 analyzer='word', 
                 stop_words='english', 
#                 token_pattern='(?u)\b\w\w+\b', 
#                 ngram_range=(1, 1), 
#                 max_df=1.0, 
#                 min_df=1, 
                 max_features=10000, 
#                 vocabulary=None, 
#                 binary=False, 
#                 dtype=<class 'numpy.float64'>, 
#                 norm='l2', 
#                 use_idf=True, 
#                 smooth_idf=True, 
#                 sublinear_tf=False
)



CPU times: user 45 µs, sys: 1 µs, total: 46 µs
Wall time: 47.9 µs


In [255]:
# %%time
# X_snowball = vectorizer.fit_transform(corpus_nltk)

# CPU times: user 12min 37s, sys: 2.61 s, total: 12min 39s
# Wall time: 12min 31s w/ 10,000 features

In [241]:
%%time
#balance the classes

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

CPU times: user 44 µs, sys: 191 µs, total: 235 µs
Wall time: 240 µs


In [242]:
%%time
#X, y --> X_resampled, y_resampled
X_resampled, y_resampled = rus.fit_resample(X_snowball, y_nltk)
print(sorted(Counter(y_resampled).items()))

[(0, 4139), (1, 4139)]
CPU times: user 22.5 ms, sys: 11.7 ms, total: 34.2 ms
Wall time: 32.7 ms


In [243]:
%%time
#test, train, split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=50)

CPU times: user 10.4 ms, sys: 11.3 ms, total: 21.8 ms
Wall time: 20.4 ms


In [244]:
%%time

rf_clf = RandomForestClassifier(max_depth=2, random_state=0)

CPU times: user 54 µs, sys: 0 ns, total: 54 µs
Wall time: 57.2 µs


In [245]:
%%time

rf_clf.fit(X_train, y_train)

CPU times: user 414 ms, sys: 32.7 ms, total: 446 ms
Wall time: 446 ms


RandomForestClassifier(max_depth=2, random_state=0)

In [246]:
y_pred = rf_clf.predict(X_test)

In [247]:
#clf.score = accuracy = 'true'(pos/neg) / total

rf_clf.score(X_test, y_test)

0.9516908212560387

In [254]:
tn, fp, fn, tp = confusion_matrix(y_true = y_test, y_pred = y_pred).ravel()
(tn, fp, fn, tp)
metrics_(tn, fp, fn, tp)
print(f'tn={tn}, fp={fp}, fn={fn}, tp={tp})')

accuracy = 0.9516908212560387
recall = 0.9095238095238095
precision = 0.9947916666666666
tn=1015, fp=5, fn=95, tp=955)


In [249]:
nltk_features = vectorizer.get_feature_names()

In [250]:
nltk_stop = vectorizer.get_stop_words()

In [258]:
type(nltk_features)

list

In [252]:
len(nltk_stop)

318

In [256]:
X_snowball.shape

(41748, 10000)

In [259]:
# visualizing the bag of words
columns = nltk_features
df = pd.DataFrame(X_snowball, columns=columns); df

ValueError: Shape of passed values is (41748, 1), indices imply (41748, 10000)