# Welcome to the NLP Terror-Drome

In [9]:
from pprint import pprint

import os
import pandas as pd
import numpy as np
import re

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
#This will get our TF-IDF score without having to find out TF or IDF on our own
from sklearn.feature_extraction.text import TfidfVectorizer
#For making the Test-Train split to measure how well the model performed
from sklearn.model_selection import train_test_split
#We're going to use the Decision Tree alogrithm to classify the content 
from sklearn.tree import DecisionTreeClassifier
#We will use these tools to measure how well we classified the categories based on the content
from sklearn.metrics import classification_report, accuracy_score

In [6]:
#We are going basic clean, like almost always
#Went with lemmatize. Don't have the a good reason to say why.
from prepare import basic_clean, lemmatize

#That's the function that gets articles from InSHort.com. We've already got a csv it's pulling from.
from acquire import get_articles

In [3]:
#This is in acquire now. There were a lot of numbers in the words/features.
def remove_numbers(text):
    text = re.sub(r"[0-9]", '', text)
    return text

In [7]:
def get_articles():
    filename = 'news_articles.csv'
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        return get_articles_by_category()

In [10]:
df = get_articles()
#The index column was annoying.
df.set_index('Unnamed: 0',inplace=True)

In [11]:
df

Unnamed: 0_level_0,author,category,content,published_date,title
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Krishna Veera Vanamali,business,After a US jury found that Elon Musk did not d...,2019-12-07T16:34:59.000Z,My faith in humanity is restored: Musk after w...
1,Krishna Veera Vanamali,business,"British cave explorer Vernon Unsworth, who los...",2019-12-07T15:31:16.000Z,I'll take it on the chin: Cave explorer after ...
2,Dharna,business,A Lucknow-based customer has filed an FIR agai...,2019-12-09T08:17:05.000Z,FIR filed against Club Factory in Lucknow for ...
3,Krishna Veera Vanamali,business,Congress leader Shashi Tharoor said he wants a...,2019-12-08T01:30:55.000Z,I want a 'New India' where Bajaj can speak fea...
4,Krishna Veera Vanamali,business,Finance Minister Nirmala Sitharaman on Saturda...,2019-12-07T12:36:59.000Z,Sitharaman hints at personal income tax rate c...
5,Pragya Swastik,business,Price of onion has shot up to ₹200 per kg in B...,2019-12-08T08:44:31.000Z,Onion prices surge up to ₹200 per kg in Bengaluru
6,Pragya Swastik,business,Former RBI Governor Raghuram Rajan in an artic...,2019-12-08T10:40:17.000Z,Ideas come from a small set of people around P...
7,Pragya Swastik,business,The Uttar Pradesh Cabinet on Monday approved Z...,2019-12-09T09:35:55.000Z,UP Cabinet approves Zurich Airport Int'l as de...
8,Pragya Swastik,business,Tata Sons Chairman Emeritus Ratan Tata in an i...,2019-12-09T10:34:12.000Z,I wanted to buy a proper piano until I saw the...
9,Krishna Veera Vanamali,business,A panel set up to increase GST collections is ...,2019-12-08T02:30:26.000Z,GST panel considers raising 5% slab to 6%: Rep...


SAMPLE BEFORE USING THE ENTIRE CORPUS OF DOCUMENTS

In [13]:
document = df.content[0]

document

'after a u jury found that elon musk did not defame british cave explorer vernon unsworth by calling him a pedo guy on twitter the tesla ceo said my faith in humanity is restored the 48yearold billionaire argued that he did not intend to call unsworth a paedophile but instead wa using pedo guy to mean creepy old guy'

In [14]:
document = basic_clean(document)

document = lemmatize(document)

words = pd.Series(document.split())

In [21]:
#The tf-idf sklearn object that will derive the tf-idf score of each word/feature. 
tfidf = TfidfVectorizer() 

In [40]:
#A little sampling to see how these modules and functions work.
just_content = df.content

just_content = just_content.apply(basic_clean)

just_content = just_content.apply(lemmatize)

just_content = just_content.apply(remove_numbers)

tfidfs = tfidf.fit_transform(just_content.values)

In [41]:
#This will be a DataFrame with each word/feature as a column.
#It'll be mostly zeros, but the ones with any sort of number have a tf-idf above zero.
#Meaning they are frequent within the document(row#) yet uncommon among the corpus(all rows)
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,aaryan,ab,abbey,ability,able,about,abstoxic,abuse,accent,accept,...,young,younger,your,yourself,yourstory,zero,zilingo,zima,zomato,zurich
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
1,0.000000,0.000000,0.000000,0.0,0.000000,0.101990,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
4,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
5,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
6,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000
7,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.15348
8,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.142724,0.000000,0.0,0.0000,0.0,0.00000
9,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.0,0.00000


In [59]:
#I thought we had 100 rows? 20 a piece. Oh Well.
df.shape

(99, 5)

In [43]:
#Preparing the content column with our Prepare.py functions
df.content = df.content.apply(remove_numbers)
df.content = df.content.apply(basic_clean)
df.content = df.content.apply(lemmatize)

In [45]:
#X is is the Feature Matrix of TF-IDF scores per word.
X = tfidf.fit_transform(df.content)
#Target variable is the category. We want to predict each article's category by using the words/features in its content
y = df.category

In [48]:
#Split them up using the sklean-testTrain split funciton
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [60]:
#TWe will make fresh DataFrames for the train and test. To start we will give them the actual category values as the
#only column
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [89]:
#STEP 1-2 of the SK-LEARN API:
# 1_Call the object, 2_Fit the object to the training(feature matrix, target_variable)

#I have no reason to pick 4 as the max_depth. Lets try to overfit this thing.

#Max_Depth of 7 had the best test accuracy.
tree = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)

In [90]:
# 3_Transform/Predict is assigned to a column in the Train and Test dataframes. 
#They should now be 2 columns wide. ['actual','predicted']
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

Accuracy Results

    They're pretty bad!

In [91]:
#How well our model fit to the training data predicts the very training data it originated from.
#TRAIN
#Just the flat accuracy of each prediction. Correct Predictions/All Predictions
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
#A matrix that shows where the prediction compare to what they should acutally be
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
#A readout of precision and recall.
print(classification_report(train.actual, train.predicted))

Accuracy: 70.89%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
business              5              0       0           0
entertainment         0             15       2           0
sports                2              3      18           2
technology           12              2       0          18
---
               precision    recall  f1-score   support

     business       1.00      0.26      0.42        19
entertainment       0.88      0.75      0.81        20
       sports       0.72      0.90      0.80        20
   technology       0.56      0.90      0.69        20

     accuracy                           0.71        79
    macro avg       0.79      0.70      0.68        79
 weighted avg       0.79      0.71      0.68        79



Looks like our model really wants to assign everything to the technology category.

Perhaps that specific category deserves some exploration.

In [92]:
#TEST
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 45.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
predicted                                                 
entertainment         1              1       0           0
sports                1              3       4           1
technology            3              1       1           4
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         5
entertainment       0.50      0.20      0.29         5
       sports       0.44      0.80      0.57         5
   technology       0.44      0.80      0.57         5

     accuracy                           0.45        20
    macro avg       0.35      0.45      0.36        20
 weighted avg       0.35      0.45      0.36        20



  'precision', 'predicted', average, warn_for)


Once again, our predictions want to say everything is 'technology'

It also rarely wants to predict business. Goes heavy on the sports and technology.

# NGRAMS. Let's try them

In [93]:
from nltk import ngrams

In [94]:
document

'after a u jury found that elon musk did not defame british cave explorer vernon unsworth by calling him a pedo guy on twitter the tesla ceo said my faith in humanity is restored the 48yearold billionaire argued that he did not intend to call unsworth a paedophile but instead wa using pedo guy to mean creepy old guy'

In [106]:
bigrams = ngrams(document.split(), 2)

In [107]:
bigrams_in_document = pd.Series(bigrams)

In [108]:
bigrams_in_document

0                   (after, a)
1                       (a, u)
2                    (u, jury)
3                (jury, found)
4                (found, that)
5                 (that, elon)
6                 (elon, musk)
7                  (musk, did)
8                   (did, not)
9                (not, defame)
10           (defame, british)
11             (british, cave)
12            (cave, explorer)
13          (explorer, vernon)
14          (vernon, unsworth)
15              (unsworth, by)
16               (by, calling)
17              (calling, him)
18                    (him, a)
19                   (a, pedo)
20                 (pedo, guy)
21                   (guy, on)
22               (on, twitter)
23              (twitter, the)
24                (the, tesla)
25                (tesla, ceo)
26                 (ceo, said)
27                  (said, my)
28                 (my, faith)
29                 (faith, in)
30              (in, humanity)
31              (humanity, is)
32      