In [73]:
import pandas as pd
import numpy as np

import string, re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from nltk.stem import WordNetLemmatizer 

import warnings
warnings.filterwarnings('ignore')

In [47]:
df = pd.read_json('news.json',lines=True)

In [48]:
df.head(1)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26


In [49]:
# list of all categories that the articles are categorized by
categories = list(df.category.unique())

In [70]:
categories

['CRIME',
 'ENTERTAINMENT',
 'WORLD NEWS',
 'IMPACT',
 'POLITICS',
 'WEIRD NEWS',
 'BLACK VOICES',
 'WOMEN',
 'COMEDY',
 'QUEER VOICES',
 'SPORTS',
 'BUSINESS',
 'TRAVEL',
 'MEDIA',
 'TECH',
 'RELIGION',
 'SCIENCE',
 'LATINO VOICES',
 'EDUCATION',
 'COLLEGE',
 'PARENTS',
 'ARTS & CULTURE',
 'STYLE',
 'GREEN',
 'TASTE',
 'HEALTHY LIVING',
 'THE WORLDPOST',
 'GOOD NEWS',
 'WORLDPOST',
 'FIFTY',
 'ARTS',
 'WELLNESS',
 'PARENTING',
 'HOME & LIVING',
 'STYLE & BEAUTY',
 'DIVORCE',
 'WEDDINGS',
 'FOOD & DRINK',
 'MONEY',
 'ENVIRONMENT',
 'CULTURE & ARTS']

### Text Classfication
Going to use the headline and the short description to try and predict the category.

In [50]:
df.headline[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [51]:
df.short_description[0]

'She left her husband. He killed their children. Just another day in America.'

Going to use the headline and the short description as the same importance. Combining the two will then be used to try and predict the category that it belongs to based on the words used.

In [52]:
# adding a period at the end of the headline to be combined
#   with the short description
df.headline = df.headline+'. '
# combining the two together
df['text'] = df.headline + df.short_description

In [53]:
print(df.text[0])

There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America.


In [54]:
data = df.text
target = df.category

Now to get the more important words in the descriptions I am going to remove the stop words. Stop words are words like and, or, the, and many more that do not give any 'information' on what that article title/short description is describing.

If the word 'gun' appears in an article title then the article is most likely about crime and not entertainment. If the word 'baseball' appears in an article then it is most likely about sports. There are interesting words like 'court' where it could be referring to a basketball court or a court where trails in criminal and civil cases are conducted.

In [60]:
# going to filter out stopwords and punctions to get root words
stop_words = stopwords.words('english')
stop_words += list(string.punctuation)
stop_words = set(stop_words)

In [68]:
# going to remove the stopwords from the data
# splitting data into tokens meaning a list of individual words
def remove_stopwords(article_text):
    '''
    INPUT: Text of any kind
    OUTPUT: Text with all stop words removed
    '''
    tokens = nltk.word_tokenize(article_text)
    article_without_stopwords = [token.lower() for token in tokens if token.lower() not in stop_words]
    return article_without_stopwords
    
data_wo_stopwords = list(map(remove_stopwords,data))

In [84]:
print('Normal:')
print(data[0]+'\n')
print('Tokenized + Removed Stop Words:')
print(data_wo_stopwords[0])

Normal:
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America.

Tokenized + Removed Stop Words:
['2', 'mass', 'shootings', 'texas', 'last', 'week', '1', 'tv', 'left', 'husband', 'killed', 'children', 'another', 'day', 'america']


In [71]:
# the total vocab is a list of all words used in every article
total_vocab = set()
for word in data_wo_stopwords:
    total_vocab.update(word)
len(total_vocab)

118709

Going to lemmatize the text data in order to reduce the total vocab. Lemmatization is the process of removing different conguations of the same word. Organize, organizes, and organizing are all the same word but with different endings to describe if it is happening and when it is happening. The process of lemmatization will remove these so that all 3 words will transform just into organize and will be treated the same.

In [78]:
lemmatized_data = []

lemmatizer = WordNetLemmatizer()

for article in data_wo_stopwords:
    lemmatized_article = ' '.join([lemmatizer.lemmatize(word) for word in article])
    lemmatized_data.append(lemmatized_article)

In [87]:
print('Normal:')
print(data[0],'\n')
print('Tokenized + Removed Stop Words:')
print(data_wo_stopwords[0],'\n')
print('Lemmatized:')
print(lemmatized_data[0])

Normal:
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America. 

Tokenized + Removed Stop Words:
['2', 'mass', 'shootings', 'texas', 'last', 'week', '1', 'tv', 'left', 'husband', 'killed', 'children', 'another', 'day', 'america'] 

Lemmatized:
2 mass shooting texas last week 1 tv left husband killed child another day america


In [88]:
X = lemmatized_data
y = target