In [19]:
import pandas as pd
import numpy as np

import string, re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split


from nltk.stem import WordNetLemmatizer 

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the news articles; lines=True tells pandas that the file holds one
# JSON record per line rather than a single JSON document.
df = pd.read_json('news.json',lines=True)

In [3]:
# Show only the first row to inspect the column layout.
df.head(1)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26


In [4]:
# collect every distinct category label that appears in the dataset
categories = df['category'].unique().tolist()

In [5]:
# Display the list of unique category labels.
categories

['CRIME',
 'ENTERTAINMENT',
 'WORLD NEWS',
 'IMPACT',
 'POLITICS',
 'WEIRD NEWS',
 'BLACK VOICES',
 'WOMEN',
 'COMEDY',
 'QUEER VOICES',
 'SPORTS',
 'BUSINESS',
 'TRAVEL',
 'MEDIA',
 'TECH',
 'RELIGION',
 'SCIENCE',
 'LATINO VOICES',
 'EDUCATION',
 'COLLEGE',
 'PARENTS',
 'ARTS & CULTURE',
 'STYLE',
 'GREEN',
 'TASTE',
 'HEALTHY LIVING',
 'THE WORLDPOST',
 'GOOD NEWS',
 'WORLDPOST',
 'FIFTY',
 'ARTS',
 'WELLNESS',
 'PARENTING',
 'HOME & LIVING',
 'STYLE & BEAUTY',
 'DIVORCE',
 'WEDDINGS',
 'FOOD & DRINK',
 'MONEY',
 'ENVIRONMENT',
 'CULTURE & ARTS']

### Text Classification
Going to use the headline and the short description to try and predict the category.

In [6]:
# Example headline: the row with index label 0.
df.headline[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [7]:
# Example short description: the row with index label 0.
df.short_description[0]

'She left her husband. He killed their children. Just another day in America.'

The headline and the short description will be given the same importance. The two are combined into a single text field, which is then used to try to predict the category an article belongs to based on the words used.

In [8]:
# Combine the headline and the short description into a single text field,
# separated by a period. The headline column is deliberately left unmodified:
# appending '. ' to it in place would add another '. ' every time this cell
# is re-run.
df['text'] = df.headline + '. ' + df.short_description

In [9]:
# Check the combined headline + short description for the row with index label 0.
print(df.text[0])

There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America.


In [10]:
# features: the combined text of each article; labels: its category
data, target = df['text'], df['category']

Now, to get to the more important words in the descriptions, I am going to remove the stop words. Stop words are words like 'and', 'or', 'the', and many more that do not give any 'information' about what the article title/short description is describing.

If the word 'gun' appears in an article title, then the article is most likely about crime and not entertainment. If the word 'baseball' appears in an article, then it is most likely about sports. There are also interesting words like 'court', which could refer to a basketball court or to a court where trials in criminal and civil cases are conducted.

In [11]:
# tokens to discard: NLTK's English stop word list plus every ASCII
# punctuation character, held in a set for fast membership checks
stop_words = set(stopwords.words('english')) | set(string.punctuation)

In [12]:
# tokenize each article (split it into a list of individual words) and
# drop the stop words / punctuation tokens
def remove_stopwords(article_text):
    '''
    Tokenize a piece of text and drop stop words and punctuation.

    INPUT: article_text - a string of any kind
    OUTPUT: list of the lowercased tokens of article_text that are not in
            the module-level `stop_words` set (a token list, not a
            re-joined string)
    '''
    # lowercase each token once, then filter against the stop word set
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(article_text))
    return [token for token in lowered_tokens if token not in stop_words]
    
data_wo_stopwords = [remove_stopwords(article_text) for article_text in data]

In [13]:
# Compare the raw text of article 0 with its token list after stop word removal.
print('Normal:')
print(data[0]+'\n')  # the extra '\n' leaves a blank line before the next section
print('Tokenized + Removed Stop Words:')
print(data_wo_stopwords[0])

Normal:
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America.

Tokenized + Removed Stop Words:
['2', 'mass', 'shootings', 'texas', 'last', 'week', '1', 'tv', 'left', 'husband', 'killed', 'children', 'another', 'day', 'america']


In [14]:
# the total vocab is the set of every distinct token used across all articles
total_vocab = {
    token
    for article_tokens in data_wo_stopwords
    for token in article_tokens
}
len(total_vocab)

118709

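Going to lemmatize the text data in order to reduce the total vocab. Lemmatization is the process of collapsing different inflections of the same word into one base form. Organize, organizes, and organizing are all the same word but with different endings to describe if it is happening and when it is happening. The process of lemmatization removes these endings so that all 3 words are transformed into organize and are treated the same. Note that NLTK's `WordNetLemmatizer` treats every word as a noun unless a part-of-speech tag is supplied, so in the output below plurals are reduced ('shootings' becomes 'shooting', 'children' becomes 'child') while verb forms such as 'killed' are left unchanged.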

In [15]:
lemmatizer = WordNetLemmatizer()

# lemmatize every token of every article and re-join each article's tokens
# into a single space-separated string
lemmatized_data = [
    ' '.join(lemmatizer.lemmatize(token) for token in article_tokens)
    for article_tokens in data_wo_stopwords
]

In [16]:
# Show article 0 at each preprocessing stage: raw text, tokens without
# stop words, and the lemmatized string.
print('Normal:')
print(data[0],'\n')
print('Tokenized + Removed Stop Words:')
print(data_wo_stopwords[0],'\n')
print('Lemmatized:')
print(lemmatized_data[0])

Normal:
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV. She left her husband. He killed their children. Just another day in America. 

Tokenized + Removed Stop Words:
['2', 'mass', 'shootings', 'texas', 'last', 'week', '1', 'tv', 'left', 'husband', 'killed', 'children', 'another', 'day', 'america'] 

Lemmatized:
2 mass shooting texas last week 1 tv left husband killed child another day america


In [17]:
# model inputs (lemmatized article strings) and labels (categories)
X, y = lemmatized_data, target

In [20]:
# Hold out 20% of the articles for testing; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=99)

In [21]:
tfidf = TfidfVectorizer()
# Fit the vectorizer on the training articles only, then reuse the fitted
# vectorizer to transform the test articles so both matrices share the
# same columns.
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)

In [22]:
# Display the matrix summary (shape, dtype, number of stored elements).
tfidf_train

<160682x71956 sparse matrix of type '<class 'numpy.float64'>'
	with 2591387 stored elements in Compressed Sparse Row format>

From this we can see that this is an extremely sparse matrix: almost all of its values are 0, meaning that each word appears in only a small number of documents.

In [24]:
# tfidf_train has one row per training article and one column per term
n_articles, n_terms = tfidf_train.shape

# stored (non-zero) entries divided by the number of rows
non_zero_cols = tfidf_train.nnz / n_articles
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

# share of an average row that is zero, as a fraction between 0 and 1
percent_sparse = 1 - non_zero_cols / n_terms
print('Percentage of columns containing ZERO: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 16.127425598386875
Percentage of columns containing ZERO: 0.9997758710100841


Going to look at which words are associated with each category. These are the words that are most frequent within the documents of each category. As an example, I will do this for just 5 categories.

In [26]:
# one filtered DataFrame per example category, in the order listed
df_crime, df_entertainment, df_sports, df_tech, df_politics = (
    df[df.category == label]
    for label in ('CRIME', 'ENTERTAINMENT', 'SPORTS', 'TECH', 'POLITICS')
)

In [27]:
# pull the combined text column out of each category's DataFrame
crime_data, entertainment_data, sports_data, tech_data, politics_data = (
    category_frame.text
    for category_frame in (df_crime, df_entertainment, df_sports, df_tech, df_politics)
)

In [29]:
# same stop word removal as before, applied to each category's articles
crime_clean, entertainment_clean, sports_clean, tech_clean, politics_clean = (
    [remove_stopwords(article_text) for article_text in category_texts]
    for category_texts in (crime_data, entertainment_data, sports_data, tech_data, politics_data)
)

In [34]:
# each *_clean value is a list of token lists; word frequencies need one
# flat list of tokens per category
crime_flat, entertainment_flat, sports_flat, tech_flat, politics_flat = (
    [token for article_tokens in category_articles for token in article_tokens]
    for category_articles in (crime_clean, entertainment_clean, sports_clean, tech_clean, politics_clean)
)

In [None]:
# now we will use frequency distribution to identify the w