This notebook preprocesses the dataset for input into our ML models, using spaCy for tokenization, lemmatization, and part-of-speech (PoS) tagging.

In [2]:
from __future__ import unicode_literals, print_function

import tld
import spacy
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import utils

# Load spaCy's English model. The returned pipeline is not bound to a name in
# this notebook. NOTE(review): presumably this is a warm-up / availability
# check and utils loads its own pipeline -- confirm.
spacy.load('en')

# Enable the autoreload extension so edits to local modules can be picked up
# without restarting the kernel.
%load_ext autoreload

In [13]:
# Trigger a reload of already-imported modules, then re-import utils so this
# cell sees the latest edits to the utils module.
%autoreload
import utils

### Pre-pre-processing
Read and clean article text for the types of articles we care about: those that Alchemy has labeled as political and that we have labeled as either "conservative" or "liberal".  We'll also extract the top-level domain for later.  The output here is a pandas DataFrame.

In [14]:
# Read data, excluding satirical because there are too few samples.

# Directory of crawled articles, relative to this notebook's working directory.
datadir = '../../news-crawler/data/articles/'
# Any path matching the regex '.*satirical' is left out of the file list.
files = utils.get_file_list(datadir, exclude_regex='.*satirical')

# Sanity check: how many article files were found.
print('Number of articles: {}'.format(len(files)))

Number of articles: 67292


In [16]:
%%time

# Use multiprocessing to pre-pre-process.
# Builds the working DataFrame `df` from the listed files; the shape is printed
# so the row count can be compared with the number of files found.

df = utils.create_dataframe(files)
print(df.shape)

(67292, 3)
CPU times: user 156 ms, sys: 88 ms, total: 244 ms
Wall time: 26.7 s


In [17]:
# Take a peek!
# Show the first five rows of the freshly built frame.

df.head()

Unnamed: 0,title,label,url
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...


In [18]:
# Extract top-level domain for later.
# tld.get_tld is applied to every URL; in the displayed rows the result is the
# registered domain (e.g. 'newsmax.com'), stored in a new 'domain' column.

df['domain'] = df['url'].map(tld.get_tld)
df.head()

Unnamed: 0,title,label,url,domain
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com


In [19]:
# Checkpoint the frame to disk so the spaCy stage below can start from this file.
df.to_pickle('../data/titles.pkl')

### Pre-processing using spacy NLP

__Lemmatize the text and tag words with their part of speech.__

In [20]:
# Reload the checkpoint written by the pre-pre-processing stage and confirm its shape.
df = pd.read_pickle('../data/titles.pkl')
df.shape

(67292, 4)

In [21]:
# We'll keep an article only if it has at least min_sents sentences.
# With 0, that rule lets every article through.
min_sents = 0     

# Whether to keep stopwords (True keeps them).
# NOTE(review): the exact handling lives in utils.parse_docs -- confirm.
keep_stops = True

In [22]:
#%%time

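# Tokenize the text (disabled: this parsed the raw titles in place; tokenization now happens in a later cell on df['tokenized']).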

#df['title'] = utils.parse_docs(list(df['title']), keep_stops, min_sents)

In [23]:
# Preview the frame before domain hints are removed.
df.head()

Unnamed: 0,title,label,url,domain
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com


In [24]:
%%time

# Remove domain identifiers from the text.
# utils.remove_hints returns the frame that replaces `df`; the rows it prints
# look like "domain,count,removed string".
# NOTE(review): the next cell reads df['tokenized'], so this call presumably
# adds that column -- confirm in utils.

df = utils.remove_hints(df)

cnn.com,7610,
usatoday.com,4817,
cbsnews.com,4576,
go.com,4393,Video:
go.com,4393,
washingtonpost.com,4030,
breitbart.com,3740,- Breitbart
breitbart.com,3740,
nytimes.com,3711,
cnbc.com,3185,
newsmax.com,2530,
foxnews.com,2436,
rightwingnews.com,2356,| John Hawkins' Right Wing News
rightwingnews.com,2356,
washingtonexaminer.com,2040,
realclearpolitics.com,2002,| RealClearPolitics
realclearpolitics.com,2002,
westernjournalism.com,1960,Just
westernjournalism.com,1960,Trump
westernjournalism.com,1960,
washingtontimes.com,1712,
dailywire.com,1537,
weeklystandard.com,1496,
reuters.com,1463,
theatlantic.com,1259,
huffingtonpost.com,1215,
nypost.com,1144,
ijr.com,856,to
ijr.com,856,
ap.org,841,Associated Press
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
thehill.com,812,
newsbusters.org,772,
twitchy.com,771,
freebeacon.com,749,
americanthinker.com,742,Articles:
americanthinker.com,742,
slate.com,673,the
slate.com,673,
wsj.com,637,
motherjones.com,490,
cbslocal.com,1

In [25]:
# Some of the articles are empty, let's remove them.

# One boolean mask for "no usable text": missing (NaN/None) or the empty string.
is_empty = df['tokenized'].isnull() | (df['tokenized'] == '')

# Filter by mask instead of dropping by index label: a label-based drop would
# also remove non-empty rows that share a label with an empty one if the index
# were not unique. reset_index(drop=True) renumbers the rows 0..n-1 and
# discards the old index.
df = df.loc[~is_empty].reset_index(drop=True)
df.shape

(66046, 5)

In [26]:
%%time

# Tokenize the text.
# The 'tokenized' column is passed to utils.parse_docs as a plain list together
# with the keep_stops / min_sents settings, and is overwritten with the result.
# In the frame displayed later, the parsed values look like space-separated
# "word_POS" tokens.

df['tokenized'] = utils.parse_docs(list(df['tokenized']), keep_stops, min_sents)

processing 66046 docs
CPU times: user 120 ms, sys: 96 ms, total: 216 ms
Wall time: 5.05 s


__Next let's encode the entire corpus into some vocab data structures.__

In [27]:
%%time

# Extract the vocabulary and related data structures for encoding/decoding the corpus.
# utils.create_vocab receives the parsed documents as a list and returns three
# objects: vocab_list (previewed in the next cell), vocab_word2idx (used below
# to look up each token's index) and vocab_idx2word.
# NOTE(review): vocab_idx2word is presumably the inverse lookup -- confirm.

vocab_list, vocab_word2idx, vocab_idx2word = utils.create_vocab(df['tokenized'].tolist())

dictionary size: 34414
CPU times: user 368 ms, sys: 68 ms, total: 436 ms
Wall time: 338 ms


In [28]:
# Check out the top 10 words
# Entries display as (token, count) pairs, apparently ordered by descending count.

vocab_list[:10]

[(u'trump_PROPN', 17496),
 (u'the_DET', 12820),
 (u'in_ADP', 12107),
 (u'of_ADP', 10150),
 (u'to_PART', 9985),
 (u'be_VERB', 9851),
 (u'-PRON-_PRON', 9181),
 (u'for_ADP', 8150),
 (u'on_ADP', 7357),
 (u'a_DET', 7116)]

In [29]:
%%time

# Encode the corpus.

df['encoded_text'] = df['tokenized'].map(lambda x: [vocab_word2idx[y] for y in x.split()])

CPU times: user 200 ms, sys: 24 ms, total: 224 ms
Wall time: 190 ms


In [30]:
# Encode the labels.

# Keep the fitted encoder instead of discarding it: label_encoder.classes_
# records which integer stands for which label, and
# label_encoder.inverse_transform() can turn encoded values back into labels.
label_encoder = LabelEncoder()

# The column can be passed directly; no intermediate list copy is needed.
df['encoded_label'] = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,title,label,url,domain,tokenized,encoded_text,encoded_label
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com,cnn_PROPN orc_PROPN poll_PROPN most_ADJ americ...,"[34, 3435, 986, 666, 187, 59, 122, 7407]",0
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com,marines_PROPN nude_PROPN photo_PROPN scandal_N...,"[1714, 7665, 2021, 399, 49, 2014, 107, 78, 484...",1
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com,man_NOUN survive_VERB fall_PROPN off_ADP mount...,"[42, 1252, 2481, 382, 5856]",0
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com,gop_PROPN health_NOUN care_NOUN bill_NOUN woul...,"[38, 62, 79, 71, 121, 303, 3848, 2291, 4847, 1...",1
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com,mansfield_PROPN timberview_PROPN top_VERB corp...,"[19105, 25480, 2780, 5108, 4961, 4154, 2, 9033]",0


### Write the data

In [31]:
%%time

# Persist the processed frame together with the vocabulary structures.
# '../data/data' is a path prefix: the logged output name also carries the
# keep_stops and min_sents values (e.g. 'data-True-0.pkl').
# The return value is bound to `_` so it is not echoed as the cell's output.

_ = utils.write_dataset('../data/data', df, keep_stops, min_sents, vocab_list, vocab_word2idx, vocab_idx2word)

wrote to ../data/data-True-0.pkl
CPU times: user 1.76 s, sys: 40 ms, total: 1.8 s
Wall time: 1.81 s
