This notebook preprocesses the dataset for input into our ML models, using spaCy for tokenization, lemmatization, and part-of-speech (PoS) tagging.

In [3]:
from __future__ import unicode_literals, print_function

import tld
import spacy
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import utils

# Load spaCy's English model up front.  The returned pipeline is not bound to
# a name, so nothing else in this notebook uses it directly.
# NOTE(review): presumably this is a fail-fast availability check / warm-up and
# utils loads its own pipeline -- confirm, otherwise this call is wasted work.
spacy.load('en')

# Register the autoreload extension so the `%autoreload` magic is available.
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Bare `%autoreload` performs a one-off reload of already-imported modules.
# utils was already imported in the first cell; the import is repeated here
# after the reload.
%autoreload
import utils

### Pre-pre-processing
Read and clean the article text for the types of articles we care about: those that Alchemy has labeled as political and that we have labeled as either "conservative" or "liberal".  We'll also extract each article's domain (e.g. `dailysurge.com`) from its URL for later use.  The output of this section is a pandas DataFrame.

In [4]:
# Read data, excluding satirical because there are too few samples.
# The pattern '.*satirical' is handed to utils.get_file_list as exclude_regex;
# presumably it is matched against each file path -- TODO confirm in utils.

datadir = '../../news-crawler/data/articles/'  # relative path, resolved against the working directory
files = utils.get_file_list(datadir, exclude_regex='.*satirical')

print('Number of articles: {}'.format(len(files)))

Number of articles: 67292


In [5]:
%%time

# Use multiprocessing to pre-pre-process.
# Builds the dataframe from the file list gathered earlier.  The parallelism
# lives inside utils.create_dataframe (not visible from this notebook).

df = utils.create_dataframe(files)
print(df.shape)

(67292, 3)
CPU times: user 231 ms, sys: 89.2 ms, total: 320 ms
Wall time: 51 s


In [6]:
# Take a peek!  At this point the frame has title, label and url columns.

df.head()

Unnamed: 0,title,label,url
0,COMMENTARY: Restoring Freedom: Now or Never (A...,conservative,http://dailysurge.com/2016/12/commentary-resto...
1,COMMENTARY: Welcoming Back the American Dream ...,conservative,http://dailysurge.com/2016/12/commentary-welco...
2,COMMENTARY: The 28th Amendment » DailySurge,conservative,http://dailysurge.com/2016/11/commentary-28th-...
3,COMMENTARY: A Return to the Rule of Law » Dail...,conservative,http://dailysurge.com/2016/11/commentary-retur...
4,COMMENTARY: When Is Trump Going to the Toilet ...,conservative,http://dailysurge.com/2016/11/commentary-trump...


In [7]:
# Extract each article's registered domain (e.g. 'dailysurge.com') for later.
#
# Older releases of `tld` return the registered domain from get_tld(); newer
# releases return only the public suffix (e.g. 'com') from get_tld() and
# expose the registered domain as get_fld().  Prefer get_fld when it exists so
# this column holds the same values under either version.

extract_domain = getattr(tld, 'get_fld', tld.get_tld)
df['domain'] = df['url'].map(extract_domain)
df.head()

Unnamed: 0,title,label,url,domain
0,COMMENTARY: Restoring Freedom: Now or Never (A...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com
1,COMMENTARY: Welcoming Back the American Dream ...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com
2,COMMENTARY: The 28th Amendment » DailySurge,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com
3,COMMENTARY: A Return to the Rule of Law » Dail...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com
4,COMMENTARY: When Is Trump Going to the Toilet ...,conservative,http://dailysurge.com/2016/11/commentary-trump...,dailysurge.com


In [8]:
# Checkpoint the dataframe (title, label, url, domain) to disk so the spaCy
# section can start from this file.
df.to_pickle('../data/titles.pkl')

### Pre-processing using spacy NLP

__Lemmatize the text and tag words with their part of speech.__

In [9]:
# Reload the checkpoint written by this notebook.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
df = pd.read_pickle('../data/titles.pkl')
df.shape

(67292, 4)

In [13]:
# We'll keep an article only if it has at least min_sents sentences.
min_sents = 0     # per the rule above, 0 should keep every article -- confirm in utils.parse_docs

# Whether to keep stopwords (True) or drop them (False).  With True, stopwords
# such as "the" and "of" still show up in the vocabulary built later on.
keep_stops = True

In [14]:
# Quick look at the reloaded frame: title, label, url and domain columns.
df.head()

Unnamed: 0,title,label,url,domain
0,COMMENTARY: Restoring Freedom: Now or Never (A...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com
1,COMMENTARY: Welcoming Back the American Dream ...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com
2,COMMENTARY: The 28th Amendment » DailySurge,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com
3,COMMENTARY: A Return to the Rule of Law » Dail...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com
4,COMMENTARY: When Is Trump Going to the Toilet ...,conservative,http://dailysurge.com/2016/11/commentary-trump...,dailysurge.com


In [15]:
%%time

# Remove domain identifiers from the text.
# The helper prints comma-separated lines that appear to be
# (domain, article count, text stripped for that domain).
# NOTE(review): the frame gains a 'tokenized' column somewhere in this step
# (later cells read df['tokenized']) -- presumably the cleaned title; confirm
# in utils.remove_hints.

df = utils.remove_hints(df)

cnn.com,7610,
usatoday.com,4817,
cbsnews.com,4576,
go.com,4393,Video:
go.com,4393,
washingtonpost.com,4030,
breitbart.com,3740,- Breitbart
breitbart.com,3740,
nytimes.com,3711,
cnbc.com,3185,
newsmax.com,2530,
foxnews.com,2436,
rightwingnews.com,2356,| John Hawkins' Right Wing News
rightwingnews.com,2356,
washingtonexaminer.com,2040,
realclearpolitics.com,2002,| RealClearPolitics
realclearpolitics.com,2002,
westernjournalism.com,1960,Just
westernjournalism.com,1960,Trump
westernjournalism.com,1960,
washingtontimes.com,1712,
dailywire.com,1537,
weeklystandard.com,1496,
reuters.com,1463,
theatlantic.com,1259,
huffingtonpost.com,1215,
nypost.com,1144,
ijr.com,856,to
ijr.com,856,
ap.org,841,Associated Press
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
ap.org,841,
thehill.com,812,
newsbusters.org,772,
twitchy.com,771,
freebeacon.com,749,
americanthinker.com,742,Articles:
americanthinker.com,742,
slate.com,673,the
slate.com,673,
wsj.com,637,
motherjones.com,490,
cbslocal.com,1

In [16]:
# Some of the articles are empty (missing or '' text); remove them.
#
# Filter with a boolean mask rather than looking up positions with np.where
# and dropping by index label: a label-based drop removes every row sharing a
# label, so it would discard extra rows if the index ever held duplicates.

is_empty = df['tokenized'].isnull() | (df['tokenized'] == '')
df = df[~is_empty].reset_index(drop=True)  # renumber rows 0..n-1 after filtering
df.shape

(66046, 5)

In [17]:
%%time

# Tokenize the text.
# The 'tokenized' column is both the input and the output of this step, so
# re-running the cell would feed already-parsed text back into the parser.
# Judging by the dataframe preview further down, each result is a
# space-separated string of lowercase lemma_POS tokens
# (e.g. 'welcome_VERB back_ADV') -- confirm in utils.parse_docs.

df['tokenized'] = utils.parse_docs(list(df['tokenized']), keep_stops, min_sents)

processing 66046 docs
CPU times: user 177 ms, sys: 50 ms, total: 227 ms
Wall time: 9.25 s


__Next, let's build vocabulary data structures from the corpus and use them to encode it.__

In [24]:
%%time

# Extract the vocabulary and related data structures for encoding/decoding the corpus.
#   vocab_list     -- (token, count) pairs
#   vocab_word2idx -- token -> integer index (used to encode the corpus)
#   vocab_idx2word -- presumably the inverse mapping, index -> token; confirm in utils

vocab_list, vocab_word2idx, vocab_idx2word = utils.create_vocab(df['tokenized'].tolist())

dictionary size: 34414
CPU times: user 510 ms, sys: 11.5 ms, total: 521 ms
Wall time: 515 ms


In [25]:
# Check out the top 10 words
# (assumes vocab_list is ordered by descending count, as the displayed
# output suggests -- confirm in utils.create_vocab).

vocab_list[:10]

[(u'trump_PROPN', 17496),
 (u'the_DET', 12820),
 (u'in_ADP', 12107),
 (u'of_ADP', 10150),
 (u'to_PART', 9985),
 (u'be_VERB', 9851),
 (u'-PRON-_PRON', 9181),
 (u'for_ADP', 8150),
 (u'on_ADP', 7357),
 (u'a_DET', 7116)]

In [26]:
%%time

# Encode the corpus.

df['encoded_text'] = df['tokenized'].map(lambda x: [vocab_word2idx[y] for y in x.split()])

CPU times: user 278 ms, sys: 30.2 ms, total: 308 ms
Wall time: 300 ms


In [27]:
# Encode the labels as integers.
#
# Keep the fitted encoder instead of discarding it: label_encoder.classes_
# records which string label each integer stands for, and
# label_encoder.inverse_transform() turns predictions back into label names.

label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'].tolist())
df.head()

Unnamed: 0,title,label,url,domain,tokenized,encoded_text,encoded_label
0,COMMENTARY: Restoring Freedom: Now or Never (A...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com,restoring_PROPN freedom_NOUN now_ADV or_CCONJ ...,"[20507, 1536, 103, 165, 393, 11, 35, 4, 22, 6]",0
1,COMMENTARY: Welcoming Back the American Dream ...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com,welcome_VERB back_ADV american_ADJ dream_PROPN,"[1359, 86, 228, 3031]",0
2,COMMENTARY: The 28th Amendment » DailySurge,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com,the_DET amendment_PROPN,"[1, 2866]",0
3,COMMENTARY: A Return to the Rule of Law » Dail...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com,a_DET return_NOUN to_ADP rule_PROPN of_ADP law...,"[9, 553, 10, 2106, 3, 400]",0
4,COMMENTARY: When Is Trump Going to the Toilet ...,conservative,http://dailysurge.com/2016/11/commentary-trump...,dailysurge.com,when_ADV be_VERB trump_PROPN go_VERB to_ADP to...,"[129, 5, 0, 49, 10, 6509, 11, 165, 149, 10886]",0


### Write the data

In [28]:
%%time

# Persist the processed dataframe together with the vocabulary structures.
# keep_stops and min_sents are passed along as well; the helper's printed
# message shows them embedded in the output filename (<prefix>-True-0.pkl for
# the values used here).  The return value is assigned to `_` and not used.

_ = utils.write_dataset('../data/data', df, keep_stops, min_sents, vocab_list, vocab_word2idx, vocab_idx2word)

wrote to ../data/data-True-0.pkl
CPU times: user 3.48 s, sys: 89.5 ms, total: 3.57 s
Wall time: 3.58 s
