This notebook preprocesses the dataset for input into our ML models, using spaCy for tokenization, lemmatization, and part-of-speech (PoS) tagging.

In [1]:
from __future__ import unicode_literals, print_function

import tld
import spacy
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import utils

# Load spaCy's English model up front.
# NOTE(review): the returned pipeline is not bound to a name, so nothing in this
# notebook uses it directly — presumably `utils` loads its own copy; confirm
# whether this call is still needed.
spacy.load('en')

# Register the autoreload extension so the `%autoreload` magic is available.
%load_ext autoreload

In [17]:
%autoreload
import utils

### Pre-pre-processing
Read and clean the article text for the types of articles we care about, namely those that Alchemy has labeled as political and that we have labeled as either "conservative" or "liberal". We'll also extract each article's domain (e.g. `newsmax.com`) from its URL for later. The output here is a pandas dataframe.

In [3]:
# Pre-processing settings shared by the cells below.

# JSON field of each article to use as input.
FIELD = 'title'
# Minimum number of sentences an article needs in order to be used.
MIN_SENTS = 1
# Whether common stop words are kept.
KEEP_STOPS = True

# File name of the output dataframe; the settings above are part of the name.
OUTPUT_FILE = '../data/{}-{}-{}.pkl'.format(
    FIELD,
    MIN_SENTS,
    KEEP_STOPS,
)

In [4]:
# Read data, excluding satirical because there are too few samples.
# `files` is the list of article files found under `datadir` whose path does not
# match `exclude_regex`; it is consumed by the dataframe-building cell below.
# NOTE(review): `datadir` is a relative path into a sibling checkout
# (news-crawler), so this cell depends on the notebook's working directory.

datadir = '../../news-crawler/data/articles/'
files = utils.get_file_list(datadir, exclude_regex='.*satirical')

# Report how many articles were found.
print('Number of articles: {}'.format(len(files)))

Number of articles: 67292


In [5]:
%%time

# Use multiprocessing to pre-pre-process.
# Builds the working dataframe `df` from the listed article files, passing FIELD
# as the field to read. The parallelism mentioned above lives inside
# utils.create_dataframe and is not visible from this notebook.

df = utils.create_dataframe(files, field=FIELD)

# Sanity check on the result size (rows, columns).
print(df.shape)

(67292, 3)
CPU times: user 552 ms, sys: 140 ms, total: 692 ms
Wall time: 38.4 s


In [7]:
# Extract top-level domain
# Maps every article URL to a `domain` column; later cells use it to drop rare
# sources and to build a second set of labels.
# NOTE(review): `tld.get_tld` raises on URLs it cannot parse unless
# `fail_silently=True` is passed, and newer `tld` releases are understood to
# return only the suffix (e.g. `com`) rather than the site domain — confirm the
# installed version gives values like `newsmax.com`.

df['domain'] = df['url'].map(tld.get_tld)
df.head()

Unnamed: 0,title,label,url,domain
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com


### Pre-processing using spacy NLP

__Lemmatize the text and tag words with their part of speech.__

In [8]:
%%time

# Tokenize the text.
# Passes the FIELD text of every article, plus the KEEP_STOPS and MIN_SENTS
# settings, to utils.parse_docs and stores the result as the `tokenized` column.
# The result must contain one entry per row, in row order, for the column
# assignment to line up. Later cells treat each entry as a whitespace-separated
# string of tokens, with '' or null meaning the article produced nothing usable.

df['tokenized'] = utils.parse_docs(list(df[FIELD]), KEEP_STOPS, MIN_SENTS)

processing 67292 docs
CPU times: user 124 ms, sys: 48 ms, total: 172 ms
Wall time: 4.66 s


In [9]:
# Some of the articles are empty, let's remove them.

# Flag rows whose tokenized text is missing or the empty string.
is_empty = df['tokenized'].isnull() | (df['tokenized'] == '')

# Keep the remaining rows and renumber them 0..n-1. Filtering with a boolean
# mask selects by row rather than by index label, and reset_index returns a
# fresh frame, so re-running this cell is harmless.
df = df[~is_empty].reset_index(drop=True)
df.shape

(67286, 5)

__Next let's encode the entire corpus into some vocab data structures.__

In [10]:
%%time

# Extract the vocabulary and related data structures for encoding/decoding the corpus.
# Input is the list of tokenized article strings. Of the three results,
# vocab_list holds (token, count) pairs and vocab_word2idx maps a token to its
# integer id (both are used by the cells below); vocab_idx2word is presumably
# the inverse mapping — confirm in utils.create_vocab.

vocab_list, vocab_word2idx, vocab_idx2word = utils.create_vocab(df['tokenized'].tolist())

dictionary size: 34485
CPU times: user 384 ms, sys: 24 ms, total: 408 ms
Wall time: 341 ms


In [11]:
# Check out the top 10 words
# Shows the first ten (token, count) entries of vocab_list. Calling these the
# "top" words assumes utils.create_vocab returns the list ordered by descending
# count — TODO confirm.

vocab_list[:10]

[(u'trump_PROPN', 18382),
 (u'the_DET', 13249),
 (u'in_ADP', 12185),
 (u'of_ADP', 10240),
 (u'to_PART', 10189),
 (u'be_VERB', 9886),
 (u'-PRON-_PRON', 9203),
 (u'for_ADP', 8214),
 (u'on_ADP', 7405),
 (u'to_ADP', 7290)]

In [12]:
%%time

# Encode the corpus.

df['encoded_text'] = df['tokenized'].map(lambda x: [vocab_word2idx[y] for y in x.split()])

CPU times: user 188 ms, sys: 12 ms, total: 200 ms
Wall time: 179 ms


In [13]:
# Count articles per domain and keep only the domains with enough samples;
# the next cell filters the dataframe down to these domains.
MIN_DOMAIN_ARTICLES = 100  # minimum number of articles a domain must have

counts = df['domain'].value_counts()
counts = counts[counts >= MIN_DOMAIN_ARTICLES]
counts.shape

(32,)

In [14]:
# Keep only the articles whose domain passed the count threshold above.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
# NOTE(review): unlike the empty-article cleanup earlier, the index is not
# reset here, so it keeps gaps where rows were removed — confirm that is
# intended.
df = df[df['domain'].isin(counts.index)].copy()
df.shape

(66658, 6)

In [15]:
# Encode the domain labels.
# The fitted encoder is kept so the integer codes can be mapped back to domain
# names later (domain_encoder.classes_ / domain_encoder.inverse_transform).

domain_encoder = LabelEncoder()
# The column is passed directly (no intermediate list needed). assign() returns
# a new frame, so this is safe even when df is a filtered slice of an earlier
# frame.
df = df.assign(encoded_domain=domain_encoder.fit_transform(df['domain']))
df.head()

Unnamed: 0,title,label,url,domain,tokenized,encoded_text,encoded_domain
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com,poll_NOUN most_ADJ americans_PROPN want_VERB w...,"[458, 672, 198, 69, 130, 7443]",15
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com,marines_PROPN nude_PROPN photo_PROPN scandal_N...,"[1738, 7124, 1936, 415, 56, 2032, 116, 87, 507...",11
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com,man_NOUN survive_VERB fall_PROPN off_ADP mount...,"[51, 1218, 2506, 374, 5544, 29]",2
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com,gop_PROPN health_NOUN care_NOUN bill_NOUN woul...,"[47, 72, 89, 82, 131, 305, 3865, 2176, 4881, 1...",27
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com,mansfield_PROPN timberview_PROPN top_VERB corp...,"[19178, 25520, 2808, 5153, 4995, 4183, 2, 9071]",28


In [16]:
# Encode the bias labels.
# LabelEncoder numbers the classes in sorted order. The fitted encoder is kept
# so the integer codes can be mapped back to label names later
# (label_encoder.classes_ / label_encoder.inverse_transform).

label_encoder = LabelEncoder()
# The column is passed directly (no intermediate list needed). assign() returns
# a new frame, so this is safe even when df is a filtered slice of an earlier
# frame.
df = df.assign(encoded_label=label_encoder.fit_transform(df['label']))
df.head()

Unnamed: 0,title,label,url,domain,tokenized,encoded_text,encoded_domain,encoded_label
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com,poll_NOUN most_ADJ americans_PROPN want_VERB w...,"[458, 672, 198, 69, 130, 7443]",15,0
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com,marines_PROPN nude_PROPN photo_PROPN scandal_N...,"[1738, 7124, 1936, 415, 56, 2032, 116, 87, 507...",11,1
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com,man_NOUN survive_VERB fall_PROPN off_ADP mount...,"[51, 1218, 2506, 374, 5544, 29]",2,0
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com,gop_PROPN health_NOUN care_NOUN bill_NOUN woul...,"[47, 72, 89, 82, 131, 305, 3865, 2176, 4881, 1...",27,1
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com,mansfield_PROPN timberview_PROPN top_VERB corp...,"[19178, 25520, 2808, 5153, 4995, 4183, 2, 9071]",28,0


### Write the data

In [18]:
%%time

# Hand the dataframe, the pre-processing settings and the three vocabulary
# structures to utils.write_dataset, targeting OUTPUT_FILE.
# The return value is bound to `_`, presumably just to keep it out of the cell
# output — confirm it is not needed afterwards.
_ = utils.write_dataset(OUTPUT_FILE, df, KEEP_STOPS, MIN_SENTS, vocab_list, vocab_word2idx, vocab_idx2word)

wrote to ../data/title-1-True.pkl
CPU times: user 2.13 s, sys: 28 ms, total: 2.16 s
Wall time: 2.17 s
