This notebook preprocesses the dataset for input into our ML models, using spaCy for tokenization, lemmatization, and PoS tagging.

In [1]:
from collections import Counter
import os
import json
import re
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp
from string import printable
import spacy
import tld
from sklearn.preprocessing import LabelEncoder

## Create a dataframe from the text files

In [2]:
# Helper functions.

def get_file_list(datadir, exclude_regex=None):
  '''Recursively collect the path of every file found under datadir.

  Args:
    datadir: root directory to walk.
    exclude_regex: optional pattern; a path is dropped when re.match (which
      anchors at the start of the path) succeeds against it.

  Returns a list of paths, each joined onto the directory it was found in.
  '''
  paths = []
  for root, _subdirs, names in os.walk(datadir):
    for name in names:
      paths.append(os.path.join(root, name))

  # No pattern given: nothing to filter out.
  if not exclude_regex:
    return paths

  return [p for p in paths if re.match(exclude_regex, p) is None]
      
def clean_string(string):
  '''Simple preprocessing to remove non-printable characters and excess whitespace.

  Args:
    string: text to clean.

  Returns the text with every whitespace run collapsed to a single space and
  every character outside string.printable removed.
  '''
  # Raw string: '\s' in a plain literal is an invalid escape sequence.
  string = re.sub(r'\s+', ' ', string)
  string = ''.join(ch for ch in string if ch in printable)
  # Dropping a non-printable character that sat between two spaces leaves a
  # double space behind, so collapse once more after filtering.
  return re.sub(r' {2,}', ' ', string)

def create_dataframe(files=None):
  '''Create a dataframe from a file list, filtering out non-political articles.

  Args:
    files: iterable of paths to JSON article files (a list, array, or any
      one-shot iterator). None is treated as an empty list.

  Each file is expected to hold a JSON object with the keys 'taxonomy' (a list
  of objects carrying a 'label'), 'text', 'label' and 'url'. An article is kept
  only if at least one taxonomy label matches the regex '.*politics'.

  Returns a DataFrame with columns ['text', 'label', 'url'], one row per kept
  article, indexed 0..n-1. The result is empty when nothing qualifies.
  '''
  columns = ['text', 'label', 'url']

  if files is None:
    files = []

  # Collect plain records and build the frame once at the end; this avoids
  # needing len(files) up front and row-by-row .loc assignment.
  records = []
  for filename in files:

    # Open file.
    with open(filename, 'r') as f:
      data = json.load(f)

    # Keep the article only if some taxonomy label mentions "politics".
    # An empty taxonomy yields no labels and is therefore skipped too.
    labels = [entry['label'] for entry in data['taxonomy']]
    if not any(re.match('.*politics', label) for label in labels):
      continue

    # Basic cleaning of whitespace and non-printable characters in the text.
    records.append((clean_string(data['text']), data['label'], data['url']))

  return pd.DataFrame(records, columns=columns)

def create_dataframe_mp(files):
  '''Single-argument wrapper that forwards one chunk of files to create_dataframe.'''
  frame = create_dataframe(files=files)
  return frame


In [3]:
# Collect every article file under the crawler's data directory, leaving out
# any path that matches the "satirical" pattern.

files = get_file_list(
  '../../news-crawler/data/articles/',
  exclude_regex='.*satirical',
)
print('Number of conservative/liberal articles: {}'.format(len(files)))

Number of conservative/liberal articles: 38162


In [4]:
# Break up the file list into cpu_count chunks.
# Each chunk is materialised as a plain list of str: a lazy map() object (what
# map returns on Python 3) cannot be pickled and sent to the worker processes.

files = [[str(path) for path in chunk] for chunk in np.array_split(files, mp.cpu_count())]

In [5]:
%%time

# Use multiprocessing to read the files into a dataframe.  We'll filter out
# those articles not labeled with taxonomy "politics".

pool = mp.Pool(mp.cpu_count())
try:
  # One dataframe per chunk of files; df is a list of frames at this point.
  df = pool.map(create_dataframe_mp, files)
finally:
  # Always shut the workers down, even if one of them raised.
  pool.close()
  pool.join()

CPU times: user 342 ms, sys: 191 ms, total: 533 ms
Wall time: 24.2 s


In [6]:
# Stack the per-worker frames row-wise into one dataframe with a fresh index.

df = pd.concat(df, ignore_index=True)
print('Dataframe shape: {}'.format(df.shape))

Dataframe shape: (24115, 3)


In [7]:
# Extract top-level domain for later...
# tld.get_tld is applied to each URL and the result stored in a new 'domain' column.
# NOTE(review): the recorded output shows values such as 'dailysurge.com';
# confirm the installed tld version returns the same form.

df['domain'] = df['url'].map(tld.get_tld)

In [8]:
df.head()

Unnamed: 0,text,label,url,domain
0,Big government has been crushing the United St...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com
1,During the eight years of the Obama administra...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com
2,We are witnessing the rise of a new right whic...,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com
3,If theres one thing that Americans find intole...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com
4,What is Airbnb? Airbnb is an online marketplac...,conservative,http://dailysurge.com/2016/11/commentary-airbn...,dailysurge.com


## Parse the data using `spacy` NLP

In [9]:
# Load spacy's NLP model for English.
# NLP is a module-level object; parse_doc calls it on each raw document.
# NOTE(review): presumably the 'en' shortcut depends on the spaCy version this
# notebook was written against -- confirm it resolves in the installed version.
NLP = spacy.load('en')

In [10]:
# We'll keep article if it has at least min_sents sentences.
# This value is placed in the multiprocessing argument tuples built later on.
min_sents = 3     

# Whether to exclude stopwords.
exclude_stops = False

# In dumb mode we don't lemmatize or otherwise filter the sequences.
# parse_doc reads this module-level flag directly in its `if dumb:` branch.
dumb = True

In [11]:
# Helper function to parse a document.

def parse_doc(doc, exclude_stops=False, min_sents=3, dump=None):
  '''Tokenize one document with the module-level NLP model and return it as text.

  Args:
    doc: raw document text; it is parsed inside this function.
    exclude_stops: when True (and not in raw mode), drop tokens flagged is_stop.
    min_sents: minimum number of sentences required, else return ''.
    dump: raw ("dumb") mode switch. The parameter name is kept as originally
      spelled so existing callers keep working. None (the default) falls back
      to the module-level `dumb` flag, which is what the body always consulted.

  Returns:
    '' if the document has fewer than min_sents sentences. In raw mode, every
    token lower-cased and joined by spaces. Otherwise the alphabetic tokens as
    space-joined 'lemma_POS' strings.
  '''

  def token_formatter(token):
    # Use the argument itself rather than a name leaked from an enclosing
    # comprehension (which only resolves on Python 2).
    return '{}_{}'.format(token.lemma_, token.pos_)

  raw_mode = dumb if dump is None else dump

  # Parse the doc.
  parsed = NLP(doc)

  # Check that document has at least min_sents sentences.
  if sum(1 for _ in parsed.sents) < min_sents:
    return ''

  if raw_mode:
    # Raw mode: keep every token, only lower-casing it.
    text = [str(tok).lower() for tok in parsed]
  elif exclude_stops:
    text = [token_formatter(tok) for tok in parsed if tok.is_alpha and not tok.is_stop]
  else:
    text = [token_formatter(tok) for tok in parsed if tok.is_alpha]

  return ' '.join(text)

def parse_doc_mp(args):
  '''Helper function for multiprocessing.

  Args:
    args: tuple of positional arguments for parse_doc, starting with the
      document text.

  Forwards the whole tuple, so trailing entries (such as the sentence minimum
  and the raw-mode flag) are passed on instead of being dropped.
  '''
  return parse_doc(*args)


In [12]:
# Create the arg list for multiprocessing.
# Built as a real list (not a one-shot zip iterator) so the pool cell can be
# re-run without silently receiving an exhausted iterator.

args = [(text, exclude_stops, min_sents, dumb) for text in df['text'].tolist()]

In [14]:
%%time

# Parse the documents using multiprocessing pool.  In my experiments this was 5x
# faster than using NLP.pipe (?).

pool = mp.Pool()
try:
  df['tokenized_text'] = pool.map(parse_doc_mp, args)
finally:
  # Always shut the workers down, even if one of them raised.
  pool.close()
  pool.join()

CPU times: user 474 ms, sys: 284 ms, total: 758 ms
Wall time: 3min 45s


In [15]:
df.head()

Unnamed: 0,text,label,url,domain,tokenized_text
0,Big government has been crushing the United St...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com,big government has been crushing the united st...
1,During the eight years of the Obama administra...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com,during the eight years of the obama administra...
2,We are witnessing the rise of a new right whic...,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com,we are witnessing the rise of a new right whic...
3,If theres one thing that Americans find intole...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com,if there s one thing that americans find intol...
4,What is Airbnb? Airbnb is an online marketplac...,conservative,http://dailysurge.com/2016/11/commentary-airbn...,dailysurge.com,what is airbnb ? airbnb is an online marketpla...


In [16]:
# Keep only the rows whose parse produced text ('' marks a too-short article),
# then renumber the index from zero.

has_tokens = df['tokenized_text'] != ''
df = df[has_tokens].reset_index(drop=True)
df.shape

(23452, 5)

In [17]:
%%time

# Build a dictionary with word counts.
# Iterate the column directly; iterrows() would build a Series per row just to
# read this one field.

c = Counter()
for text in df['tokenized_text']:
  c.update(text.split())

print('Dictionary size: {}'.format(len(c)))

Dictionary size: 132979
CPU times: user 8.02 s, sys: 28.2 ms, total: 8.04 s
Wall time: 8.03 s


In [18]:
# Create a list of (word, count) pairs sorted by descending count.
# most_common() with no argument returns every entry in that order and works on
# both Python 2 and 3, unlike iteritems().

vocab_list = c.most_common()

print('Top 10 vocab words:')
print(vocab_list[:10])

Top 10 vocab words:
[('the', 878245), (',', 786639), ('.', 662752), ('to', 423195), ('of', 380543), ('and', 355242), ('a', 330111), ('in', 292296), ('that', 212405), ('"', 156619)]


In [19]:
# Create word:index and index:word dicts for encoding/decoding.
# A word's index is its rank in vocab_list (0 = most frequent).

vocab_word2idx = {word: ix for ix, (word, _count) in enumerate(vocab_list)}
# items() works on both Python 2 and 3; iteritems() is Python 2 only.
vocab_idx2word = {ix: word for word, ix in vocab_word2idx.items()}

In [20]:
%%time

# Encode the corpus: each tokenized article becomes a list of vocabulary indices.

def _encode(text):
  return [vocab_word2idx[token] for token in text.split()]

df['encoded_text'] = df['tokenized_text'].map(_encode)
df.head()

CPU times: user 3.04 s, sys: 60.1 ms, total: 3.1 s
Wall time: 3.1 s


In [21]:
# Encode the labels.
# Keep the fitted encoder so the integer codes can be mapped back to label
# names later (label_encoder.classes_ / label_encoder.inverse_transform).

label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,text,label,url,domain,tokenized_text,encoded_text,encoded_label
0,Big government has been crushing the United St...,conservative,http://dailysurge.com/2016/12/commentary-resto...,dailysurge.com,big government has been crushing the united st...,"[384, 100, 31, 49, 9784, 0, 101, 80, 10, 0, 34...",0
1,During the eight years of the Obama administra...,conservative,http://dailysurge.com/2016/12/commentary-welco...,dailysurge.com,during the eight years of the obama administra...,"[112, 0, 703, 102, 4, 0, 72, 99, 1, 39, 5270, ...",0
2,We are witnessing the rise of a new right whic...,conservative,http://dailysurge.com/2016/11/commentary-28th-...,dailysurge.com,we are witnessing the rise of a new right whic...,"[39, 30, 9394, 0, 1429, 4, 6, 57, 164, 60, 1, ...",0
3,If theres one thing that Americans find intole...,conservative,http://dailysurge.com/2016/11/commentary-retur...,dailysurge.com,if there s one thing that americans find intol...,"[63, 56, 150, 51, 478, 8, 230, 482, 10147, 174...",0
4,What is Airbnb? Airbnb is an online marketplac...,conservative,http://dailysurge.com/2016/11/commentary-airbn...,dailysurge.com,what is airbnb ? airbnb is an online marketpla...,"[55, 13, 7459, 91, 7459, 13, 36, 780, 6163, 8,...",0


In [22]:
# Suffix used in both output file names.
tag = 'dumb'

# Save the dataframe.
df.to_pickle('final-dataframe-{}.pkl'.format(tag))

# Save the vocabulary dictionaries.
to_pkl = {
  'word2idx': vocab_word2idx,
  'idx2word': vocab_idx2word,
  'ranked_list': vocab_list
}

# Pickle data is bytes, so the file must be opened in binary mode.
with open('final-vocab-{}.pkl'.format(tag), 'wb') as f:
  pickle.dump(to_pkl, f)

In [28]:
df.iloc[4, :]

text              What is Airbnb? Airbnb is an online marketplac...
label                                                  conservative
url               http://dailysurge.com/2016/11/commentary-airbn...
domain                                               dailysurge.com
tokenized_text    airbnb_PROPN airbnb_PROPN online_ADJ marketpla...
encoded_text      [6401, 6401, 1974, 4446, 165, 2, 144, 1031, 13...
encoded_label                                                     0
Name: 4, dtype: object