In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import regexp_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim.corpora as corpora
import gensim
from datetime import datetime, timedelta
import time
import pickle
import pprint

import warnings
warnings.filterwarnings('ignore')



In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Lemmatize Documents

In [5]:
wnl = WordNetLemmatizer()

def func_lemmatize(words):
    lemmatized = []
    # Tag each word and 
    for word, tag in pos_tag(words):
        wntag = tag[0].lower()
        # Indlcude only adjectives, nouns verbs, adverbs (r) 
        wntag = wntag if wntag in ['a','r','n','v'] else None
        # Lemmatize
        lemma = wnl.lemmatize(word,wntag) if wntag else word
        lemmatized.append(lemma)
    return lemmatized

stop = set(stopwords.words('english'))

pattern = r'(\w+)'

### Other Lemmatizing Functions
The following were attempts to make preprocessing faster, since this was found to be the primary efficiency bottleneck. None of these seemed to have the desired effect and the original lemmatizing approach (defined above) was ultimately used.

In [6]:
def func_lemmatize(words):
    allow = ['a','r','n','v']
    
    # condense all into a list comprehension to try to speed up preprocessing
    list_2 = [wnl.lemmatize(word,wntag) if wntag else word for word, wntag in [(word,tag[0].lower() if tag[0].lower() in allow else None) for word,tag in pos_tag(words)]]
    
    return list_2

In [7]:
def func_lemmatize(words):
    allow = ['a','r','n','v']
    # condense all two list comprehensions to try to speed up preprocessing
    list_1 = [(word,tag[0].lower() if tag[0].lower() in allow else None) for word,tag in pos_tag(words)]
    list_2 = [wnl.lemmatize(word,wntag) if wntag else word for word, wntag in list_1]
    
    return list_2

In [8]:
def func_lemmatize(words):
    allow = ['a','r','n','v']
    # condense all two list comprehensions to try to speed up preprocessing
    lemmatized = [wntag if wntag[0].lower() in allow for word,tag in pog_tag(words)]
    lemma = [wnl.lemmatize(item) for item in list_1]

### Read in Data

In [9]:
# Read in the dataset
df = pd.read_csv('../all-the-news-2-1.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication
0,0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,3,3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,4,4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [10]:
# Drop null records and duplicates
df = df.dropna(subset = ['article','title'])
df.drop_duplicates(subset=['title'],inplace=True)

In [11]:
df.reset_index(inplace=True,drop=True)

In [12]:
df.shape

(2412021, 12)

### Approach 1: Filter out Irrelevant Content

In [20]:
# Define 'dropwords', i.e. words that any article should be dropped if they contain any of them
# This is an attempt to filter out some of the more irrelevant content
dropwords = ['fitch','stock','fidget','dakota','market']

In [21]:
# Exclude article 
excl = [ind for ind in df2.index if not any([word in df2.loc[ind,'title'] for word in dropwords])]
new_ind = [ind for ind in df2.index if ind not in excl]
df2 = df2.loc[new_ind,:]
#df2.reset_index(drop=True,inplace=True)

In [22]:
len(excl)

18481

In [23]:
df1 = df[df['section'].isin(sections)]
#df1_index(drop=True,inplace=True)
#df_ht.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1,inplace=True)

In [24]:
df_ht = pd.concat([df1,df2])
df_ht.drop_duplicates(subset=['title'])
df_ht.reset_index(drop=True,inplace=True)

In [25]:
df_ht.shape

(281411, 12)

### Approach 2: Select a Desired Number of Records to Process

In [12]:
num = 250000
np.random.seed(3)
inds_to_use = np.random.choice(df.index,num,replace=False)
df2 = df.loc[inds_to_use,:]
#df2 = df.loc[:num]
df2.reset_index(drop=True,inplace=True)

In [13]:
len(df2)

250000

In [27]:
''' This function goes through the normal preprocessing steps on a pre-filtered dataset (see above) and performs the 
tokenizing, number- and special character-removal steps, lemmatizing, lower-case and stopword-removal steps.

Sample input: "We were going to go to the zoo at 09:00, but decided against it due to the rain!"
Output: ['go','go','zoo',decide,'against','due','rain']'''

# Original version
def clean(df):
    start = time.time()
    # Tokenize
    df['article_words'] = [regexp_tokenize(article,pattern) for article in df['article']]
    print('tokenizing done.')  
    # Retain only alphabetical characters. Remove all punctuation, numerical and special characters
    df['article_words'] = [[word for word in article if word.isalpha()] for article in df['article_words']]
    print('isalpha done.')
    # Lemmatize all tokens
    df['article_words'] = [func_lemmatize(article) for article in df['article_words']]
    print('lemmatizing done.')
    # Set to lower case and remove stopwords
    df['article_words'] = [[word.lower() for word in article if word.lower() not in stop] for article in df['article_words']]
    interval = round((time.time() - start)/60,2)
    print(f'That  took {interval} mins.')
    return df

In [15]:
#5:05pm
df_ht_clean = clean(df_ht)

tokenizing done.
isalpha done.
lemmatizing done.
That  took 117.94 mins.


In [16]:
df_ht_clean.to_csv('df_ht_clean.csv')