In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [32]:
#Common import
import numpy as np
import pandas as pd
import time
from tensorflow.keras.preprocessing.text import Tokenizer

import asyncio
import nest_asyncio
nest_asyncio.apply()

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#datasets
from standardize_datasets import get_merge_dataset
from standardize_datasets import clean_unicode
from standardize_datasets import standardize_without_theme
from standardize_datasets import remove_punctuation

In [3]:
# Imports for NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

In [4]:
#download nltk data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasloux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thomasloux/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomasloux/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/thomasloux/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
#Imports for Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
df = get_merge_dataset()

# Implementing functions to preprocess using nltk or spacy

## Using NLTK

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize_nltk(text):
    # Doesn't take into account the POS
    word_tokens = word_tokenize(text, language='english')
    lemmatized_sentence = [lemmatizer.lemmatize(w) for w in word_tokens if not w in stop_words] 
    return lemmatized_sentence

## Using Spacy

In [40]:
# Using Spacy
nlp = spacy.load("en_core_web_sm")

def remove_stopwords_and_lemmatize_spacy_fast(text):
    doc = nlp(text, disable=['parser', 'ner'])
    lemmatized_sentence = [w.lemma_ for w in doc if w.text not in STOP_WORDS ]
    return lemmatized_sentence

def remove_stopwords_and_lemmatize_spacy(text):
    doc = nlp(text, disable=['parser', 'ner']) 
    lemmatized_sentence = [w.lemma_ for w in doc if w.text not in STOP_WORDS ]
    return lemmatized_sentence

def remove_stopwords_and_lemmatize_spacy_batch(batch):
    docs = nlp.pipe(batch, batch_size=1000, disable=['parser', 'ner'])
    lemmatized_sentence = [[w.lemma_ for w in doc if w.text not in STOP_WORDS] for doc in docs]
    return lemmatized_sentence

def remove_stopwords_and_lemmatize_spacy_multiprocessed(batch):
    docs = nlp.pipe(batch, batch_size=1000, n_process=4,disable=['parser', 'ner'])
    lemmatized_sentence = [[w.lemma_ for w in doc if w.text not in STOP_WORDS] for doc in docs]
    return lemmatized_sentence

# Comparing lemmatized sentences

In [19]:
example = df['article'][0]
print(example)

WASHINGTON—Saying  the sender’s contributions were appreciated but ultimately  self-defeating, Capitol Police told reporters Wednesday that a  thoughtful letter on how to improve the legislative process was undercut  by the poison powder included in the envelope. “Although the anonymous  assailant’s note had some helpful feedback on how to break through  partisan gridlock, the deadly quantity of ricin spread on those pages  also makes you wonder how much he really believes in improving the  democratic process,” said Capitol Police spokesperson Jermaine Williams,  rushing to note that the letter itself was well thought out, thoroughly  researched, and expressed important points on places where Republicans  and Democrats might come together to advance bills on pharmaceutical  costs and energy permitting reform. “What’s especially nice is how much  constructive feedback it contains. Sometimes people can get really nasty  about politics. But, aside from a few typos, blood-stains, and the  

In [None]:
pos_tag(word_tokenize(example, language='english'))

In [21]:
print(remove_stopwords_and_lemmatize_nltk(example))
print(remove_stopwords_and_lemmatize_spacy(example))

NameError: name 'get_wordnet_pos' is not defined

In [100]:
print(example)
print(remove_stopwords_and_lemmatize_nltk(example))
print(remove_stopwords_and_lemmatize_spacy(example))

#Performance of spacy seems to be better than nltk

PHOENIX—Catering to a large and valuable segment of customers who have misguided visions of what city living will be like, truck rental company U-Haul announced Wednesday that it is now offering a discount to customers who will just end up moving back home in 18 months after failing to make it in a major metropolitan area. “Beginning today, we’re taking 30 percent off our mileage rate for those who pack up their belongings, head off to a large city, give their dreams a feeble shot, and then come crawling right back to the safety and security of home within a year and a half,” said U-Haul spokesman Christine Shipley, adding that the deal would apply to all 10-foot trucks designated for moving into studios and one-bedroom apartments, most of which customers will leave behind before their lease is even up. “We are also including free furniture blankets, bubble wrap, and boxes for the items that may not even be fully unpacked during the short time it takes for our customers to be spit righ

In [57]:
_, o = standardize_without_theme()

In [59]:
o['article'][0]

'HOLLYWOOD, CA—The new Jerry Bruckheimer comedy Kangaroo Jack has successfully tapped into America&#39;s longstanding love affair with rapping kangaroos, taking in a box-office-best $17.7 million in its opening weekend. &quot;From Krazy Legs Kangol in the early &#39;80s to such New School acts as Pouch Gangstas and Tha Mar$upials, kangaroos have always been at the forefront of the rap scene,&quot; media analyst Glen Coffey said. &quot;But not until now has anyone had the vision to exploit this trend in a full-length feature film.&quot; Warner Bros. has already confirmed plans for a sequel, Koala Bob, featuring a computer-generated beat-boxing koala who steals $50 million in gold bullion… and he&#39;s not giving it back.\n\n'

In [60]:
o['article'] = o['article'].apply(clean_unicode)

In [61]:
o['article'] = o['article'].apply(remove_punctuation)

In [62]:
o['article'][0]

"HOLLYWOOD  CA The new Jerry Bruckheimer comedy Kangaroo Jack has successfully tapped into America's longstanding love affair with rapping kangaroos  taking in a box office best  17 7 million in its opening weekend   From Krazy Legs Kangol in the early '80s to such New School acts as Pouch Gangstas and Tha Mar upials  kangaroos have always been at the forefront of the rap scene   media analyst Glen Coffey said   But not until now has anyone had the vision to exploit this trend in a full length feature film   Warner Bros  has already confirmed plans for a sequel  Koala Bob  featuring a computer generated beat boxing koala who steals  50 million in gold bullion  and he's not giving it back "

In [56]:
' '.join(remove_stopwords_and_lemmatize_spacy(o['article'][0]))

"HOLLYWOOD   CA the new Jerry Bruckheimer comedy Kangaroo Jack successfully tap America ' s longstanding love affair rap kangaroo   take box office good   17 7 million opening weekend    from Krazy Legs Kangol early ' 80 New School act Pouch Gangstas Tha Mar upial   kangaroo forefront rap scene    medium analyst Glen Coffey say    but vision exploit trend length feature film    Warner Bros   confirm plan sequel   Koala Bob   feature computer generate beat box koala steal   50 million gold bullion   ' s give"

# Test performance for removing stopwords and lemmatizing 

In [39]:
%timeit remove_stopwords_and_lemmatize_nltk(example)

817 µs ± 123 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [109]:
%timeit remove_stopwords_and_lemmatize_spacy(example)

44.7 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


NLTK seems way faster but it doesn't provide the same results 

In [108]:
%timeit remove_stopwords_and_lemmatize_spacy_fast(example)

21.6 ms ± 384 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [120]:
%timeit remove_stopwords_and_lemmatize_spacy_batch([example])

21.9 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [129]:
%timeit remove_stopwords_and_lemmatize_spacy_batch(df['article'][0:2000])

1min 32s ± 1.36 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [126]:
%timeit df['article'][0:100].apply(remove_stopwords_and_lemmatize_spacy_fast)

4.9 s ± 120 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [130]:
%timeit remove_stopwords_and_lemmatize_spacy_multiprocessed(df['article'][0:2000])

40.1 s ± 1.07 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Preprocessing the text

In [51]:
#for t1, t2, t3 in zip(word_tokenize(example), example.split(" "), nlp(example)):
#    print(t1,"---------" , t2, '--------', t3.text)

In [57]:
df_short = df[:1000]

In [58]:
df_short['article'] = df_short['article'].apply(remove_stopwords_and_lemmatize_spacy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['article'] = df_short['article'].apply(remove_stopwords_and_lemmatize_spacy)


In [60]:
#Using Tokenizer from tf.keras
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(df_short['article'])
word_index = tokenizer.word_index
print(len(word_index))

26116


In [66]:
print(list(word_index.items())[:10])

[('<OOV>', 1), (',', 2), ('.', 3), ('"', 4), ('-', 5), ('i', 6), ('say', 7), ('the', 8), ('year', 9), ('–', 10)]


In [67]:
tokenizer.texts_to_sequences(df_short['article'][0])

[[1],
 [61],
 [1],
 [297],
 [1],
 [1],
 [280],
 [1],
 [1],
 [299],
 [1],
 [22],
 [],
 [1],
 [1],
 [42],
 [1],
 [],
 [1],
 [277],
 [300],
 [183],
 [1],
 [280],
 [77],
 [341],
 [90],
 [725],
 [65],
 [294],
 [226],
 [1],
 [434],
 [],
 [],
 [198],
 [240],
 [],
 [41],
 [417],
 [1],
 [1],
 [166],
 [1],
 [1],
 [],
 [145],
 [297],
 [299],
 [],
 [1],
 [1],
 [583],
 [],
 [26],
 [1],
 [59],
 [984],
 [394],
 [90],
 [9],
 [184],
 [],
 [],
 [7],
 [1],
 [],
 [1],
 [1],
 [1],
 [1],
 [],
 [66],
 [72],
 [1],
 [133],
 [],
 [813],
 [1],
 [1],
 [341],
 [1],
 [],
 [1],
 [1],
 [],
 [280],
 [53],
 [1],
 [],
 [],
 [30],
 [64],
 [292],
 [1],
 [1],
 [],
 [1],
 [1],
 [],
 [774],
 [1],
 [1],
 [1],
 [445],
 [14],
 [41],
 [280],
 [1],
 [59],
 [74],
 [299],
 [],
 [78],
 [105],
 [74],
 [1],
 [887],
 [68],
 [122],
 [],
 [1],
 [315],
 [72],
 [],
 [],
 [8],
 [24],
 [183],
 [140],
 [1],
 [],
 [1],
 [1],
 [1],
 [184],
 [],
 [1],
 [38],
 [1],
 [1],
 [341],
 [495],
 [],
 []]

In [73]:
# Useless as it doesn't speed up the process

async def lemmatize_token(token):
    if token.text not in STOP_WORDS:
        return token.lemma_
    else:
        return ""

async def remove_stopwords_and_lemmatize_async(text):
    doc = nlp(text)
    tasks = [lemmatize_token(token) for token in doc]
    lemmatized_tokens = await asyncio.gather(*tasks)
    return lemmatized_tokens