# Preprocessing for Natural Language Processing

In [1]:
import spacy
import re
from collections import Counter
import numpy as np
from bs4 import BeautifulSoup
from spacy.matcher import PhraseMatcher
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords
from num2words import num2words
import phonetics
import autocorrect
from spellchecker import SpellChecker
import string
from spacy_symspell import SpellingCorrector
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from spacy.util import minibatch
import random

In [2]:
# nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  q


## Text encoding

Should probably use UTF-8 for most cases.

## Text substitution, remove null and empty values

Remove noise from raw format -> extract text from html, xml, ...

In [3]:
def extract_text_from_markup(markup_text):
    soup = BeautifulSoup(markup_text)
    lines = soup.text.split('\n')
    tokens = [token.lower().strip() for token in lines if token.strip()]
    return set(tokens)


with open('data/firefox.html') as g:
    html = g.read()

extract_text_from_markup(html)

{'advanced install options & other platforms',
 'android',
 'automatic privacy is here. download firefox to block over 2000 trackers.',
 "avañe'ẽ",
 'bahasa indonesia',
 'benefits',
 'beta',
 'beta for android',
 'brand standards',
 'browsers',
 'cookies',
 'cymraeg',
 'dansk',
 'desktop',
 'deutsch',
 'developer edition',
 'developers',
 'dolnoserbšćina',
 'download firefox',
 'download firefox for linux',
 'download firefox for macos',
 'download firefox for windows',
 'download firefox — free web browser',
 'download in another language',
 'download now',
 'english',
 'english (british)',
 'english (canadian)',
 'enterprise',
 'español (de argentina)',
 'español (de chile)',
 'español (de españa)',
 'español (de méxico)',
 'euskara',
 'firefox',
 'firefox lockwise makes the passwords you save in firefox secure and available on all your devices.',
 'firefox monitor alerts you if we know your information is a part of another company’s data breach.',
 'firefox privacy notice',
 'firefo

## Remove stop words

Commonly occurring words which are not relevant in the context of data.

Do not remove for speech tagging or parsing.

In [4]:
sentence = "My sentence will not contain stopwords after processing because they will be removed."

In [5]:
nlp = spacy.load("en")

In [6]:
filtered = list()
nlp.vocab["not"].is_stop = False
nlp.vocab["n't"].is_stop = False
for word in sentence.split(' '):
    if not nlp.vocab[word].is_stop:
        filtered.append(word)
print(filtered)

['sentence', 'not', 'contain', 'stopwords', 'processing', 'removed.']


In [7]:
#nltk

filtered = list()
nltk_stopwords = set(stopwords.words('english'))
for word in sentence.split(' '):
    if word not in nltk_stopwords:
        filtered.append(word)
print(filtered)

['My', 'sentence', 'contain', 'stopwords', 'processing', 'removed.']


In [8]:
# gensim
remove_stopwords(sentence).split(' ')

['My', 'sentence', 'contain', 'stopwords', 'processing', 'removed.']

## Tokenize

Split sentences in list of words, ...

In [9]:
nlp = spacy.load('en')
doc = nlp("Tea is healthy and calming, don't you think?")
print(f'Token\tpos\tlemma\tstop_words')
for token in doc:
    print(f"{token}\t{token.pos_}\t{token.lemma_}\t{token.is_stop}")

Token	pos	lemma	stop_words
Tea	NOUN	tea	False
is	AUX	be	True
healthy	ADJ	healthy	False
and	CCONJ	and	True
calming	VERB	calm	False
,	PUNCT	,	False
do	AUX	do	True
n't	PART	not	True
you	PRON	-PRON-	True
think	VERB	think	False
?	PUNCT	?	False


## Normalize

* change numbers to word equivalent
* lowercase everything except for sentiment analysis because we lose information with that
* negation handling: aren't -> are not
* remove standalone puncuations such as commas, dots, ...
* change plurals to singular
* lemmatization: find lemma of word (Gensim lib, spacy)
* stemming: take the root of the word, can give inaccurate results but faster than lemmatization

In [10]:
def normalize_text(text):
    text = text.lower().strip()
    nlp = spacy.load("en")
    
    nlp.vocab["not"].is_stop = False
    nlp.vocab["n't"].is_stop = False

    doc = nlp(text)
    
    output_token = list()
    for token in doc:
        print(f"{token}\t{token.pos_}\t{token.lemma_}\t{token.is_stop}")
        token_str = token.text
        if token_str.isdigit():
            output_token.append(num2words(token_str))
        else:
            lemma = token.lemma_
            if token.is_punct or token.is_stop:
                continue
            output_token.append(lemma)
    return output_token

In [11]:
text = "I am a 50 years old man who isn't afraid to get his hands dirty! I like reading books, learning Deep Learning with PyTorch and Kaggle."
normalize_text(text)

i	PRON	i	True
am	AUX	be	True
a	DET	a	True
50	NUM	50	False
years	NOUN	year	False
old	ADJ	old	False
man	NOUN	man	False
who	PRON	who	True
is	AUX	be	True
n't	PART	not	False
afraid	ADJ	afraid	False
to	PART	to	True
get	AUX	get	True
his	DET	-PRON-	True
hands	NOUN	hand	False
dirty	ADJ	dirty	False
!	PUNCT	!	False
i	PRON	i	True
like	VERB	like	False
reading	VERB	read	False
books	NOUN	book	False
,	PUNCT	,	False
learning	VERB	learn	False
deep	ADJ	deep	False
learning	VERB	learn	False
with	ADP	with	True
pytorch	NOUN	pytorch	False
and	CCONJ	and	True
kaggle	NOUN	kaggle	False
.	PUNCT	.	False


['fifty',
 'year',
 'old',
 'man',
 'not',
 'afraid',
 'hand',
 'dirty',
 'like',
 'read',
 'book',
 'learn',
 'deep',
 'learn',
 'pytorch',
 'kaggle']

## Replace synonyms by a same word

would that be useful ? This could remove noise from the text but depending on the NLP use case, we would lose information.

want, need, required could be classified as strongly similar but "want" and "need" have different meaning.

There doesn't seem to have an easy solution for this so maybe that's not really useful, there is a python lib that finds very similar words using wordnet : https://pypi.org/project/spacy-wordnet/

If needed, it should be possible to scrap an online dictionary to extract synonyms for words but we may have weird results. Cambridge dictionary has fuel injection as a synonym for car.

## Phonetic hashing

* combines the same phonemes (smallest unit of sound) into one bucket and gives them the same hash code for all the variations => colour and color have the same code
* technique used to canocalize words that have different variants but same phonetic characteristics, that is, the same pronunciation
* Soundexes are algorithm that can be used to calculate the hash code of a given word
    * algo differ from language to language

In [12]:
print(phonetics.soundex("colour"))
print(phonetics.soundex("color"))

c0406
c0406


## Spelling corrector

Nothing really amazing, need to find better solutions.

In [13]:
text = "My text hase eror, I writee lik dat. How r u? Ur an idiot."

In [14]:
regex = re.compile(r'[,?\.]')

In [15]:
tokens = [regex.sub('', word) for word in text.split()]

In [16]:
spell_checker = SpellChecker()
misspelled_words = spell_checker.unknown(tokens)
print(misspelled_words)

{'writee', 'hase', 'eror'}


In [17]:
for misspelled_word in misspelled_words:
    print(f'{misspelled_word} -> {spell_checker.correction(misspelled_word)} -> {spell_checker.candidates(misspelled_word)}')

writee -> write -> {'writhe', 'write', 'writes', 'writer'}
hase -> have -> {'haser', 'haase', 'hate', 'hash', 'base', 'hage', 'ease', 'hast', 'haue', 'hose', 'have', 'hause', 'haze', 'hse', 'phase', 'case', 'haste', 'has', 'hale', 'hasp', 'hake', 'hare', 'vase', 'hae', 'ase', 'hanse', 'chase'}
eror -> error -> {'eros', 'ebor', 'error', 'emor', 'err', 'egor'}


In [18]:
spell_corrector = autocorrect.Speller(lang="en")
for word in tokens:
    corrected_word = spell_corrector(word)
    if word != corrected_word:
        print(f"{word} -> {corrected_word}")

hase -> have
eror -> error
lik -> like


## Pattern matching

In [19]:
nlp = spacy.load('en')
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)

In [20]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 
matches = matcher(text_doc)

In [21]:
for match in matches:
    match_id, start, end = match
    print(nlp.vocab.strings[match_id], text_doc[start:end])
    

TerminologyList iPhone 11
TerminologyList Galaxy Note
TerminologyList iPhone XS
TerminologyList Google Pixel


## Word embeddings

AKA word vectors represent each word by a vector that keeps the info on how the word is used and what it means. Words that appear in similar contexts will have similar vectors.

In [22]:
spam = pd.read_csv('data/spam.csv')

with nlp.disable_pipes():
    doc_vectors = np.array([nlp(text).vector for text in spam.text])
    
doc_vectors.shape

(5572, 96)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, spam.label,
                                                    test_size=0.1, random_state=1)

In [24]:
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)
print(f"Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

Accuracy: 94.444%


## Document similarity

* documents with similar content generally have similar vectors
* we can find similar documents by measuring the similarity between the vectors.

A common metric for this is the cosine simillarity which measures the angle between two vectors, a and b.

$cos(\theta) = \frac{a * b}{\vert\vert a \vert\vert * \vert\vert b \vert\vert}$

=> dot product of a and b, divided by the magnitude of each vector. Cosine similarity varies between -1 and 1 corresponding
complete opposite to perfect similarity respectively.

In [25]:
nlp = spacy.load('en_core_web_lg')

In [26]:
a = nlp("REPLY NOW FOR FREE TEA")
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.")
a.similarity(b)

0.7030030981818236

In [27]:
text = "According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water."
a = nlp(text)
b = nlp(text)
a.similarity(b)

1.0

In [28]:
a = nlp("Alice in wonderland")
b = nlp("Python is a nice programming language.")
a.similarity(b)

0.3382732593002094

## spaCy pipelines

similar to sklearn pipelines

https://spacy.io/usage/processing-pipelines


In [29]:
spam = pd.read_csv('data/spam.csv')
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
nlp = spacy.blank("en")

textcat = nlp.create_pipe("textcat", config={
                "exclusive_classes": True,
                "architecture": "bow"})

nlp.add_pipe(textcat)
textcat.add_label("ham")
textcat.add_label("spam")
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]

In [31]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [32]:
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
epoch_count = 1
for epoch in range(20):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses["textcat"] / epoch_count)
    epoch_count += 1

1.3387270104194613
0.8369239149687928
0.6208181146248227
0.4958138850246252
0.4128723095680268
0.35273815350844373
0.3075636856317985
0.27217805334311457
0.24401554055971772
0.2210127389680426
0.2019711623467173
0.18582977406817336
0.1719950088306258
0.16005479902147526
0.14967598166822854
0.14055339618562634
0.13247412087174254
0.12527049393425266
0.11880373060797
0.1133074344318854


In [33]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9999750e-01 2.5330050e-06]
 [1.5455886e-03 9.9845445e-01]]


In [34]:
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


In [35]:
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
    # Do something with the doc here
    print([(ent.text, ent.label_) for ent in doc.ents])

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]
[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]
