<a href="https://colab.research.google.com/github/jonkrohn/DLTFpT/blob/master/notebooks/natural_language_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Preprocessing

In this notebook, we clean up a dataset of natural language data (books from Project Gutenberg) and use word2vec to embed the language in word vectors.

**N.B.:** Some, all or none of these preprocessing steps may be helpful to a given downstream application. 

#### Load dependencies

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
# NLTK data packages needed by the cells below
nltk.download('gutenberg')  # the Project Gutenberg corpus
nltk.download('punkt')      # sentence tokenizer models (older NLTK releases)
nltk.download('punkt_tab')  # sentence tokenizer tables looked up by NLTK >= 3.9 in place of 'punkt'
nltk.download('stopwords')  # stop word lists

import string

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

import spacy # for a lemmatization example

from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure

#### Load data

In [None]:
from nltk.corpus import gutenberg

In [None]:
# `sents()` is a convenience method: it handles newlines and tokenizes the corpus
# into sentences, and each sentence into words, in a single call
gberg_sents = gutenberg.sents()

In [None]:
# peek at the first six tokenized sentences
gberg_sents[:6]

In [None]:
# a single token: position 14 within the sentence at index 4
gberg_sents[4][14]

#### Iteratively preprocess a sentence

##### a tokenized sentence: 

In [None]:
# the example sentence (index 4) that the following cells preprocess step by step
gberg_sents[4]

##### to lowercase: 

In [None]:
# lowercase every token of the example sentence
list(map(str.lower, gberg_sents[4]))

##### remove stopwords and punctuation: 

In [None]:
# one combined list: English stop words followed by each punctuation character
stpwrds = [*stopwords.words('english'), *string.punctuation]

In [None]:
# inspect the combined stop word and punctuation list
stpwrds

In [None]:
# lowercase each token once, then keep only those absent from the stop word / punctuation list
[token for token in map(str.lower, gberg_sents[4]) if token not in stpwrds]

##### stem words: 

In [None]:
# Porter stemmer instance, applied to each token in the next cell
stemmer = PorterStemmer()

In [None]:
# lowercase, drop stop words and punctuation, then stem whatever remains
[stemmer.stem(token) for token in map(str.lower, gberg_sents[4]) if token not in stpwrds]

##### a lemmatization example: 

In [None]:
# load the spaCy pipeline named 'en_core_web_sm'
# NOTE(review): presumably this model package must be installed separately — confirm for the target environment
nlp = spacy.load('en_core_web_sm')

In [None]:
# characters 291-476 of the raw corpus text: the same excerpt that is passed to spaCy in the next cell
gutenberg.raw()[291:477]

In [None]:
# run the spaCy pipeline over the raw excerpt shown above
spacy_doc = nlp(gutenberg.raw()[291:477])

In [None]:
# the lemma spaCy assigned to each token of the excerpt
[token.lemma_ for token in spacy_doc]

##### handle bigram collocations:

In [None]:
# train the collocation detector on the tokenized (not yet lowercased) corpus sentences
phrases = Phrases(gberg_sents) # train detector

In [None]:
# wrap the trained detector in a Phraser, which is the object used to transform sentences below
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences

In [None]:
# inspect which bigrams were detected, together with their scores
bigram.phrasegrams # output score of each bigram

In [None]:
# a hand-made test sentence, split on whitespace into a list of tokens
tokenized_sentence = "Jon lives in New York City".split()

In [None]:
# the test sentence before bigram detection is applied
tokenized_sentence

In [None]:
# apply the bigram Phraser to the test sentence
bigram[tokenized_sentence]

#### Preprocess the corpus

In [None]:
# as in Maas et al. (2011):
# - leave in stop words ("indicative of sentiment")
# - no stemming ("model learns similar representations of words of the same stem when data suggests it")
# so the only cleaning applied to the corpus is lowercasing and dropping punctuation tokens
punctuation = set(string.punctuation)  # built once, instead of a fresh list for every token
lower_sents = [
    # lowercase each token a single time, then filter out bare punctuation tokens
    [token for token in map(str.lower, sentence) if token not in punctuation]
    for sentence in gberg_sents
]

In [None]:
# the first five lowercased, punctuation-free sentences
lower_sents[:5]

In [None]:
# train a bigram detector on the lowercased sentences and wrap it in a Phraser in one step
lower_bigram = Phraser(Phrases(lower_sents))

In [None]:
# inspect the detected bigrams; they include e.g. miss taylor, mr woodhouse, mr weston
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston

In [None]:
# apply the lowercase bigram Phraser to a lowercased, whitespace-split test sentence
lower_bigram["jon lives in new york city".split()]

In [None]:
# retrain with explicit min_count and threshold values, replacing the previous `lower_bigram`
# NOTE(review): presumably both values are stricter than the library defaults used above — confirm
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
# inspect the bigrams detected under these settings
lower_bigram.phrasegrams

In [None]:
# run every lowercased sentence through the bigram Phraser
clean_sents = [lower_bigram[sentence] for sentence in lower_sents]

In [None]:
# the first nine fully preprocessed sentences
clean_sents[:9]

In [None]:
# a single preprocessed sentence (index 6)
clean_sents[6]

#### Run word2vec

In [None]:
# Training is left commented out: the "Explore model" section loads the saved file
# 'clean_gutenberg_model.w2v' instead. Uncomment to retrain and overwrite that file.
# max_vocab_size can be used instead of min_count (which has increased here)
# NOTE(review): `size` and `iter` look like gensim 3.x keyword names; gensim 4 is understood to use
# `vector_size` and `epochs` instead — confirm against the installed version before uncommenting.
# model = Word2Vec(sentences=clean_sents, size=64, 
#                  sg=1, window=10, iter=5,
#                  min_count=10, workers=4)
# model.save('clean_gutenberg_model.w2v')

#### Explore model

In [None]:
# load the previously saved model from disk so that re-training can be skipped
model = Word2Vec.load('clean_gutenberg_model.w2v')

In [None]:
# vocabulary size, read as the number of rows in the embedding matrix:
# `model.wv.vocab` was removed in gensim 4, whereas `model.wv.vectors` exists in both 3.x and 4.x
len(model.wv.vectors) # would be 17k if we carried out no preprocessing

In [None]:
# the learned vector for the token 'dog'
model.wv['dog']

In [None]:
# number of components in the 'dog' vector
# (expected to match the `size=64` in the commented-out training call, assuming the saved model used it)
len(model.wv['dog'])

In [None]:
# the three tokens the model ranks as most similar to 'dog'
model.wv.most_similar('dog', topn=3)

In [None]:
# the three tokens the model ranks as most similar to 'eat'
model.wv.most_similar('eat', topn=3)

In [None]:
# the three tokens the model ranks as most similar to 'day'
model.wv.most_similar('day', topn=3)

In [None]:
# the three tokens the model ranks as most similar to 'father'
model.wv.most_similar('father', topn=3)

In [None]:
# 'ma_am' is an underscore-joined token, presumably a bigram merged by the Phraser step (verify)
model.wv.most_similar('ma_am', topn=3)

In [None]:
# ask the model which of the five words is the odd one out
model.wv.doesnt_match("mother father sister brother dog".split())

In [None]:
# similarity score between the vectors for 'father' and 'dog'
model.wv.similarity('father', 'dog')

In [None]:
# word-vector arithmetic: father - man + woman
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

In [None]:
# word-vector arithmetic: husband - man + woman
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

#### Reduce word vector dimensionality with t-SNE

In [None]:
# The t-SNE cells in this section are left commented out: the "Visualise" section reads the
# coordinates from 'clean_gutenberg_tsne.csv' instead. Uncomment them to recompute.
# NOTE(review): newer scikit-learn releases are understood to rename `n_iter` to `max_iter` — confirm.
# tsne = TSNE(n_components=2, n_iter=1000)

In [None]:
# project every word vector in the vocabulary down to two dimensions
# NOTE(review): `model.wv.vocab` is a gensim 3.x attribute; under gensim 4 this lookup needs replacing — confirm.
# X_2d = tsne.fit_transform(model.wv[model.wv.vocab])

In [None]:
# put the 2-D coordinates in a DataFrame and attach each row's token
# NOTE(review): relies on `model.wv.vocab` iterating in the same order as in the fit_transform call above,
# and on the gensim 3.x `vocab` attribute — confirm before uncommenting.
# coords_df = pd.DataFrame(X_2d, columns=['x','y'])
# coords_df['token'] = model.wv.vocab.keys()

In [None]:
# preview the first rows of the coordinate table
# coords_df.head()

In [None]:
# write the coordinates to the CSV file that the "Visualise" section reads back in
# coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)

#### Visualise 

In [None]:
# load the saved t-SNE coordinates; the commented-out cells above write this file with columns x, y and token
coords_df = pd.read_csv('clean_gutenberg_tsne.csv')

In [None]:
# direct Bokeh output to the notebook rather than to a file
output_notebook()

In [None]:
# plot a random subset of tokens to keep the figure legible;
# the sample size is capped at the number of rows (sampling without replacement raises otherwise)
# and the seed is fixed so the same subset is drawn on every run
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3.0 removed
p = figure(width=800, height=800)
# draw each sampled token as a text glyph at its 2-D coordinates; the returned renderer is discarded
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [None]:
# display the figure built in the previous cell
show(p)