# NLP with NLTK and Word2Vec

Here we work with the preprocessed movies dataset to figure out how to map each movie's overview (its plot description) into an embedding space of interest.

In [1]:
import gensim
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### NLTK packages

In [53]:
# Fetch the 'punkt' tokenizer data; nltk.download reports when a package is
# already up to date, so re-running this cell is harmless.
nltk.download('punkt')
# Download the stop-word lists using nltk. As the last expression in the
# cell, this call's return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/Greg/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/Greg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Import NLP dataset

In [2]:
# Load the preprocessed movie table, keep only rows that have a title, an id
# and an overview, drop the redundant original_title column and index the
# frame by movie title.
df = (
    pd.read_csv("df_nlp.csv", header=0, sep=',', encoding='latin-1')
      .dropna(subset=['title', 'id', 'overview'])
      .drop('original_title', axis=1)
      .set_index('title')
)

### Set to lowercase

In [85]:
# Lower-case every overview so that differently-cased spellings of a word
# end up as the same token. This overwrites the 'overview' column in place.
df['overview'] = df['overview'].str.lower()

### Remove trailing full stops

In [87]:
# Drop exactly one trailing '.' from overviews that end with one.
# Only a single character is removed, so re-running this cell would strip a
# further period from any overview that originally ended in several.
ends_with_stop = df['overview'].str.endswith('.')
df.loc[ends_with_stop, 'overview'] = df.loc[ends_with_stop, 'overview'].str[:-1]

### Tokenize sentences

In [103]:
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of word characters (\w+); punctuation and
# whitespace act as separators and are discarded.
tokenizer = RegexpTokenizer(pattern=r'\w+')

['hey', 'how', 'are', 'you', 'buddy']


In [108]:
# Tokenise each overview and glue the tokens back together with single
# spaces, giving a punctuation-free string per movie.
df['overview_tokens'] = df['overview'].apply(
    lambda text: ' '.join(tokenizer.tokenize(text))
)

### Expand tokens into one column per word position

In [113]:
# Spread the space-separated tokens over columns: column k holds the k-th
# token of each overview, so the frame is as wide as the longest overview.
overview_exp = df['overview_tokens'].str.split(pat=" ", expand=True)

### Using the NLTK corpus of English *stop words* to filter

In [91]:
from nltk.corpus import stopwords

In [110]:
# English stop-word list from the NLTK corpus; it is joined into a single
# regex alternation in the next cell.
stop = stopwords.words('english')

In [111]:
# Anchored regex that matches a cell only when the WHOLE cell is a stop word
# or a single whitespace character.
# Raw strings keep `\s` as a regex escape without relying on Python passing
# an invalid string escape through unchanged. The earlier
# `.replace("'", "\'")` was dropped because "\'" == "'" made it a no-op.
# The resulting pattern is byte-for-byte identical to the previous one.
filter_w = r"^(?:" + "|".join(stop) + r"|\s)$"

In [112]:
# Display the assembled stop-word pattern for a visual sanity check.
filter_w

"^(?:i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|nor|not|only|own|same|so|than|too|very|s|t|can|will|just|don|don't|should|should've|now|d|ll|m|o|re|ve|y|ain|aren|aren't|couldn|couldn't|didn|didn't|doesn|doesn't|hadn|hadn't|hasn|hasn't|haven|haven't|isn|isn't|ma|mightn|mightn't|mustn|mustn't|needn|needn't|shan|shan't|shouldn|shouldn't|wasn|wasn't|weren|weren't|won|won't|wouldn|wouldn't|\\s)$"

## Filter out stop words

In [114]:
# Blank out every cell that is a stop word (full-cell regex match) and every
# empty-string cell, leaving NaN in their place.
repl = (
    overview_exp
    .replace(to_replace=filter_w, value=np.nan, regex=True)
    .replace(to_replace='', value=np.nan)
)

In [115]:
# Collapse each row back into a list of its surviving tokens, in order.
# Only real strings are kept, so NaN/None padding and filtered-out stop words
# disappear. A row with no surviving tokens becomes an empty list rather than
# [''] (which the previous ';'-join/split round-trip produced and which would
# put an empty-string "word" in the Word2Vec vocabulary when min_count=1).
# The result is a Series of lists aligned on repl's index.
list_lists = pd.Series(
    [[tok for tok in row if isinstance(tok, str)] for row in repl.values],
    index=repl.index,
)

In [121]:
# Report (rows, columns) of the filtered token table, then preview the first
# five rows.
print(repl.shape)
repl.head(n=5)

(44366, 328)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,318,319,320,321,322,323,324,325,326,327
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,led,,woody,andy,,toys,live,happily,,,...,,,,,,,,,,
Jumanji,,siblings,judy,,peter,discover,,enchanted,board,game,...,,,,,,,,,,
Grumpier Old Men,,family,wedding,reignites,,ancient,feud,,next,door,...,,,,,,,,,,
Waiting to Exhale,cheated,,mistreated,,stepped,,,women,,holding,...,,,,,,,,,,
Father of the Bride Part II,,,george,banks,,recovered,,,daughter,,...,,,,,,,,,,


In [118]:
# Plain Python list of token lists (one inner list per movie), which is the
# input handed to Word2Vec below.
word2vec_ready = list(list_lists)

## Deploying Word2Vec

In [140]:
# Train 50-dimensional word embeddings on the filtered token lists.
# NOTE(review): the variable name suggests CBOW, but per the gensim docs
# sg=1 should select skip-gram — confirm which architecture is intended.
model_cwob = gensim.models.Word2Vec(
    sentences=word2vec_ready,
    size=50,
    window=3,
    min_count=1,
    sg=1,
    workers=3,
)

In [145]:
# Stack the learned vector of every vocabulary word into one 2-D array.
# Indexing goes through `.wv` (the KeyedVectors) instead of the model itself:
# `Word2Vec.__getitem__` is deprecated and emits a warning on every call.
X = model_cwob.wv[model_cwob.wv.vocab]

  """Entry point for launching an IPython kernel.


### t-distributed stochastic neighbor embedding (t-SNE)

In [149]:
from sklearn.manifold import TSNE

In [None]:
# Project the word vectors down to 2 dimensions for visual inspection.
# t-SNE is stochastic, so random_state is pinned to make the layout
# reproducible across runs.
# `.fit` returns the fitted estimator; the 2-D coordinates are available as
# `ts.embedding_`.
ts = TSNE(n_components=2, random_state=42).fit(X)

In [143]:
# Ten nearest neighbours of 'blossom' by cosine similarity.
# Queried through `.wv` because `Word2Vec.most_similar` is deprecated; the
# explicit topn=10 replaces the redundant `[:10]` slice.
model_cwob.wv.most_similar('blossom', topn=10)

  """Entry point for launching an IPython kernel.


[('heartache', 0.9750373363494873),
 ('blossoming', 0.9713245630264282),
 ('blossoms', 0.9690766334533691),
 ('tentative', 0.9631861448287964),
 ('jeopardized', 0.9607797861099243),
 ('flourish', 0.9602369666099548),
 ('blooms', 0.9594870805740356),
 ('amber', 0.9592205286026001),
 ('zest', 0.959153413772583),
 ('deepens', 0.9579893350601196)]