In [25]:
import pandas as pd
import re
import spacy
from time import time
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [26]:
# NLTK WordNet lemmatizer instance.
# NOTE(review): no cell visible in this notebook uses it — confirm it is still needed.
wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
# Location of the Simpsons dialogue CSV (an absolute path on the author's machine;
# adjust it when running elsewhere).
DATA_PATH = "/home/mr-francis/shakespear/simpsons_dataset.csv"

# Load the script lines and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [28]:
# Count the missing values in each column.
df.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [29]:
# Drop every row with a missing value in any column, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [30]:
# def cleaning(doc):
#     # Lemmatizes and removes stopwords
#     # doc needs to be a spacy Doc object
#     txt = [token.lemma_ for token in doc if not token.is_stop]
#     # Word2Vec uses context words to learn the vector representation of a target word,
#     # if a sentence is only one or two words long,
#     # the benefit for the training is very small
#     if len(txt) > 2:
#         return ' '.join(txt)

In [31]:
# Light normalisation of every spoken line: each run of characters that is not an
# ASCII letter or apostrophe becomes a single space, then the text is lowercased.
# Built as a list rather than a generator expression: a generator is exhausted by
# its first consumer, so re-running the cell that builds the DataFrame from it
# would silently produce an empty frame.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']]

In [32]:
# Put the cleaned lines in a one-column frame, then discard missing and repeated rows.
df_clean = (
    pd.DataFrame({'clean': brief_cleaning})
    .dropna()
    .drop_duplicates()
)
# (rows, columns) remaining after de-duplication.
df_clean.shape

(119516, 1)

Bigrams
We use the Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
The bigram model lets us capture multi-word names such as "mr_burns" or "bart_simpson" as single tokens.

In [33]:
from gensim.models.phrases import Phrases, Phraser

In [34]:
# Whitespace-tokenize every cleaned line into a list of words.
sent = list(map(str.split, df_clean['clean']))

Creates the relevant phrases from the list of sentences:

In [35]:
# Collect phrase (bigram) statistics over the tokenized lines.
# min_count=30: per the gensim docs, words/bigrams with a lower total count are ignored.
# progress_per=10000: presumably the logging interval in sentences — confirm against the gensim docs.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [36]:
# Wrap the Phrases model in a Phraser, which is then used to apply the detected bigrams.
bigram = Phraser(phrases)

Applying the bigram model to the tokenized sentences, so that detected phrases are merged into single tokens

In [37]:
# Transform the whole corpus with the bigram model.
# NOTE(review): `sentences` is iterated by several later cells (frequency count,
# build_vocab, train) — presumably gensim returns a re-iterable object here; confirm.
sentences = bigram[sent]

In [38]:
# Count how often each token (word or detected bigram) occurs in the corpus.
word_freq = defaultdict(int)
# The loop variable is deliberately not named `sent`: that name already holds the
# full list of tokenized lines, and rebinding it here would leave it pointing at
# the last sentence, breaking any re-run of the cells that read `sent`.
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens seen.
len(word_freq)

40991

Getting the ten most frequent words in the corpus

In [39]:
# The ten tokens with the highest counts, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['the', 'you', 'i', 'a', 'to', 'and', 'of', 'it', 'my', 'that']

Training the model with Gensim's Word2Vec implementation

In [40]:
import multiprocessing
from gensim.models import Word2Vec

In [41]:
# Number of CPU cores reported for this machine; used below to size the pool of training workers.
cores = multiprocessing.cpu_count()

In [42]:
# Configure the Word2Vec model. No corpus is passed here: the vocabulary is built
# and the model is trained in the following cells.
# The bracketed pairs in the comments are the value ranges suggested by the original author.
w2v_model = Word2Vec(min_count=20,  # Ignores all words with total absolute frequency lower than this - (2, 100)
                     window=6,  # Context window size around the target word (see gensim docs)
                     size=300,  # Dimensionality of the feature vectors. - (50, 300)
                     sample=6e-5,  # The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
                     alpha=0.03,  # The initial learning rate - (0.01, 0.05)
                     min_alpha=0.0007,  # Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
                     negative=20,  # If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
                     # Leave one core free for the rest of the system, but never ask
                     # for zero worker threads on a single-core machine.
                     workers=max(1, cores - 1))

In [43]:
# Start the wall-clock timer for the vocabulary pass.
t = time()

# Scan the corpus once to build the model's vocabulary.
w2v_model.build_vocab(sentences, progress_per=10000)

# This cell only builds the vocabulary, so the message says so; the training
# cell reports the training time separately.
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.26 mins


In [44]:
# Start the wall-clock timer for the training run.
t = time()

# Train for 30 epochs; total_examples is the sentence count stored on the
# model (corpus_count).
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

# Elapsed time in minutes, rounded to two decimals.
elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 8.25 mins


In [45]:
# Precompute normalised word vectors for the similarity queries below.
# NOTE(review): per the gensim docs, replace=True keeps only the normalised vectors
# to save memory, after which the model cannot be trained further — confirm that
# no additional training is intended.
w2v_model.init_sims(replace=True)

In [46]:
# Words whose vectors are closest to "homer", returned as (word, similarity) pairs.
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.5390207767486572),
 ('homie', 0.4682053327560425),
 ('dad', 0.4646422863006592),
 ('becky', 0.41750073432922363),
 ('bart', 0.40315163135528564),
 ('gee', 0.3850695788860321),
 ('mom', 0.37586405873298645),
 ('abe', 0.37170642614364624),
 ('you', 0.361121267080307),
 ('husband', 0.35919415950775146)]

Checking the similarity between two words

In [47]:
# Similarity score between the vectors for "mom" and "dad".
w2v_model.wv.similarity("mom", "dad")

0.7489579746263939

Finding the word that does not belong with the others in a list of words

In [48]:
# Ask which word fits least with the others in the list.
# Queried through `.wv`, like the other similarity lookups in this notebook;
# calling doesnt_match on the model object itself is deprecated in gensim and
# emits a DeprecationWarning.
w2v_model.wv.doesnt_match(['good', 'failure', 'cute'])

  w2v_model.doesnt_match(['good', 'failure', 'cute'])
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'failure'