In [1]:
import os

# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print the paths one per line.
file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in file_paths:
    print(path)

/kaggle/input/dialogue-lines-of-the-simpsons/simpsons_dataset.csv


In [2]:
import numpy as np
import pandas as pd
import re                 # regex text preprocessing
import time
from collections import Counter, defaultdict, OrderedDict # for word frequency

import spacy # open source tool for NLP

import multiprocessing
print('Number of cores in this machine: ', multiprocessing.cpu_count())

# Set up logging at INFO level so that gensim can be monitored from the
# notebook output; each record is printed as "LEVEL - HH:MM:SS: message".
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

Number of cores in this machine:  4


## 1. Getting the dataset and preprocessing

In [3]:
# Read the Simpsons dialogue CSV into a DataFrame, then show its
# (rows, columns) shape and a preview of the first rows.
df = pd.read_csv('/kaggle/input/dialogue-lines-of-the-simpsons/simpsons_dataset.csv')
print(df.shape)
display(df.head())

(158314, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
# Number of missing values in each column.
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [5]:
# Drop the rows that contain a null value and renumber the index from 0,
# then re-count the nulls per column to confirm that none are left.
# NOTE: `df` is rebound here, so from this cell on it no longer holds the raw data.

df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

## 2. Text preprocessing

### 2.1 Cleaning text
Remove stop words, non-alphabetic characters and lemmatize each line of dialogue

Implement [spaCy pipelines](https://spacy.io/usage/processing-pipelines) for faster processing
* The pipeline used by the default models consists of a tagger, a parser and an entity recognizer
* pipeline components can be disabled for faster processing

Using `spaCy` model to first clean the text

In [6]:
# Load spaCy with the 'ner' (named entity recognition) and 'parser' components
# disabled, for speed; the cleaning step only reads lemmas and stop-word flags.
# NOTE(review): 'en' is a shortcut model name — newer spaCy releases presumably
# require the full package name (e.g. 'en_core_web_sm'); confirm against the
# installed spaCy version.
nlp = spacy.load('en', disable =['ner', 'parser'])

def cleaning(doc):
    """Lemmatize one parsed line of dialogue and strip its stop words.

    Args:
        doc: an iterable of spaCy tokens (e.g. a ``Doc`` yielded by
            ``nlp.pipe``); only ``token.lemma_`` and ``token.is_stop`` are read.

    Returns:
        The kept lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain after stop-word removal.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its context words, so a line that is
    # only one or two words long adds very little to training and is discarded.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

Note: A generator can only be consumed once: after it has been converted to a list it will not return any further items, which is why I have commented out the line:
More details [here](https://stackoverflow.com/questions/24130745/convert-generator-object-to-list-for-debugging)
> list(brief_cleaning)[:5]

In [7]:
# Remove non-alphabetic characters: every run of characters other than
# letters and apostrophes is replaced by a single space, and the line is
# lower-cased.
# The result is kept as a generator expression, so rows are cleaned lazily as
# the consumer iterates; a generator can only be iterated once.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])
# Left commented out on purpose: listing the generator here would exhaust it
# before the spaCy pipeline gets to read it.
#list(brief_cleaning)[:5]

In [8]:
# Use nlp.pipe() to speed up the cleaning process: it streams the pre-cleaned
# lines through spaCy in batches of 5000 and yields one spaCy object per line,
# to which cleaning() is applied. `text` therefore holds one entry per line:
# the cleaned string, or None for lines cleaning() rejected as too short.
# NOTE(review): `n_threads` is presumably deprecated/ignored in newer spaCy
# releases — confirm against the installed version before relying on it.

start = time.time()
text  = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size= 5000, n_threads =-1)]
print('time taken to clean text is {} min'.format((time.time() - start) / 60))

time taken to clean text is 1.27745174964269 min


Put the result into a dataframe to remove 'None' and duplicates

In [9]:
# Wrap the cleaned lines in a single-column frame, naming the column up front.
cleaned_lines = pd.DataFrame(text, columns=['cleaned_text'])

# Discard the rejected lines (None) first, then any repeated lines.
without_missing = cleaned_lines.dropna()
df_clean = without_missing.drop_duplicates()

print('Shape after cleaning', df_clean.shape)
df_clean.head(5)

Shape after cleaning (85964, 1)


Unnamed: 0,cleaned_text
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


### 2.2 Bigrams
Use the `Gensim Phrases` package to automatically detect common phrases (bigrams) from the list of dialogues. https://radimrehurek.com/gensim/models/phrases.html
The main reason we are doing this here is to catch phrases such as "mr_burns" and "bart_simpson".

In [10]:
from gensim.models.phrases import Phrases, Phraser

The training corpus must be a sequence (stream, generator) of sentences,
with each sentence a list of tokens

In [11]:
%%time
# Tokenize each cleaned line on whitespace: the corpus becomes a list of
# sentences, each sentence being a list of tokens.
sent_list = [row.split() for row in df_clean['cleaned_text']]

# Create phrases using Gensim Phrases (min_count=30, progress_per=10000).
# NOTE(review): presumably min_count is the minimum count a word/bigram needs
# to be considered and progress_per is the logging interval in sentences —
# confirm against the gensim Phrases documentation.
phrases = Phrases(sent_list, min_count=30, progress_per=10000)

CPU times: user 1.34 s, sys: 32.8 ms, total: 1.37 s
Wall time: 1.37 s


The goal of `Phraser()` is to cut down memory consumption of `Phrases()`, by discarding model state not strictly needed for the bigram detection task:

In [12]:
# Wrap the trained Phrases model in a Phraser, the lighter object used below
# to apply the detected bigrams to the corpus.
bigram = Phraser(phrases)

Transform the corpus based on the bigrams detected:

In [13]:
# Apply the bigram model to the whole tokenized corpus.
# NOTE(review): `sentences` is iterated several times further down (bigram
# listing, word counts, build_vocab, train), so this presumably returns a
# re-iterable wrapper rather than a one-shot generator — confirm.
sentences = bigram[sent_list]

In [14]:
# The earlier cleaning removed every character except letters and apostrophes,
# so any token containing an underscore must have been joined by the phraser.
# Gather all such tokens (with repeats), then print the distinct ones.
bigrams_created = [
    token
    for sentence in sentences
    for token in sentence
    if '_' in token
]
print('bigrams created using Gensim Phrases: \n\n', set(bigrams_created))

bigrams created using Gensim Phrases: 

 {'nyah_nyah', 'lady_gentleman', 'ha_ha', 'power_plant', 'credit_card', "talkin_'", 'high_school', "gettin_'", 'mom_dad', 'try_kill', 'doo_doo', 'ding_ding', 'year_ago', 'springfield_elementary', 't_shirt', 'tap_tap', 'santa_little', 'na_na', 'principal_skinner', 'montgomery_burn', 'capital_city', 'little_bit', 'young_lady', 'disco_stu', 'chief_wiggum', 'good_luck', 'tell_truth', 'homer_simpson', 'ice_cream', 'kwik_e', 'important_thing', 'da_da', 'new_york', 'mr_burns', 'big_deal', 'old_fashioned', 'world_war', 'hot_dog', 'sideshow_bob', 'oh_god', 'whoa_whoa', 'bart_simpson', 'mmm_hmm', 'dear_lord', 'fall_asleep', 'smell_like', 'old_man', 'wait_minute', 'sound_like', 'fat_tony', 'mrs_krabappel', 'la_la', 'yi_yi', 'u_s', 'uh_huh', 'got_to', 'mr_simpson', 'bad_news', "'_n", 'good_morning', 'cell_phone', 'ho_ho', 'patty_selma', "'_til", 'lisa_simpson', 'pork_chop', 'get_to', 'bye_bye', 'miss_hoover', 'wait_wait', 'van_houten', 'ow_ow', 'good_friend'

### 2.3 Most frequent words
Sanity check for effectiveness of:
- lemmatization
- removal of stop words
- addition of bigrams

In [15]:
# Count how often each token (single word or detected bigram) occurs in the
# corpus. Counter — already imported for word frequencies — replaces the
# hand-rolled defaultdict(int) loop; it is a dict subclass, so .items(),
# .get() and len() behave as before for the cells that follow.
word_freq = Counter()
for sent in sentences:
    # update() adds one count per token in the sentence
    word_freq.update(sent)

print('Number of unique words in the corpus', len(word_freq))

Number of unique words in the corpus 30178


Sort a Python dictionary. Refer to this [link](https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value)
There are multiple ways:
- for python 3: sorted(x.items(), key=lambda kv: kv[1]) - to sort by value
- for python 3: sorted(x.items(), key=lambda kv: kv[0]) - to sort by key

In [16]:
# Rank the (word, count) pairs by count, highest first, and keep the top ten;
# both printouts below are derived from this single ranking.
top_ten = sorted(word_freq.items(), key = lambda kv: kv[1], reverse=True)[:10]
print(top_ten)

top_ten_words = [word for word, _ in top_ten]
print('\nthe most common words are, taking 10 for instance\n', top_ten_words)

[('oh', 6453), ('like', 5599), ('know', 4819), ('get', 4197), ('hey', 3620), ('think', 3594), ('right', 3406), ('look', 3375), ('want', 3181), ('come', 3161)]

the most common words are, taking 10 for instance
 ['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']


## 3. Training the model

### Gensim Word2Vec implementation
Implementing Gensim word2vec as mentioned [here](https://radimrehurek.com/gensim/models/word2vec.html)

In [17]:
from gensim.models import Word2Vec

### Training of the model will be done in 3 steps:

1. `Word2Vec`: 
    > setting up the parameters one by one without supplying the parameter `sentences` and therefore leave the model uninitialized, purposefully.
2. `.build_vocab()`: 
    > builds vocabulary from the sentence sequence and thus initialize the model. With the loggings, I can follow the progress and even more important, the effect of min_count and sample on the word corpus. I noticed that these two parameters, and in particular sample, have a great influence over the performance of a model. Displaying both allows for a more accurate and an easier management of their influence.
3. `.train()`: 
    > trains the model. The logs here are mainly useful for monitoring progress and for making sure that no threads are executed instantaneously.

In [18]:
# Number of CPU cores on this machine; passed to Word2Vec below as the number
# of worker threads used for training.
cores = multiprocessing.cpu_count()

### Word2Vec parameters

* `min_count`: ignore words with frequency less than this
* `window`: the maximum distance between the current word and the predicted word within a sentence
* `size`: dimensionality of the feature vectors
* `sample`: the threshold for configuring which higher-frequency words are randomly downsampled
* `alpha`: the initial learning rate
* `min_alpha`: the learning rate will linearly drop to `min_alpha` as training progresses
* `max_vocab_size`: limits the RAM used during vocabulary building; if there are more unique words than this, the infrequent ones are pruned
* `negative`: if > 0, negative sampling will be used; the integer specifies how many "noise words" should be drawn (usually between 5 and 20)
* `workers`: number of threads to train the model

In [19]:
# Create an instance of Word2Vec without initializing it with a sequence of
# sentences; the vocabulary is built and the model trained in separate steps.
# NOTE(review): `size` and `max_vocab_size` are not passed, so the library
# defaults apply; no seed is set either, so results presumably vary between
# runs — confirm if reproducibility matters.

w2v_model = Word2Vec(alpha = 0.03,  # initial learning rate
                     window = 2,  # max distance between current and predicted word
                     min_count = 20,  # ignore words with a frequency below this
                     sample = 6e-5,  # threshold for downsampling higher-frequency words
                     min_alpha = 0.0007,  # learning rate drops linearly to this value
                     negative = 20,  # number of "noise words" drawn for negative sampling
                     workers = cores  # number of threads used to train the model
                     )

### Build the vocabulary table

Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [20]:
# Build the vocabulary table from the bigram-transformed corpus and report the
# elapsed wall-clock time in minutes.
start = time.time()

w2v_model.build_vocab(sentences, progress_per = 10000)
print('time taken to build vocabulary is {} min'.format((time.time() - start) / 60))

time taken to build vocabulary is 0.0460463007291158 min


### Train the model

* total_examples = int - Count of sentences;
* epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [21]:
# Train the model for 30 epochs over the corpus and report the elapsed
# wall-clock time in minutes. total_examples is the count of sentences, read
# from the model's own corpus_count (presumably set by build_vocab — confirm).
start = time.time()

w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs= 30, report_delay=10)
print('time taken to train the model is {} min'.format((time.time() - start) / 60))

time taken to train the model is 1.2311998883883157 min


As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient.
This will replace the weights trained with their normalized values. This step is irreversible

In [22]:
# Replace the trained weights with their normalized values to save memory.
# This is irreversible, so the model must not be trained any further afterwards.
# NOTE(review): init_sims() is presumably deprecated in newer gensim releases —
# confirm against the installed version.
w2v_model.init_sims(replace=True)

## 4. Explore the model

### 4.1 Most similar to

In [23]:
# Words whose vectors are most similar to 'homer', with their similarity scores.
w2v_model.wv.most_similar('homer')

[('rude', 0.7671548128128052),
 ('bongo', 0.7588388919830322),
 ('wife', 0.7429685592651367),
 ('marge', 0.7415707111358643),
 ('snuggle', 0.734971284866333),
 ('worry', 0.7316248416900635),
 ('attract', 0.7215455770492554),
 ('sweetheart', 0.7214065790176392),
 ('adopt', 0.7213941812515259),
 ('dr_hibbert', 0.7210967540740967)]

In [24]:
# Same query for the detected bigram 'homer_simpson', to compare with 'homer'.
w2v_model.wv.most_similar('homer_simpson')

[('recent', 0.756310224533081),
 ('hutz', 0.7376375198364258),
 ('easily', 0.737457275390625),
 ('governor', 0.7340445518493652),
 ('congratulation', 0.7321300506591797),
 ('simon', 0.7197277545928955),
 ('waylon', 0.7154708504676819),
 ('robert', 0.7027060985565186),
 ('erotic', 0.6951093673706055),
 ('council', 0.692208468914032)]

In [25]:
# Words whose vectors are most similar to 'bart', with their similarity scores.
w2v_model.wv.most_similar('bart')

[('lisa', 0.828191876411438),
 ('homework', 0.7757643461227417),
 ('mom', 0.7673578262329102),
 ('strangle', 0.7603084444999695),
 ('grown', 0.7595598697662354),
 ('substitute', 0.7528258562088013),
 ('convince', 0.7512975931167603),
 ('ralphie', 0.7400736808776855),
 ('surprised', 0.7386853694915771),
 ('badly', 0.7379380464553833)]

### 4.2 Similarities between words

In [26]:
# Similarity score between the vectors of 'bart_simpson' and 'child'.
w2v_model.wv.similarity('bart_simpson', 'child')

0.57327795

In [27]:
# Similarity score between the vectors of 'maggie' and 'baby'.
w2v_model.wv.similarity('maggie', 'baby')

0.73507625

### 4.3 Finding the odd one out
Find the word that does not belong to the list

In [28]:
# Which of these four words does not belong with the others?
w2v_model.wv.doesnt_match(['homer', 'bart', 'maggie', 'hibbert'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'hibbert'

In [29]:
# Which of these three words does not belong with the others?
w2v_model.wv.doesnt_match(['bart', 'milhouse', 'nelson'])

'nelson'

### 4.4 Analogy difference

Which word is to woman as homer is to marge ?

In [35]:
# Analogy query: 'woman' and 'homer' contribute positively, 'marge' negatively;
# show the 3 best-matching words.
w2v_model.wv.most_similar(positive=['woman', 'homer'], negative=['marge'], topn=3)

[('admire', 0.6312659978866577),
 ('wife', 0.6125058531761169),
 ('wonder', 0.6094111204147339)]

In [36]:
# Which word is to woman as bart is to man?
# 'woman' and 'bart' contribute positively, 'man' negatively; show the 3 best matches.
w2v_model.wv.most_similar(positive=['woman', 'bart'], negative=['man'], topn=3)

[('pregnant', 0.717870831489563),
 ('parent', 0.7156802415847778),
 ('lisa', 0.6839100122451782)]