## word2vec implementation with Python (& Gensim)
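- Note: This code was originally written for Python 3.6.1 (+Gensim 2.3.0); the `Word2Vec` call below uses the Gensim 4.x keyword names (`vector_size`, `epochs`), so Gensim 4.0 or later is required to run it as-is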

In [None]:
# https://radimrehurek.com/gensim/models/word2vec.html

In [1]:
import os
import re
from multiprocessing import Pool

import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from scipy import spatial



In [2]:
# Fetch the Gutenberg corpus used below; nltk reports "already up-to-date"
# when the package is present locally. (`nltk` itself is imported in the
# notebook's import cell so that all dependencies are listed in one place.)
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Domonkos\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

### Import training dataset
- Import Shakespeare's Hamlet corpus from nltk library

In [3]:
# Read Hamlet as tokenized sentences and materialise the corpus view as a plain list.
hamlet_sents = gutenberg.sents('shakespeare-hamlet.txt')
sentences = list(hamlet_sents)

In [4]:
# Report the container type and the number of sentences in the corpus.
corpus_summary = (
    ('Type of corpus: ', type(sentences)),
    ('Length of corpus: ', len(sentences)),
)
for label, value in corpus_summary:
    print(label, value)

Type of corpus:  <class 'list'>
Length of corpus:  3106


In [5]:
# Peek at a few raw sentences; index 0 holds the title, author, and year.
for sample_idx in (0, 1, 10):
    print(sentences[sample_idx])

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']
['Actus', 'Primus', '.']
['Fran', '.']


### Preprocess data
- Use re module to preprocess data
- Convert all letters into lowercase
- Remove punctuation, numbers, etc.

In [6]:
# Normalise every sentence: keep only purely alphabetic tokens, lowercased.
#
# `fullmatch` requires the WHOLE token to consist of ASCII letters. The earlier
# `re.match('^[a-zA-Z]+', word)` only tested the start of the token, so a mixed
# token such as 'a1' would have been kept even though numbers are meant to be
# removed.
#
# Sentences that end up with no tokens stay in the list as empty lists, so the
# sentence indices used elsewhere in the notebook remain unchanged.
_ALPHA_TOKEN = re.compile(r'[a-zA-Z]+')
sentences = [
    [word.lower() for word in sentence if _ALPHA_TOKEN.fullmatch(word)]
    for sentence in sentences
]

In [7]:
# Same three sentences as before preprocessing (index 0 is the title line),
# printed one per line so the effect of the cleaning step is visible.
print(sentences[0], sentences[1], sentences[10], sep='\n')

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


### Create and train model
- Create a word2vec model and train it with Hamlet corpus
- Key parameter description (https://radimrehurek.com/gensim/models/word2vec.html)
    - **sentences**: training data (has to be a list with tokenized sentences)
    - **vector_size**: dimension of embedding space (called `size` in Gensim versions before 4.0)
    - **sg**: CBOW if 0, skip-gram if 1
    - **window**: number of words considered on each side of the target word (with a window size of 3, the 3 words to the left and the 3 words to the right are used as context)
    - **min_count**: minimum count of words to be included in the vocabulary
    - **epochs**: number of training iterations (epochs) over the corpus (called `iter` in Gensim versions before 4.0)
    - **workers**: number of worker threads to train

In [8]:
# Train a skip-gram (sg=1) word2vec model on the preprocessed corpus.
#
# workers: one training thread per CPU core. `os.cpu_count() or 1` is the same
# number `Pool()` picks by default, but obtaining it this way does not spawn a
# pool of worker processes that is never closed, and does not rely on the
# private `Pool._processes` attribute.
#
# NOTE(review): no `seed` is passed, so the learned vectors (and the similarity
# scores shown later) will differ from run to run.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    sg=1,
    window=3,
    min_count=1,
    workers=os.cpu_count() or 1,
    epochs=10,
)

In [9]:
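# NOTE(review): presumably left disabled because `init_sims` is deprecated in
# Gensim 4.x and no longer needed before similarity queries — confirm, then delete.
#model.init_sims(replace = True)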

### Save and load model
- word2vec model can be saved and loaded locally
- Saving the trained model avoids having to retrain it later

In [10]:
# Persist the trained model under a local file name so it can be reloaded later.
saved_model_path = 'word2vec_model'
model.save(saved_model_path)

### Similarity calculation
- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity
- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity

In [11]:
# Reload the saved model, then query its word vectors for the 15 entries
# most similar to 'hamlet'.
model = Word2Vec.load('word2vec_model')
word_vectors = model.wv
y = word_vectors.most_similar('hamlet', topn=15)

In [12]:
# Display the (word, similarity score) pairs returned by the 'hamlet' query.
y

[('horatio', 0.9919930696487427),
 ('queene', 0.9906439185142517),
 ('oh', 0.9893236756324768),
 ('laertes', 0.9882543087005615),
 ('ghost', 0.9880845546722412),
 ('ophelia', 0.9879318475723267),
 ('polonius', 0.9877427220344543),
 ('deere', 0.9874640703201294),
 ('heere', 0.9873738884925842),
 ('king', 0.9872870445251465),
 ('mother', 0.9863444566726685),
 ('marcellus', 0.9861475825309753),
 ('sweet', 0.9858973026275635),
 ('rosincrane', 0.985610842704773),
 ('indeed', 0.9855900406837463)]