## word2vec implementation with Python (& Gensim)
- Note: This code is written in Python 3.6.1 (+Gensim 2.3.0)

In [1]:
import nltk
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import re
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from multiprocessing import Pool
from scipy import spatial

### Import training dataset
- Import Shakespeare's Hamlet corpus from nltk library

In [8]:
sentences = list(gutenberg.sents('/content/sample_data/shakespeare.txt'))#shakespeare-hamlet.txt'))   # import the corpus and convert into a list

In [9]:
print('Type of corpus: ', type(sentences))
print('Length of corpus: ', len(sentences))

Type of corpus:  <class 'list'>
Length of corpus:  99772


In [10]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['From', 'fairest', 'creatures', 'we', 'desire', 'increase', ',', 'That', 'thereby', 'beauty', "'", 's', 'rose', 'might', 'never', 'die', ',', 'But', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', ',', 'His', 'tender', 'heir', 'might', 'bear', 'his', 'memory', ':', 'But', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', ',', 'Feed', "'", 'st', 'thy', 'light', "'", 's', 'flame', 'with', 'self', '-', 'substantial', 'fuel', ',', 'Making', 'a', 'famine', 'where', 'abundance', 'lies', ',', 'Thy', 'self', 'thy', 'foe', ',', 'to', 'thy', 'sweet', 'self', 'too', 'cruel', ':', 'Thou', 'that', 'art', 'now', 'the', 'world', "'", 's', 'fresh', 'ornament', ',', 'And', 'only', 'herald', 'to', 'the', 'gaudy', 'spring', ',', 'Within', 'thine', 'own', 'bud', 'buriest', 'thy', 'content', ',', 'And', 'tender', 'churl', 'mak', "'", 'st', 'waste', 'in', 'niggarding', ':', 'Pity', 'the', 'world', ',', 'or', 'else', 'this', 'glutton', 'be', ',', 'To', 'eat', 'the', 'world', "'", 's', '

### Preprocess data
- Use re module to preprocess data
- Convert all letters into lowercase
- Remove punctuations, numbers, etc.

In [11]:
for i in range(len(sentences)):
    sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)]

In [12]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beauty', 's', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feed', 'st', 'thy', 'light', 's', 'flame', 'with', 'self', 'substantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy', 'self', 'thy', 'foe', 'to', 'thy', 'sweet', 'self', 'too', 'cruel', 'thou', 'that', 'art', 'now', 'the', 'world', 's', 'fresh', 'ornament', 'and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring', 'within', 'thine', 'own', 'bud', 'buriest', 'thy', 'content', 'and', 'tender', 'churl', 'mak', 'st', 'waste', 'in', 'niggarding', 'pity', 'the', 'world', 'or', 'else', 'this', 'glutton', 'be', 'to', 'eat', 'the', 'world', 's', 'due', 'by', 'the', 'grave', 'and', 'thee']
['when', 'forty', 'winters', 'shall', 'besiege', 'thy', 'brow', 'an

### Create and train model
- Create a word2vec model and train it with Hamlet corpus
- Key parameter description (https://radimrehurek.com/gensim/models/word2vec.html)
    - **sentences**: training data (has to be a list with tokenized sentences)
    - **size**: dimension of embedding space
    - **sg**: CBOW if 0, skip-gram if 1
    - **window**: number of words accounted for each context (if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
    - **min_count**: minimum count of words to be included in the vocabulary
    - **iter**: number of training iterations
    - **workers**: number of worker threads to train

In [13]:
#model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)
model = Word2Vec(sentences = sentences, vector_size = 100, sg = 1, window = 3, min_count = 1, epochs = 10, workers = Pool()._processes)


In [14]:
model.init_sims(replace = True)

  model.init_sims(replace = True)


### Save and load model
- word2vec model can be saved and loaded locally
- Doing so can reduce time to train model again

In [15]:
model.save('word2vec_model')

In [16]:
model = Word2Vec.load('word2vec_model')

### Similarity calculation
- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity
- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity

In [17]:
from gensim.models import KeyedVectors

In [18]:
keyed_vectors = model.wv
keyed_vectors.most_similar('hamlet')

[('benedick', 0.8010129332542419),
 ('fenton', 0.7995469570159912),
 ('constance', 0.7940218448638916),
 ('eleanor', 0.781517744064331),
 ('julia', 0.7814662456512451),
 ('horatio', 0.7776274681091309),
 ('beatrice', 0.7769432067871094),
 ('ophelia', 0.7766722440719604),
 ('laertes', 0.7747067809104919),
 ('nell', 0.7730135917663574)]

In [None]:
#model.most_similar('hamlet')

In [19]:
v1 = keyed_vectors['king']
v2 = keyed_vectors['queen']

In [20]:
# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)

In [21]:
cosine_similarity(v1, v2)

0.6272789239883423