## word2vec implementation with Python (& Gensim)
- Note: This code was originally written for Python 3.6.1 (+Gensim 2.3.0); the cells below use the Gensim 4 API (`vector_size`, `epochs`, `model.wv`)

In [1]:
import nltk
# Fetch the NLTK resources used below: the Gutenberg corpus (source of the Hamlet text)
# and the punkt tokenizer models (presumably needed by gutenberg.sents for sentence splitting).
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
from multiprocessing import Pool, cpu_count

import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from scipy import spatial

### Import training dataset
- Import Shakespeare's Hamlet corpus from nltk library

In [3]:
sentences = list(gutenberg.sents('shakespeare-hamlet.txt'))   # load Hamlet as a plain list of sentences, each a list of word tokens

In [4]:
# Sanity-check what was loaded: the container type and the number of sentences.
corpus_summary = (
    ('Type of corpus: ', type(sentences)),
    ('Length of corpus: ', len(sentences)),
)
for label, value in corpus_summary:
    print(label, value)

Type of corpus:  <class 'list'>
Length of corpus:  3106


In [5]:
# Peek at a few raw sentences; index 0 holds the title, author, and year.
for idx in (0, 1, 10):
    print(sentences[idx])

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']
['Actus', 'Primus', '.']
['Fran', '.']


### Preprocess data
- Use re module to preprocess data
- Convert all letters into lowercase
- Remove punctuation, numbers, etc.

In [6]:
# Keep only purely alphabetic tokens and lowercase them.
# re.fullmatch requires the WHOLE token to be ASCII letters; the previous
# re.match('^[a-zA-Z]+', ...) only checked that a token *started* with a letter,
# so tokens such as 'a1' or 'foo_bar' would have slipped through.
# Re-running this cell is safe: already-clean tokens pass through unchanged.
sentences = [
    [word.lower() for word in sentence if re.fullmatch('[a-zA-Z]+', word)]
    for sentence in sentences
]

In [7]:
# The same three sentences after preprocessing: lowercased, with punctuation
# and the year removed (index 0 now holds only the title and author).
for idx in (0, 1, 10):
    print(sentences[idx])

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


### Create and train model
- Create a word2vec model and train it with Hamlet corpus
- Key parameter description (https://radimrehurek.com/gensim/models/word2vec.html)
    - **sentences**: training data (a list of tokenized sentences, each a list of words)
    - **vector_size**: dimension of the embedding space (named `size` before Gensim 4)
    - **sg**: CBOW if 0, skip-gram if 1
    - **window**: number of words on each side counted as context (with a window size of 3, up to 3 words in the left neighborhood and 3 words in the right neighborhood are considered)
    - **min_count**: minimum count of words to be included in the vocabulary
    - **epochs**: number of training iterations over the corpus (named `iter` before Gensim 4)
    - **workers**: number of worker threads to train

In [10]:
# Train a skip-gram (sg=1) word2vec model on the preprocessed corpus.
# Gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`.
# cpu_count() yields one worker thread per core without starting (and never
# closing) a multiprocessing Pool just to read its private `_processes` attribute.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    sg=1,
    window=3,
    min_count=1,
    epochs=10,
    workers=cpu_count(),
)


In [11]:
# L2-normalize every word vector in place. This replaces the deprecated
# `model.init_sims(replace=True)` call; the operation is destructive, so the
# model should not be trained further afterwards.
model.wv.unit_normalize_all()

  model.init_sims(replace = True)


### Save and load model
- word2vec model can be saved and loaded locally
- Loading a saved model avoids the time needed to train it again

In [12]:
# Persist the trained model under the relative path 'word2vec_model'.
model.save('word2vec_model')

In [13]:
# Reload the model from the 'word2vec_model' path, rebinding `model` to the loaded copy.
model = Word2Vec.load('word2vec_model')

### Similarity calculation
- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity
- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity

In [16]:
from gensim.models import KeyedVectors

In [17]:
# The trained word vectors are exposed on `model.wv`; similarity queries go through it.
keyed_vectors = model.wv
# List the words closest to 'hamlet' as (word, similarity score) pairs.
keyed_vectors.most_similar('hamlet')

[('horatio', 0.9926713109016418),
 ('queene', 0.9913420677185059),
 ('polonius', 0.9902440309524536),
 ('laertes', 0.9900107383728027),
 ('king', 0.9898088574409485),
 ('ophelia', 0.9890589714050293),
 ('ghost', 0.9884952306747437),
 ('marcellus', 0.9873592853546143),
 ('heere', 0.9870586395263672),
 ('oh', 0.9868582487106323)]

In [14]:
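# model.most_similar('hamlet')   # disabled: raises the AttributeError shown below on this Gensim version; use model.wv.most_similar('hamlet') instead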

AttributeError: 'Word2Vec' object has no attribute 'most_similar'

In [19]:
# Look up the embedding vectors of the two words to compare.
v1, v2 = keyed_vectors['king'], keyed_vectors['queen']

In [20]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    SciPy's ``spatial.distance.cosine`` yields the cosine *distance*
    (1 - similarity), so it is subtracted from 1 to recover the similarity.

    Args:
        v1: first vector (1-D array-like).
        v2: second vector (1-D array-like).

    Returns:
        1 minus the SciPy cosine distance between ``v1`` and ``v2``.
    """
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [21]:
# Similarity between the 'king' and 'queen' embeddings looked up above.
cosine_similarity(v1, v2)

0.9828093647956848