In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from glob import glob
import math
import pickle
from nltk.tokenize import word_tokenize, sent_tokenize

from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

This notebook was used to train word embeddings on the data gathered from the Brown, OANC, and BNC corpora.

# train word2vec embedding using gensim

In [5]:
# Read the pickled sentence table and keep only its 'sent' column as a plain list.
# Note: read_pickle unpickles the file, so only point it at trusted data.
sentences = pd.read_pickle('data/lexical_repl/sents_df.zip')['sent'].tolist()

In [6]:
batch_words = 10000
# Corpus size = summed length of every sentence (progress bar over the sentences).
num_words = sum(len(sentence) for sentence in tqdm(sentences))

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))




In [7]:
class EpochMonitor(CallbackAny2Vec):
    '''Callback to log information about training.

    Prints a line when each epoch starts and, within an epoch, the share of
    batches processed roughly every fifth of the expected batch count.

    Parameters
    ----------
    num_words : int
        Total size of the training corpus, in the same unit as ``batch_words``.
    batch_words : int
        Size of one training batch (the value passed to ``Word2Vec``).
    '''

    def __init__(self, num_words, batch_words):
        self.batch_words = batch_words
        self.num_words = num_words
        # Clamp to one batch so the percentage below never divides by zero
        # when the corpus is empty.
        self.num_batches = max(1, math.ceil(self.num_words / self.batch_words))
        # Batches between two progress lines. With fewer than five batches the
        # plain division truncates to 0, which would make the modulo in
        # on_batch_begin raise ZeroDivisionError, hence the clamp to 1.
        self.report_every = max(1, self.num_batches // 5)
        self.epoch = 1
        self.current_batch = 1

    def on_batch_begin(self, model):
        '''Print the fraction of the epoch reached every ``report_every`` batches.'''
        if self.current_batch % self.report_every == 0:
            print("{0:.0%}".format(self.current_batch / self.num_batches))

    def on_batch_end(self, model):
        '''Advance the per-epoch batch counter.'''
        self.current_batch += 1

    def on_epoch_begin(self, model):
        '''Announce the epoch and restart the batch counter.'''
        print("Epoch #{} start".format(self.epoch))
        self.current_batch = 1

    def on_epoch_end(self, model):
        '''Advance the epoch counter.'''
        self.epoch += 1

epoch_monitor = EpochMonitor(num_words, batch_words)

In [29]:
# Training hyperparameters; anything not listed here keeps gensim's default.
w2v_params = dict(
    size=100,
    window=5,
    min_count=5,
    workers=12,
    sg=0,  # sg=0 selects CBOW rather than skip-gram
    iter=10,
    batch_words=batch_words,
    callbacks=[epoch_monitor],
)
model = Word2Vec(sentences, **w2v_params)
model.save('data/lexical_repl/w2v.model')

Epoch #1 start
20%
40%
60%
80%
100%
Epoch #2 start
20%
40%
60%
80%
100%
Epoch #3 start
20%
40%
60%
80%
100%
Epoch #4 start
20%
40%
60%
80%
100%
Epoch #5 start
20%
40%
60%
80%
100%
Epoch #6 start
20%
40%
60%
80%
100%
Epoch #7 start
20%
40%
60%
80%
100%
Epoch #8 start
20%
40%
60%
80%
100%
Epoch #9 start
20%
40%
60%
80%
100%
Epoch #10 start
20%
40%
60%
80%
100%


In [48]:
# Vocabulary size of the trained model.
len(model.wv.vocab)

148301

In [44]:
# Sanity check with an analogy query: words closest to 'queen' + 'man' - 'woman'.
model.wv.most_similar(positive = ['queen', 'man'], negative = ['woman'])

[('king', 0.7727294564247131),
 ('knight', 0.7580963373184204),
 ('prince', 0.7514700889587402),
 ('warrior', 0.7270703315734863),
 ('chieftain', 0.7142376899719238),
 ('protector', 0.7099010348320007),
 ('grandson', 0.7018693685531616),
 ('vassal', 0.6943684816360474),
 ('princess', 0.6851438283920288),
 ('Everqueen', 0.6839128136634827)]

# prepare embedding for use in model

We need to make:
<br>- word2index dictionary
<br>- index2word dictionary
<br>- embedding matrix

Keep in mind:
<br>- we will use the mask_zero functionality, so index 0 is reserved for padding: row 0 of the embedding matrix can hold anything (all zeroes, a random vector — its value doesn't particularly matter), and index 0 must not map to any word in either dictionary
<br>- we need to add the start and end of sentence markers
<br>- we also must add a randomly initialized UNK token, to insert a placeholder for words unknown to the model

In [56]:
# Row 0 stays all zeroes and is left out of both dictionaries: it is the padding slot.
vocab_size = len(model.wv.vocab)
# Take the width from the trained vectors instead of repeating the literal 100,
# so this cell keeps working if the training dimension is changed.
embedding_dim = model.wv.vector_size
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
w2idx = {}
idx2w = {}

# gensim's vocabulary positions are 0-based; shift them up by one to free index 0.
for i, word in enumerate(tqdm(model.wv.index2word), start=1):
    w2idx[word] = i
    idx2w[i] = word
    # Every word in index2word has a vector (a missing word raises KeyError
    # rather than returning None), so no None check is needed here.
    embedding_matrix[i] = model.wv[word]

HBox(children=(IntProgress(value=0, max=148301), HTML(value='')))




In [60]:
# Append the sequence-boundary tokens after the existing rows.
# SOS start-of-sequence tag is '\t', EOS end-of-sequence tag is '\n'.
# Each token keeps its own fixed seed so its random vector is reproducible.
for special_token, seed in (('\t', 47), ('\n', 42)):
    if special_token in w2idx:
        # Already registered (e.g. the cell was re-run): appending again would
        # add a duplicate row and leave a stale entry behind in idx2w.
        continue
    np.random.seed(seed)
    token_index = embedding_matrix.shape[0]  # next free row
    idx2w[token_index] = special_token
    w2idx[special_token] = token_index
    # Row width comes from the matrix itself rather than a hard-coded 100.
    token_vector = np.random.rand(1, embedding_matrix.shape[1])
    embedding_matrix = np.append(embedding_matrix, token_vector, axis=0)

In [18]:
# '[UNK]' is the placeholder row for words unknown to the model.
# Skip if it is already registered, so re-running the cell does not append a
# duplicate row or leave a stale index behind in idx2w.
if '[UNK]' not in w2idx:
    np.random.seed(26)
    unk_index = embedding_matrix.shape[0]  # next free row
    idx2w[unk_index] = '[UNK]'
    w2idx['[UNK]'] = unk_index
    # Row width comes from the matrix itself rather than a hard-coded 100.
    unk_vector = np.random.rand(1, embedding_matrix.shape[1])
    embedding_matrix = np.append(embedding_matrix, unk_vector, axis=0)

In [25]:
# Persist the embedding matrix and both lookup dictionaries, one pickle file each.
artifacts = (
    ('data/lexical_repl/embedding.pkl', embedding_matrix),
    ('data/lexical_repl/idx2w.pkl', idx2w),
    ('data/lexical_repl/w2idx.pkl', w2idx),
)
for pickle_path, payload in artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# prepare for use with openNMT

In [8]:
# Reload the model from the path the training cell saved it to.
model = Word2Vec.load('data/lexical_repl/w2v.model')

In [9]:
# Vocabulary size of the reloaded model.
len(model.wv.vocab)

148301

In [11]:
# Export the word vectors in the text (binary=False) word2vec format.
# NOTE(review): this writes to 'data/lexical_repl_models/', unlike the
# 'data/lexical_repl/' paths used elsewhere in this notebook — confirm it is intended.
model.wv.save_word2vec_format('data/lexical_repl_models/w2v_vectors.txt', binary=False)