In [1]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import torch
from copy import deepcopy
import pickle 

# Make the repository root importable without hard-coding one user's home
# directory. Like the other relative paths used in this notebook
# ("../data/...", "../word2vec_eltdm/models/..."), this assumes the kernel's
# working directory sits one level below the repository root.
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:  # no duplicate entry when the cell is re-run
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from word2vec_eltdm.word2vec_numpy import (
    Tokenizer, VocabCreator, DataLoader, TokenCleaner, Preprocessor,
    Subsampler, SimpleWord2Vec, Optimizer, CrossEntropy, NegWord2Vec,
    NegativeSamplingLoss, OptimizeNSL, evaluate, visualization_tsne, train_NSL,
    update_best_loss
)

## Get data and create vocab

In [3]:
# Reproducibility: seed the global RNGs before any stochastic step runs.
# NOTE(review): presumably the word2vec_eltdm helpers draw from `random` /
# `np.random`; confirm, and seed any dedicated generator they use as well.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Location of the raw corpus, relative to the notebook's working directory.
datapath = "../data/text8.txt"

# Preprocessing options. With these values the val and test splits come out
# empty (see the token counts printed in the cells below).
RATIO = 1
return_only_train = True

# Building blocks of the preprocessing pipeline.
tokenizer = Tokenizer(datapath)
token_cleaner = TokenCleaner(freq_threshold=5)
vocab_creator = VocabCreator()

# Assemble the pipeline and run it once to obtain the dataset object used by
# every later cell.
preprocessor = Preprocessor(
    tokenizer, token_cleaner, vocab_creator, RATIO, return_only_train
)
text8_dataset = preprocessor.preprocess()

In [4]:
# Number of entries in the token -> id mapping built during preprocessing.
vocab_size = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary:", vocab_size)

Size of our vocabulary: 63492


In [5]:
# Length of the training token sequence before any subsampling.
n_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in our train dataset:", n_train_tokens)

Number of tokens in our train dataset: 10566033


In [6]:
# Length of the validation token sequence.
n_val_tokens = len(text8_dataset.val_tokens)
print("Number of tokens in our val dataset:", n_val_tokens)

Number of tokens in our val dataset: 0


In [7]:
# Length of the test token sequence.
n_test_tokens = len(text8_dataset.test_tokens)
print("Number of tokens in our test dataset:", n_test_tokens)

Number of tokens in our test dataset: 0


## Subsampling of frequent words, as in Mikolov et al. (2013)

In [8]:
# Subsample the training tokens and store both results of the subsampler
# (the reduced token sequence and the frequencies) back on the dataset.
# NOTE: the cell reads text8_dataset.train_tokens and then overwrites it, so
# running it a second time subsamples the already-subsampled tokens. Re-run
# the preprocessing cell first to start again from the full sequence.
subsampler = Subsampler(text8_dataset.train_tokens)
subsampled_tokens, token_frequencies = subsampler.subsample()
text8_dataset.train_tokens = subsampled_tokens
text8_dataset.frequencies = token_frequencies

In [9]:
# NOTE(review): the subsampling cell reassigns only `train_tokens` and
# `frequencies`, not `tokens_to_id`, so this reports the same mapping size as
# before subsampling (the stored output matches the earlier count).
vocab_size_after = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary after subsampling of frequent words, for train:", vocab_size_after)

Size of our vocabulary after subsampling of frequent words, for train: 63492


In [10]:
# Length of the training token sequence once frequent words were subsampled.
n_train_tokens_subsampled = len(text8_dataset.train_tokens)
print("Number of tokens in train dataset:", n_train_tokens_subsampled)

Number of tokens in train dataset: 3767955


## DataLoader

In [11]:
# Batching configuration handed to the DataLoader: the window argument and
# the batch size.
window, batch_size = 5, 1024

# Loader over the (subsampled) training tokens; consumed by the training loop.
train_dataloader = DataLoader(
    text8_dataset,
    text8_dataset.train_tokens,
    window,
    batch_size,
)

## Model

In [12]:
# --- Hyperparameters ---------------------------------------------------------
len_vocab = len(text8_dataset.tokens_to_id)
embedding_size = 300
learning_rate = 5e-2
n_samples = 5  # forwarded to train_NSL in the training loop
BEST_VAL_LOSS = 2.84  # starting value passed to the model as `best_val_loss`
epochs = 10

# --- Learning-rate schedule (optimizer hyperparameters) ----------------------
method = "time_based"  # alternatives: "none", "exp_decay", "step_decay"
decay_rate = learning_rate / epochs

# --- Noise distribution for negative sampling --------------------------------
# Frequencies are sorted in decreasing order, normalised into a unigram
# distribution, raised to the power 0.75 and normalised again.
# NOTE(review): after sorting, position i holds the i-th largest frequency.
# This presumably lines up with the model's token ids only if ids are assigned
# by decreasing frequency - confirm against VocabCreator / Subsampler.
sorted_freqs = sorted(text8_dataset.frequencies.values(), reverse=True)
word_freqs = np.array(sorted_freqs)
unigram_dist = word_freqs / word_freqs.sum()
smoothed_dist = unigram_dist ** (0.75)
noise_dist = smoothed_dist / np.sum(smoothed_dist)

# --- Model -------------------------------------------------------------------
model = NegWord2Vec(
    len_vocab, embedding_size, noise_dist=noise_dist, best_val_loss=BEST_VAL_LOSS
)
model.initialize_weights()

# To resume from a previously pickled model instead, uncomment:
#with open("../word2vec_eltdm/models/NegWord2Vec_4.743055158391199.p", "rb") as file:
#    model = pickle.load(file)
#model = model["NegWord2Vec"]

# --- Loss and optimizer ------------------------------------------------------
criterion = NegativeSamplingLoss()
optimizer = OptimizeNSL(model, learning_rate, decay_rate, method)

## Training

In [None]:
# Run `epochs` passes over the training data and keep the loss of each pass.
train_loss_history = []
tbar = trange(epochs)

for epoch in tbar:
    print(f"###################### EPOCH {epoch} ###########################")

    # One full pass over the dataloader with the negative-sampling loss.
    train_loss = train_NSL(
        model, train_dataloader, criterion, optimizer, n_samples
    )
    print("Training loss:", train_loss)
    train_loss_history.append(train_loss)

    # Let the helper compare this epoch's loss with the model's best so far.
    update_best_loss(model, train_loss)

    # Advance the learning-rate schedule configured on the optimizer.
    optimizer.update_lr(epoch)

    # Inspect the current W1 matrix on a handful of words after each epoch.
    embeddings = model.W1
    evaluate(embeddings, text8_dataset.id_to_tokens, nb_words=20)

  0%|          | 0/10 [00:00<?, ?it/s]

###################### EPOCH 0 ###########################


  0%|          | 0/3680 [00:00<?, ?it/s]

In [None]:
# Execution barrier: stops "Run All" at the end of the training section.
# NOTE(review): presumably intended so that the evaluation cells below, which
# load one specific pickled checkpoint, are only run by hand - confirm.
# A bare `break` outside a loop is a SyntaxError and gives no hint about why
# execution stopped, so raise an explicit, self-explanatory error instead.
raise RuntimeError(
    "End of the training section: run the evaluation cells below manually."
)

## Evaluation on the task of word similarities

In [None]:
# Restore a previously pickled checkpoint from disk.
# Security note: pickle.load can execute arbitrary code while deserialising,
# so only point this at checkpoint files you produced yourself.
filepath = "../word2vec_eltdm/models/NegWord2Vec_4.753405412967128.p"
with open(filepath, "rb") as checkpoint_file:
    model = pickle.load(checkpoint_file)

In [None]:
# The unpickled checkpoint stores the model under the "NegWord2Vec" key.
# Unwrap it only when `model` is not already a model instance, so that
# re-running this cell does not try to index the model itself.
if not isinstance(model, NegWord2Vec):
    model = model["NegWord2Vec"]

# Evaluate with `best_W1` rather than the current `W1` used during training.
# NOTE(review): presumably the weights recorded at the best loss - confirm.
embeddings = model.best_W1

### Evaluate using cosine similarity

In [None]:
# Run the evaluation helper on the loaded embeddings for 20 words.
id_to_tokens = text8_dataset.id_to_tokens
evaluate(embeddings, id_to_tokens, nb_words=20)

### t-SNE embedding visualisation

In [None]:
# Draw the t-SNE visualisation of the embeddings for 400 words.
visualization_tsne(
    embeddings,
    text8_dataset.id_to_tokens,
    nb_words=400,
)