In [None]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pickle

import torch
import torch.nn as nn
import torch.optim as optim

import os

# Put the repository root on sys.path so the `word2vec_eltdm` package can be
# imported. The root is resolved relative to the notebook's working directory
# rather than a machine-specific absolute path; this relies on the same layout
# as the relative "../data" and "../word2vec_eltdm/models" paths used by the
# cells further down (notebook directory sits one level below the root).
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(PROJECT_ROOT)

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
from word2vec_eltdm.common import Tokenizer, VocabCreator, DataLoader, TokenCleaner, Preprocessor, Subsampler, evaluate
from word2vec_eltdm.word2vec_accelerated import PytorchNegWord2Vec, NegativeSamplingLoss, train_NSL, update_best_loss

## Get data and create vocab

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before any data handling so that
# every rerun of the notebook starts from the same random state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Raw corpus, relative to the notebook's working directory.
datapath = "../data/text8.txt"

# Arguments forwarded to the project Preprocessor.
# NOTE(review): the exact meaning of RATIO and return_only_train is defined in
# Preprocessor, which is not visible here — confirm before changing them.
RATIO = 0.2
return_only_train = True

# Pipeline pieces: tokenisation, cleaning (freq_threshold=5) and vocabulary
# creation, combined by the Preprocessor into a single dataset object.
tokenizer = Tokenizer(datapath)
token_cleaner = TokenCleaner(freq_threshold=5)
vocab_creator = VocabCreator()
text8_dataset = Preprocessor(tokenizer, token_cleaner, vocab_creator, RATIO, return_only_train).preprocess()

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Number of entries in the token -> id mapping built during preprocessing.
vocab_size = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary:", vocab_size)

In [None]:
# Length of the training token sequence before any subsampling is applied.
n_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in our train dataset:", n_train_tokens)

In [None]:
# Length of the validation token sequence.
# NOTE(review): the dataset was built with return_only_train = True — confirm
# that val_tokens is actually populated in that mode.
n_val_tokens = len(text8_dataset.val_tokens)
print("Number of tokens in our val dataset:", n_val_tokens)

In [None]:
# Length of the test token sequence.
# NOTE(review): the dataset was built with return_only_train = True — confirm
# that test_tokens is actually populated in that mode.
n_test_tokens = len(text8_dataset.test_tokens)
print("Number of tokens in our test dataset:", n_test_tokens)

## Subsampling of frequent words, as in Mikolov et al. (2013)

In [None]:
# Subsample the training tokens and store the result, together with the
# frequencies returned by the subsampler, back on the dataset object.
# NOTE(review): this overwrites train_tokens in place, so re-running the cell
# subsamples the already-subsampled tokens again.
subsampler = Subsampler(text8_dataset.train_tokens)
subsampled_tokens, token_frequencies = subsampler.subsample()
text8_dataset.train_tokens = subsampled_tokens
text8_dataset.frequencies = token_frequencies

In [None]:
# NOTE(review): the subsampling cell only reassigns train_tokens and
# frequencies; tokens_to_id is not touched by it, so this looks like it prints
# the same number as the earlier vocabulary-size cell. Confirm which quantity
# was meant to be reported here.
vocab_size_after_subsampling = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary after subsampling of frequent words, for train:", vocab_size_after_subsampling)

In [None]:
# Length of the training token sequence as currently stored on the dataset
# (i.e. after the subsampling cell has replaced it).
n_train_tokens_subsampled = len(text8_dataset.train_tokens)
print("Number of tokens in train dataset:", n_train_tokens_subsampled)

## DataLoader

In [None]:
# Batching settings handed to the project DataLoader.
# (window is presumably the context window size — confirm in DataLoader.)
window, batch_size = 5, 512

# Loader over the (subsampled) training tokens of the dataset.
train_dataloader = DataLoader(
    text8_dataset,
    text8_dataset.train_tokens,
    window,
    batch_size,
)

## Model

In [None]:
# Hyper-parameters for the model and the optimisation run.
len_vocab = len(text8_dataset.tokens_to_id)
embedding_size, n_samples = 300, 5
learning_rate = 0.003
epochs = 10

# Noise distribution for negative sampling: normalised frequencies raised to
# the power 0.75 and normalised again so they sum to one.
# NOTE(review): the frequencies are sorted in decreasing order, so entry i
# only corresponds to token id i if ids were assigned by decreasing frequency,
# and the array only has len_vocab entries if every vocabulary token has a
# frequency — confirm both against VocabCreator / Subsampler.
descending_freqs = sorted(text8_dataset.frequencies.values(), reverse=True)
word_freqs = np.array(descending_freqs)
unigram_dist = word_freqs / word_freqs.sum()
smoothed_unigram = unigram_dist ** (0.75)
noise_dist = smoothed_unigram / np.sum(smoothed_unigram)

# Build the network, move it to the selected device, then initialise weights.
model = PytorchNegWord2Vec(len_vocab, embedding_size, noise_dist=noise_dist)
model = model.to(device)
model.initialize_weights()

# Loss and optimiser used by the training loop.
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## Training

In [None]:
# Training loop: one call to train_NSL per epoch, followed by an evaluation of
# the current input embeddings. The scalar loss of each epoch is recorded.
train_loss_history = []
tbar = trange(epochs)

for epoch in tbar:
    print(f"###################### EPOCH {epoch} ###########################")

    train_loss = train_NSL(model, train_dataloader, criterion, optimizer, n_samples)
    # Convert to a plain Python number once and reuse it for logging/history.
    epoch_loss = train_loss.item()
    print("Training loss:", epoch_loss)
    train_loss_history.append(epoch_loss)

    # Keep track of the best model (receives the un-converted loss object).
    update_best_loss(model, train_loss)

    # Copy the input-embedding weights to host memory as a NumPy array and
    # evaluate them with the project helper.
    input_weights = model.embedding_input.weight.data
    embeddings = input_weights.detach().cpu().numpy()
    evaluate(embeddings, text8_dataset.id_to_tokens, nb_words=20)

## Evaluation on the task of word similarities

In [None]:
# Checkpoint file to reload, relative to the notebook's working directory.
filepath = "../word2vec_eltdm/models/NegWord2Vec_2.7252233850736785.p"

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only load checkpoint files that come from a trusted source.
# After this cell `model` holds the unpickled object as stored on disk.
with open(filepath, mode="rb") as checkpoint_file:
    model = pickle.load(checkpoint_file)

In [None]:
# `model` holds the unpickled checkpoint at this point; the network itself is
# stored under the "NegWord2Vec" key. The guard keeps this cell re-runnable:
# once `model` has been rebound to the network, it must not be indexed again.
# (Assumes PytorchNegWord2Vec is an nn.Module; if it is not, the lookup simply
# runs every time, exactly as before.)
if not isinstance(model, nn.Module):
    model = model["NegWord2Vec"]

# Detach and move the weights to host memory before converting to NumPy:
# Tensor.numpy() only works on CPU tensors, so a checkpoint saved from a CUDA
# run would otherwise raise. This mirrors the conversion in the training loop.
embeddings = model.embedding_input.weight.detach().cpu().numpy()

### Evaluate using cosine similarity

In [None]:
# Run the project's word-similarity evaluation on the reloaded embeddings,
# using the dataset's id -> token mapping, with nb_words=20.
evaluate(
    embeddings,
    text8_dataset.id_to_tokens,
    nb_words=20,
)

### t-SNE embedding visualisation

In [None]:
# NOTE(review): `visualization_tsne` is not imported in any of the cells shown
# here, so unless it is defined elsewhere this call raises NameError on a
# fresh kernel. It presumably lives next to `evaluate` in
# word2vec_eltdm.common — confirm and add it to the import cell.
visualization_tsne(embeddings, text8_dataset.id_to_tokens, nb_words = 400)
