In [1]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
import torch
from copy import deepcopy
import pickle 

sys.path.append('/home/kaliayev/Documents/ENSAE/elements_logiciels/word2vec_eltdm')

%load_ext autoreload
%autoreload 2

In [2]:
from word2vec_eltdm.word2vec_numpy import (
    Tokenizer, VocabCreator, DataLoader, TokenCleaner, Preprocessor,
    Subsampler, SimpleWord2Vec, Optimizer, CrossEntropy, train, validate, NegWord2Vec,
    NegativeSamplingLoss, OptimizeNSL
)

## Get data and create vocab

In [3]:
# Path to the raw corpus (relative path, resolved from the notebook's working directory).
datapath = "../data/text8.txt"

# Settings forwarded positionally to Preprocessor below.
# NOTE(review): RATIO is presumably the share of the corpus to keep — confirm in Preprocessor.
RATIO = 1
return_only_train = True

# Build the individual pipeline stages first, then hand them to the preprocessor.
tokenizer = Tokenizer(datapath)
token_cleaner = TokenCleaner(freq_threshold=5)
vocab_creator = VocabCreator()
preprocessor = Preprocessor(
    tokenizer,
    token_cleaner,
    vocab_creator,
    RATIO,
    return_only_train,
)
text8_dataset = preprocessor.preprocess()

In [4]:
# Number of entries in the `tokens_to_id` mapping produced by preprocessing.
vocab_size = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary:", vocab_size)

Size of our vocabulary: 63492


In [5]:
# Length of the training token sequence before any subsampling is applied.
n_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in our train dataset:", n_train_tokens)

Number of tokens in our train dataset: 10566033


In [6]:
# Length of the validation token sequence (0 in the recorded run of this notebook).
n_val_tokens = len(text8_dataset.val_tokens)
print("Number of tokens in our val dataset:", n_val_tokens)

Number of tokens in our val dataset: 0


In [7]:
# Length of the test token sequence (0 in the recorded run of this notebook).
n_test_tokens = len(text8_dataset.test_tokens)
print("Number of tokens in our test dataset:", n_test_tokens)

Number of tokens in our test dataset: 0


## Subsampling of frequent words, as in Mikolov et al. (2013)

In [8]:
# Subsample the training tokens and store the result back on the dataset.
# NOTE: this cell overwrites its own input (`text8_dataset.train_tokens`), so
# re-running it subsamples the already-subsampled tokens again.
subsampler = Subsampler(text8_dataset.train_tokens)
subsampled_tokens, subsampled_frequencies = subsampler.subsample()
text8_dataset.train_tokens = subsampled_tokens
text8_dataset.frequencies = subsampled_frequencies

In [9]:
# NOTE(review): this measures `tokens_to_id`, which the subsampling step does not
# reassign (it only replaces `train_tokens` and `frequencies`), so the number
# printed here matches the vocabulary size reported before subsampling.
vocab_size_after_subsampling = len(text8_dataset.tokens_to_id)
print(
    "Size of our vocabulary after subsampling of frequent words, for train:",
    vocab_size_after_subsampling,
)

Size of our vocabulary after subsampling of frequent words, for train: 63492


In [10]:
# Length of the training token sequence as currently stored on the dataset.
n_train_tokens_subsampled = len(text8_dataset.train_tokens)
print("Number of tokens in train dataset:", n_train_tokens_subsampled)

Number of tokens in train dataset: 3767955


## DataLoader

In [11]:
# Batching settings, passed positionally to DataLoader.
window, batch_size = 5, 256

# Only a training loader is built.
train_dataloader = DataLoader(
    text8_dataset,
    text8_dataset.train_tokens,
    window,
    batch_size,
)

# Validation and test loaders are intentionally left disabled:
# val_dataloader = DataLoader(text8_dataset, text8_dataset.val_tokens, window, batch_size)
# test_dataloader = DataLoader(text8_dataset, text8_dataset.test_tokens, window, batch_size)

## Model

In [12]:
# --- Hyper-parameters -------------------------------------------------------
len_vocab = len(text8_dataset.tokens_to_id)
hidden_size = 500
embedding_size = 300
learning_rate = 1e-3
# Read as a notebook-level global by train()/validate() when they call
# model.forward_noise.
n_samples = 5
# NOTE(review): 5.86 looks like the loss of the checkpoint named in the
# commented-out load further down. With a freshly initialised model this value
# is still handed over as `best_val_loss`; presumably update_best_loss() then
# only saves once the loss drops below it — confirm this is intended.
BEST_VAL_LOSS = 5.86

# --- Noise distribution for negative sampling --------------------------------
# Unigram frequencies raised to the power 0.75, then renormalised to sum to 1.
# NOTE(review): the frequencies are sorted in descending order, so entry k
# belongs to the k-th most frequent token. That only matches token ids if ids
# were assigned by descending frequency — confirm against VocabCreator.
sorted_freqs = sorted(text8_dataset.frequencies.values(), reverse=True)
word_freqs = np.array(sorted_freqs)
unigram_dist = word_freqs / word_freqs.sum()
smoothed_dist = unigram_dist ** (0.75)
noise_dist = smoothed_dist / np.sum(smoothed_dist)

# --- Model -------------------------------------------------------------------
model = NegWord2Vec(
    len_vocab,
    hidden_size,
    embedding_size,
    noise_dist=noise_dist,
    best_val_loss=BEST_VAL_LOSS,
)
model.initialize_weights()

# Optional: resume from a previously saved model instead of fresh weights.
# (pickle.load executes arbitrary code — only load files you created yourself.)
#with open("../models/NegWord2Vec_5.869696641951837.p", "rb") as file:
#    model = pickle.load(file)
#model = model["NegWord2Vec"]

# --- Loss and optimiser --------------------------------------------------------
criterion = NegativeSamplingLoss()
optimizer = OptimizeNSL(model, learning_rate)

## Training

In [13]:
def train(model, train_dataloader, criterion, optimizer, n_noise_samples=None, log_every=10000):
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    For every batch the model's input, output and noise forward passes are
    evaluated, ``criterion`` returns the loss together with the gradients for
    both weight matrices, and ``optimizer.step`` applies them.

    Note: defining this function in the notebook replaces the ``train`` name
    imported from ``word2vec_eltdm.word2vec_numpy``.

    Args:
        model: object exposing ``train()``, ``forward_input``,
            ``forward_output`` and ``forward_noise``.
        train_dataloader: sized iterable of batches; each batch is indexed
            with the keys ``"X"`` and ``"Y"``.
        criterion: callable ``(model, h, u, noise_vector, y)`` returning
            ``(loss, grad_W1, grad_W2)``.
        optimizer: object exposing ``step(grad_W1, grad_W2)``.
        n_noise_samples: second argument given to ``model.forward_noise``.
            When ``None`` (default) the notebook-level ``n_samples`` is used,
            which keeps existing four-argument calls working unchanged.
        log_every: print the current batch loss every ``log_every`` batches;
            a falsy value disables the periodic print.

    Returns:
        Sum of the per-batch losses divided by ``len(train_dataloader)``.

    Raises:
        ValueError: if ``train_dataloader`` reports a length of zero, since no
            mean loss can be computed.
    """
    if n_noise_samples is None:
        # Backward-compatible fallback to the notebook-level setting.
        n_noise_samples = n_samples

    n_batches = len(train_dataloader)
    if n_batches == 0:
        raise ValueError(
            "train_dataloader contains no batches; cannot compute a mean training loss"
        )

    train_loss = 0.0
    for i, batch in enumerate(tqdm(train_dataloader)):
        # Kept inside the loop as in the original: the mode is re-asserted
        # before every batch.
        model.train()
        X, y = batch["X"], batch["Y"]
        h = model.forward_input(X)
        u = model.forward_output(y)
        noise_vector = model.forward_noise(X.shape[1], n_noise_samples)

        # negative sampling loss and the gradients for both weight matrices
        loss, grad_W1, grad_W2 = criterion(model, h, u, noise_vector, y)
        optimizer.step(grad_W1, grad_W2)

        train_loss += loss

        if log_every and i % log_every == 0:
            print("Current Training Loss {:.6}".format(loss))

    return train_loss / n_batches


def update_best_loss(model, val_loss):
    """Record ``val_loss`` as the new best and snapshot the weights if it improves.

    An update happens when ``model.best_val_loss`` is falsy (e.g. ``None`` or
    ``0``) or when ``val_loss`` is strictly smaller than it. On update, deep
    copies of ``model.W1`` / ``model.W2`` are stored as ``best_W1`` /
    ``best_W2``, a message is printed and ``model.save_model()`` is called.
    Otherwise the model is left untouched.
    """
    current_best = model.best_val_loss

    # Guard clause: a best value exists and the new loss does not beat it.
    if current_best and not (val_loss < current_best):
        return

    model.best_val_loss = val_loss
    # Deep copies so later training steps cannot alter the stored snapshot.
    model.best_W1 = deepcopy(model.W1)
    model.best_W2 = deepcopy(model.W2)
    print(f"Now best model has {val_loss} loss")
    model.save_model()


def validate(model, dataloader, criterion, n_noise_samples=None):
    """Compute the mean batch loss over ``dataloader`` without updating weights.

    The model is switched to evaluation mode once, every batch is pushed
    through the input/output/noise forward passes and ``criterion``; the
    gradients returned by ``criterion`` are discarded. The mean loss is then
    passed to ``update_best_loss`` (which may snapshot and save the model),
    printed and returned.

    Note: defining this function in the notebook replaces the ``validate``
    name imported from ``word2vec_eltdm.word2vec_numpy``.

    Args:
        model: object exposing ``eval()``, ``forward_input``,
            ``forward_output`` and ``forward_noise``.
        dataloader: sized iterable of batches; each batch is indexed with the
            keys ``"X"`` and ``"Y"``.
        criterion: callable ``(model, h, u, noise_vector, y)`` whose first
            return value is the batch loss.
        n_noise_samples: second argument given to ``model.forward_noise``.
            When ``None`` (default) the notebook-level ``n_samples`` is used,
            which keeps existing three-argument calls working unchanged.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ValueError: if ``dataloader`` reports a length of zero (for example a
            loader built on an empty validation split), since no mean loss can
            be computed.
    """
    if n_noise_samples is None:
        # Backward-compatible fallback to the notebook-level setting.
        n_noise_samples = n_samples

    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError(
            "dataloader contains no batches; cannot compute a mean validation loss"
        )

    model.eval()
    validation_loss = 0
    for batch in tqdm(dataloader):
        X, y = batch["X"], batch["Y"]
        h = model.forward_input(X)
        u = model.forward_output(y)
        noise_vector = model.forward_noise(X.shape[1], n_noise_samples)

        # Only the loss is needed here; the gradients are ignored.
        loss, _, _ = criterion(model, h, u, noise_vector, y)
        validation_loss += loss

    validation_loss /= n_batches

    # Keep track of the best model
    update_best_loss(model, validation_loss)

    print("Validation Loss: ", validation_loss)

    return validation_loss

In [None]:
# Optimise for a fixed number of passes over the training data.
epochs = 50
train_loss_history, val_loss_history = [], []
tbar = trange(epochs)
for epoch in tbar:
    print(f"###################### EPOCH {epoch} ###########################")

    # One full pass over the training loader; returns the mean batch loss.
    train_loss = train(model, train_dataloader, criterion, optimizer)
    print("Training loss:", train_loss)
    train_loss_history.append(train_loss)

    # Validation is disabled, so the training loss decides which weights are
    # kept as "best" (and saved).
    update_best_loss(model, train_loss)

    # Disabled validation step; `val_loss_history` stays empty while this is
    # commented out.
    #val_loss = validate(model, val_dataloader, criterion)
    #print("Last validation loss:", val_loss)
    #val_loss_history.append(val_loss)

  0%|          | 0/50 [00:00<?, ?it/s]

###################### EPOCH 0 ###########################


  0%|          | 0/14719 [00:00<?, ?it/s]

Current Training Loss 14.251


In [None]:
# Loss curves per epoch, drawn through the explicit figure/axes interface.
# NOTE(review): `val_loss_history` is never appended to while validation is
# commented out in the training cell, so its curve is empty.
fig, ax = plt.subplots()
ax.plot(val_loss_history, label="Validation Loss")
ax.plot(train_loss_history, label="Train Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.set_title('Training and Validation Loss')
plt.show()

In [None]:
# Snapshot of W1 stored by update_best_loss() at the last recorded improvement.
# NOTE(review): the attribute is only assigned there; presumably unset if no
# improvement was ever recorded — confirm NegWord2Vec's defaults.
best_W1 = model.best_W1
print(best_W1)

In [None]:
# Snapshot of W2 stored by update_best_loss() at the last recorded improvement.
# NOTE(review): the attribute is only assigned there; presumably unset if no
# improvement was ever recorded — confirm NegWord2Vec's defaults.
best_W2 = model.best_W2
print(best_W2)

In [None]:
# Deliberate stop marker: presumably placed here so that "Run All" halts before
# the cells that follow. The original bare `break` is a SyntaxError outside a
# loop, which makes the notebook unparseable for tools; an explicit exception
# halts execution in the same way while remaining valid Python.
raise RuntimeError(
    "Notebook execution intentionally stopped here; run the remaining cells manually."
)

## Results on test set