In [1]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt

# Make the local `word2vec_eltdm` checkout importable.
# The location can be overridden with the WORD2VEC_ELTDM_ROOT environment
# variable so the notebook also runs on machines other than the original
# author's; when the variable is unset the original path is used unchanged.
import os

PROJECT_ROOT = os.environ.get(
    "WORD2VEC_ELTDM_ROOT",
    '/home/kaliayev/Documents/ENSAE/elements_logiciels/word2vec_eltdm',
)
# Guard the append so re-running this cell does not add duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

%load_ext autoreload
%autoreload 2

In [2]:
from word2vec_eltdm.word2vec_numpy import (
    Tokenizer, VocabCreator, DataLoader, TokenCleaner, Preprocessor,
    Subsampler, SimpleWord2Vec, Optimizer, CrossEntropy, train, validate
)

## Get data and create vocab

In [3]:
# Corpus location, relative to the notebook's working directory.
datapath = "../data/text8.txt"

# Build the three pipeline stages, then run the preprocessor to obtain the
# dataset object whose token splits and `tokens_to_id` mapping are read by
# the cells below.
vocab_creator = VocabCreator()
token_cleaner = TokenCleaner(freq_threshold=3)
tokenizer = Tokenizer(datapath)
preprocessor = Preprocessor(tokenizer, token_cleaner, vocab_creator)
text8_dataset = preprocessor.preprocess()

In [4]:
# Number of entries in the token -> id mapping.
vocab_size = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary:", vocab_size)

Size of our vocabulary: 80383


In [5]:
# Length of the training split.
n_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in our train dataset:", n_train_tokens)

Number of tokens in our train dataset: 7986226


In [6]:
# Length of the validation split.
n_val_tokens = len(text8_dataset.val_tokens)
print("Number of tokens in our val dataset:", n_val_tokens)

Number of tokens in our val dataset: 1331038


In [7]:
# Length of the test split.
n_test_tokens = len(text8_dataset.test_tokens)
print("Number of tokens in our test dataset:", n_test_tokens)

Number of tokens in our test dataset: 1331038


## Subsampling of frequent words, as in Mikolov et al. (2013)

In [8]:
# Subsample the training tokens and store the result back on the dataset.
# NOTE(review): this cell overwrites its own input (`train_tokens`), so
# running it a second time without reloading the dataset would subsample the
# already-subsampled tokens again.
subsampler = Subsampler(text8_dataset.train_tokens)
subsampled_train_tokens = subsampler.subsample()
text8_dataset.train_tokens = subsampled_train_tokens

In [9]:
# NOTE(review): this reads `tokens_to_id`, which the subsampling cell does not
# reassign (it only replaces `train_tokens`), so the value printed here matches
# the pre-subsampling vocabulary size. Confirm whether the number of distinct
# tokens left in `train_tokens` was intended instead.
vocab_size_after_subsampling = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary after subsampling of frequent words, for train:", vocab_size_after_subsampling)

Size of our vocabulary after subsampling of frequent words, for train: 80383


In [10]:
# Length of the training split after it was replaced by the subsampled tokens.
n_train_tokens_subsampled = len(text8_dataset.train_tokens)
print("Number of tokens in train dataset:", n_train_tokens_subsampled)

Number of tokens in train dataset: 2893184


## DataLoader

In [11]:
# Settings shared by all three loaders.
batch_size = 256
window = 5

# One DataLoader per split, built in train / val / test order with identical
# settings; only the token sequence differs.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(text8_dataset, split_tokens, window, batch_size)
    for split_tokens in (
        text8_dataset.train_tokens,
        text8_dataset.val_tokens,
        text8_dataset.test_tokens,
    )
)

## Model

In [12]:
# Hyperparameters for the model and the optimizer.
learning_rate = 1e-3
embedding_size = 300
hidden_size = 500
num_layer = 1
len_vocab = len(text8_dataset.tokens_to_id)

# Build the network, then run its weight-initialisation routine.
model = SimpleWord2Vec(len_vocab, num_layer, hidden_size, embedding_size)
model.initialize_weights()

# Loss and optimizer objects handed to the training loop.
optimizer = Optimizer(model, learning_rate)
criterion = CrossEntropy()

## Training

In [None]:
# Number of passes made through the training loop.
epochs = 15

# One training loss is recorded per epoch; a validation loss is recorded only
# on the epochs where validation runs, so the second list ends up shorter.
train_loss_history, val_loss_history = [], []

tbar = trange(epochs)
for epoch in tbar:
    print(f"###################### EPOCH {epoch} ###########################")

    train_loss = train(model, train_dataloader, criterion, optimizer)
    train_loss_history.append(train_loss)
    print("Training loss:", train_loss)

    # Validation is skipped unless the epoch index is a multiple of 10.
    if epoch % 10 != 0:
        continue

    validation_loss = validate(model, val_dataloader, criterion)
    val_loss_history.append(validation_loss)
    print("Validation loss:", validation_loss)

  0%|          | 0/15 [00:00<?, ?it/s]

###################### EPOCH 0 ###########################


0it [00:00, ?it/s]

In [None]:
# Plot the training and validation loss curves on a shared epoch axis.
#
# The training loop records a training loss on every epoch but a validation
# loss only when `epoch % 10 == 0`, so the two histories have different
# lengths. Plotting the validation history without explicit x values would put
# its k-th point at x = k instead of at the epoch on which it was measured.
VAL_EVERY = 10  # keep in sync with the `epoch % 10` check in the training loop

train_epochs = list(range(len(train_loss_history)))
# Epoch indices on which validation ran, trimmed to the values actually recorded.
val_epochs = train_epochs[::VAL_EVERY][:len(val_loss_history)]

fig, ax = plt.subplots()
ax.plot(val_epochs, val_loss_history[:len(val_epochs)], marker="o", label="Validation Loss")
ax.plot(train_epochs, train_loss_history, label="Train Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

In [None]:
# Display the model's `best_W1` attribute.
best_W1 = model.best_W1
print(best_W1)

In [None]:
# Display the model's `best_W2` attribute.
best_W2 = model.best_W2
print(best_W2)

## Results on test set