In [1]:
import re
import time
import numpy as np
import pandas as pd

from tqdm import tqdm, trange
from pathlib import Path
from functools import partial
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec

embed_dim = 50
epochs = 1_000

n_lines = 2809381
data_path = Path("..") / "data"
warmstarted_embeddings_path = data_path/"warm-started-embeddings"/f"glove.6B.en-pt.{embed_dim}d.txt.sample"

paracrawl_crosslingual_all_data_path = data_path / "parallel" / "paracrawl.en-pt" / "paracrawl.crosslingual.en-pt.all.sample"

train_log_path = data_path / "train_logs" / f"loss_warm_started_{embed_dim}d_{epochs}epochs_{int(time.time())}.txt"
checkpoints_path = data_path / "checkpoints"
results_path = data_path / "results" / f"word2vec_warmstarted_trained_embeddings_{embed_dim}d_{epochs}epochs.txt"

remove_punct_regex = re.compile(r"[^\w\s]")
remove_punct = partial(remove_punct_regex.sub, repl = "")

In [49]:
class ModelCheckpoint(CallbackAny2Vec):
    def __init__(self, checkpoints_path):
        self.epoch = 0
        self.checkpoints_path = checkpoints_path

    def on_epoch_end(self, model):
        if (self.epoch + 1) % 10 == 0:
            filepath = self.checkpoints_path/f"word2vec-warmstart-{model.trainables.layer1_size}d-epoch{self.epoch + 1}.model"
            with open(filepath, "w") as f:
                filename = f.name
                model.save(filename)
            
            model.wv.save_word2vec_format(str(results_path))

class LossLogger(CallbackAny2Vec):
    def __init__(self, train_log_path):
        self.epoch = 0
        self.batch = 0
        self.train_log_path = train_log_path

    def on_train_begin(self, model):
        print("[ ] Starting Word2Vec training...")
        self.start_time = time.time()

    def on_train_end(self, model):
        print(f"[ ] Word2Vec finished training in {time.time() - self.start_time} seconds")
        self.start_time = time.time()

    def on_epoch_start(self, model):
        self.batch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print(f'[ ] Loss after epoch {self.epoch}: {loss}')

        with open(self.train_log_path, "a") as f:
            f.write(str(loss) + "\n")
        
        self.epoch = self.epoch + 1


def preprocess_line(line):
    return remove_punct(string = line.replace("\n", "").lower()).split()

def read_corpus(files, shuffle_lines = False):
    sentences = []
    for file_path in files:
        print(f"[ ] Reading file {file_path}")
        time.sleep(1)
        with open(file_path, 'r', encoding = 'utf8') as f:
            for line in tqdm(f, total = n_lines):
                sentences.append((preprocess_line(line)))

    if shuffle_lines:
        np.random.shuffle(sentences)
    return sentences

def single_file_corpus(file):
    with open(file, 'r', encoding = 'utf8') as f:
        for line in tqdm(f, total = 4*n_lines):
            yield preprocess_line(line)

In [98]:
warmstart_embeddings = KeyedVectors.load_word2vec_format(warmstarted_embeddings_path, binary=False)

In [104]:
model = Word2Vec(size = embed_dim, 
                     iter = 10,
                     min_count = 1, 
                     compute_loss = True, 
                     callbacks = [ModelCheckpoint(checkpoints_path), LossLogger(train_log_path)]
                    )
# corpus = list(single_file_corpus(paracrawl_crosslingual_all_data_path))
model.build_vocab(corpus_file = paracrawl_crosslingual_all_data_path)
total_examples = model.corpus_count
model.build_vocab([list(warmstart_embeddings.vocab.keys())], update=True)
model.intersect_word2vec_format(warmstarted_embeddings_path, binary=False, lockf=1.0)
model.train(corpus_file = paracrawl_crosslingual_all_data_path, total_examples = total_examples, epochs=model.epochs, compute_loss = True)

[ ] Starting Word2Vec training...


ValueError: total_words must be provided alongside corpus_file argument.

In [102]:
model = Word2Vec(size = embed_dim, 
                     iter = 10,
                     min_count = 1, 
                     compute_loss = True, 
                     callbacks = [ModelCheckpoint(checkpoints_path), LossLogger(train_log_path)]
                    )
corpus = list(single_file_corpus(paracrawl_crosslingual_all_data_path))
model.build_vocab(corpus)
total_examples = model.corpus_count
model.build_vocab([list(warmstart_embeddings.vocab.keys())], update=True)
model.intersect_word2vec_format(warmstarted_embeddings_path, binary=False, lockf=1.0)
model.train(corpus, total_examples = total_examples, epochs=model.epochs, compute_loss = True)

  0%|          | 10000/11237524 [00:00<01:23, 135042.24it/s]
[ ] Starting Word2Vec training...
[ ] Loss after epoch 0: 215512.0625
[ ] Loss after epoch 1: 401047.5
[ ] Loss after epoch 2: 573787.5
[ ] Loss after epoch 3: 738846.3125
[ ] Loss after epoch 4: 896090.9375
[ ] Loss after epoch 5: 1029771.0625
[ ] Loss after epoch 6: 1176073.375
[ ] Loss after epoch 7: 1318633.0
[ ] Loss after epoch 8: 1459131.25
[ ] Loss after epoch 9: 1598792.25
[ ] Word2Vec finished training in 1.4460091590881348 seconds


(2210060, 2210060)

In [78]:
print("[ ] Initializing model...")
model = Word2Vec(size=embed_dim, 
                 iter = 11,
                 min_count=1, 
                 compute_loss = True, 
                 callbacks=[ModelCheckpoint(checkpoints_path), LossLogger(train_log_path)]
                )

[ ] Initializing model...


In [79]:
print("[ ] Loading warmstarted embeddings...")
warmstart_embeddings = KeyedVectors.load_word2vec_format(warmstarted_embeddings_path, binary=False)
len(warmstart_embeddings.vocab.keys())

[ ] Loading warmstarted embeddings...


2323

In [80]:
print("[ ] Building vocab...")
time.sleep(1)
corpus = single_file_corpus(paracrawl_crosslingual_all_data_path)
model.build_vocab(corpus)
total_examples = model.corpus_count

[ ] Building vocab...
  0%|          | 10000/11237524 [00:00<02:16, 82133.03it/s]


In [82]:
print("[ ] Updating vocab...")
model.build_vocab([list(warmstart_embeddings.vocab.keys())], update=True)

[ ] Updating vocab...


In [75]:
print("[ ] Warm-starting embeddings...")
model.intersect_word2vec_format(warmstarted_embeddings_path, binary=False, lockf=0)

[ ] Warm-starting embeddings...


In [76]:
model.train(corpus, total_examples=total_examples, epochs=model.iter, compute_loss = True)

[ ] Starting Word2Vec training...
[ ] Loss after epoch 0: 0
[ ] Loss after epoch 1: 0
[ ] Loss after epoch 2: 0
[ ] Loss after epoch 3: 0
[ ] Loss after epoch 4: 0
[ ] Loss after epoch 5: 0
[ ] Loss after epoch 6: 0
[ ] Loss after epoch 7: 0
[ ] Loss after epoch 8: 0
[ ] Loss after epoch 9: 0
[ ] Loss after epoch 10: 0
[ ] Word2Vec finished training in 0.04525399208068848 seconds


(0, 0)

In [None]:
Word2Vec?