In [1]:
import numpy as np
import sys
import time
import random
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import torch
from copy import deepcopy
import pickle 

# Make the project package importable without a machine-specific absolute path.
# NOTE(review): assumes the notebook is executed from a directory located directly
# under the repository root -- the same assumption the relative "../data/..." and
# "../word2vec_eltdm/models/..." paths used further down already make. Confirm.
from pathlib import Path

REPO_ROOT = str(Path("..").resolve())
if REPO_ROOT not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(REPO_ROOT)

# Enable the autoreload extension in mode 2, so that edits made to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Draw matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from word2vec_eltdm.word2vec_numpy import (
    Tokenizer, VocabCreator, DataLoader, TokenCleaner, Preprocessor,
    Subsampler, SimpleWord2Vec, Optimizer, CrossEntropy, NegWord2Vec,
    NegativeSamplingLoss, OptimizeNSL, evaluate, visualization_tsne, train_NSL,
    update_best_loss
)

## Get data and create vocab

In [3]:
# Seed the global random number generators before any data processing so that
# steps drawing random numbers (presumably subsampling, weight initialisation
# and negative sampling) give the same result on every Restart & Run All.
# NOTE(review): this only has an effect if the project code uses the global
# `random` / `np.random` state -- confirm against the package.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Location of the raw corpus, relative to the notebook's working directory.
datapath = "../data/text8.txt"

# Settings forwarded to the Preprocessor. With these values the recorded run
# reports empty validation and test token lists.
RATIO = 1
return_only_train = True

# Pipeline pieces: tokenizer reads `datapath`, the cleaner is built with
# freq_threshold=5, and the vocab creator takes no arguments.
tokenizer = Tokenizer(datapath)
token_cleaner = TokenCleaner(freq_threshold=5)
vocab_creator = VocabCreator()

# Run the preprocessing and keep the resulting dataset object, which is used
# by every later cell (tokens_to_id, id_to_tokens, train/val/test tokens).
text8_dataset = Preprocessor(tokenizer, token_cleaner, vocab_creator, RATIO, return_only_train).preprocess()

In [4]:
# Number of entries in the token -> id mapping built during preprocessing.
vocab_size = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary:", vocab_size)

Size of our vocabulary: 63492


In [5]:
# Length of the training token sequence before any subsampling is applied.
n_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in our train dataset:", n_train_tokens)

Number of tokens in our train dataset: 10566033


In [6]:
# Length of the validation token sequence (0 in the recorded run).
n_val_tokens = len(text8_dataset.val_tokens)
print("Number of tokens in our val dataset:", n_val_tokens)

Number of tokens in our val dataset: 0


In [7]:
# Length of the test token sequence (0 in the recorded run).
n_test_tokens = len(text8_dataset.test_tokens)
print("Number of tokens in our test dataset:", n_test_tokens)

Number of tokens in our test dataset: 0


## Subsampling of frequent words, as in Mikolov et al. (2013)

In [8]:
# Subsample the training tokens and store the results back on the dataset.
# NOTE(review): this overwrites `text8_dataset.train_tokens` in place, so
# re-running the cell subsamples the already-subsampled tokens again.
subsampler = Subsampler(text8_dataset.train_tokens)
subsampled_tokens, token_frequencies = subsampler.subsample()
text8_dataset.train_tokens = subsampled_tokens
text8_dataset.frequencies = token_frequencies

In [9]:
# NOTE(review): the subsampling cell reassigns only `train_tokens` and
# `frequencies`, so `tokens_to_id` is presumably untouched and this prints the
# same figure as before subsampling -- confirm that is the intended measure.
vocab_size_after_subsampling = len(text8_dataset.tokens_to_id)
print("Size of our vocabulary after subsampling of frequent words, for train:", vocab_size_after_subsampling)

Size of our vocabulary after subsampling of frequent words, for train: 63492


In [10]:
# Length of the training token sequence once subsampling has been applied.
n_subsampled_train_tokens = len(text8_dataset.train_tokens)
print("Number of tokens in train dataset:", n_subsampled_train_tokens)

Number of tokens in train dataset: 3767955


## DataLoader

In [11]:
# Batching settings handed to the DataLoader (presumably the context window
# size and the number of examples per batch -- names suggest so, confirm).
window, batch_size = 5, 1024

# Loader over the (subsampled) training tokens, consumed by the training loop.
train_dataloader = DataLoader(
    text8_dataset,
    text8_dataset.train_tokens,
    window,
    batch_size,
)

## Model

In [12]:
# Model and training parameters.
# len_vocab and embedding_size are passed positionally to NegWord2Vec,
# n_samples is forwarded to train_NSL, BEST_VAL_LOSS is given to the model as
# `best_val_loss`, and epochs sets the length of the training loop.
len_vocab = len(text8_dataset.tokens_to_id)
embedding_size = 300
learning_rate = 3e-3 # alternative value left by the author: 5e-2
n_samples = 5
BEST_VAL_LOSS = 2.83
epochs = 10

# Hyperparameters for the optimizer (both are passed to OptimizeNSL).
decay_rate = learning_rate / epochs
method = "none" # one of: "none", "exp_decay", "step_decay", "time_based"

# Noise distribution given to the model: normalised word frequencies raised to
# the power 0.75, then renormalised so they sum to 1.
# NOTE(review): frequencies are sorted in descending order, so entry i only
# corresponds to word id i if ids were assigned by descending frequency -- confirm.
freq_values = list(text8_dataset.frequencies.values())
freq_values.sort(reverse=True)
word_freqs = np.array(freq_values)
unigram_dist = word_freqs / word_freqs.sum()
smoothed_dist = unigram_dist ** 0.75
noise_dist = smoothed_dist / smoothed_dist.sum()

# Build the NegWord2Vec model and initialise its weights.
model_kwargs = dict(noise_dist=noise_dist, best_val_loss=BEST_VAL_LOSS)
model = NegWord2Vec(len_vocab, embedding_size, **model_kwargs)
model.initialize_weights()

# Optional: resume from a previously pickled model instead of the fresh one
# above. Left disabled; uncomment the three lines below to use it.

#with open("../word2vec_eltdm/models/NegWord2Vec_4.743055158391199.p", "rb") as file:
#    model = pickle.load(file)
#model = model["NegWord2Vec"]

# Loss and optimizer objects consumed by the training loop.
criterion = NegativeSamplingLoss()
optimizer = OptimizeNSL(model, learning_rate, decay_rate, method)

## Training

In [13]:
# Train for `epochs` passes; after each pass record the loss, update the best
# model and the learning rate, and print a sample of word similarities.
train_loss_history = []
tbar = trange(epochs)

for epoch in tbar:
    print(f"###################### EPOCH {epoch} ###########################")

    # One call to train_NSL per epoch; its return value is logged as the loss.
    train_loss = train_NSL(model, train_dataloader, criterion, optimizer, n_samples)
    train_loss_history.append(train_loss)
    print("Training loss:", train_loss)

    # Let the helper decide whether this epoch's loss is the best seen so far.
    update_best_loss(model, train_loss)

    # Learning-rate schedule step (behaviour depends on `method`).
    optimizer.update_lr(epoch)

    # Inspect the current embedding matrix on 20 words.
    embeddings = model.W1
    evaluate(embeddings, text8_dataset.id_to_tokens, nb_words=20)

  0%|          | 0/10 [00:00<?, ?it/s]

###################### EPOCH 0 ###########################


  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 4.1590862837596445
year | christi, smuggling, wollstonecraft, shooters, robin
x | transpired, sudden, sdr, strata, airs
external | resolution, comets, racks, starships, elsa
century | chos, testimonium, internationalized, condensing, gametophyte
links | discos, wife, lethal, outfielder, adsorption
english | yasunori, observer, throats, nborn, systematics
would | two, washington, tend, zero, mesaoria
states | desolation, slopes, toyota, carey, christiaan
united | tempo, invasive, inferior, eight, schmitz
series | wray, vernor, droit, dysprosium, barnum
remaining | andhra, kempe, mifepristone, kalevala, incidences
reached | motorway, hydrozoa, quoting, serco, evan
animal | singhasari, videos, anderssen, hogg, dumbarton
billion | gaited, rejection, relapses, bbc, sociologists
signed | nec, megali, blockades, volcanism, amazing
individuals | referendum, bahmani, clitoris, randle, hydroponic
hebrew | dpp, scatter, started, michio, reverses
location | deftones, coordinates, ot

  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 4.085461579720116
century | zero, two, six, four, nine
would | two, zero, one, five, six
example | zero, two, three, one, seven
english | zero, four, six, one, two
states | one, zero, four, nine, three
known | five, zero, seven, three, one
like | zero, one, five, three, four
different | seven, zero, five, one, eight
much | one, two, zero, seven, five
war | zero, seven, also, one, six
wars | zero, three, one, seven, two
computers | nine, king, five, second, zero
medicine | smaller, usually, within, nine, h
motion | three, four, one, set, according
translation | often, may, even, five, would
supported | two, seven, three, one, called
martin | five, zero, many, one, six
users | zero, nine, though, six, first
electric | house, one, two, published, considered
bridge | one, would, popular, zero, often
###################### EPOCH 2 ###########################


  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 3.614056432495379
history | made, include, modern, series, non
early | external, american, important, new, part
language | known, said, time, r, north
different | e, left, features, seven, way
also | r, war, william, known, another
name | even, others, seen, long, zero
well | following, many, groups, put, take
since | often, four, american, much, number
five | known, people, e, following, human
see | many, death, later, found, named
lack | example, political, r, external, time
billion | long, others, found, many, th
regarded | called, included, come, several, period
table | often, see, time, system, associated
internal | even, article, life, important, recent
response | part, external, would, known, also
hebrew | started, even, seven, home, return
composer | french, make, less, two, five
location | history, greek, even, may, six
translation | even, may, often, term, french
###################### EPOCH 3 ###########################


  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 3.180150290967018
time | river, range, meaning, subsequent, authorities
several | know, published, children, fit, administrative
would | al, serious, friend, washington, bill
see | across, administrative, eye, categories, identical
based | al, talk, experimental, well, queen
may | temporary, civilization, steve, distinct, record
country | speak, austrian, initiative, vi, helps
list | visible, guide, reversed, persian, players
also | destruction, von, favor, detailed, chemistry
united | club, india, identified, acquired, founding
clear | air, evolved, novelist, marriage, aside
round | passing, block, might, finished, preceding
computers | distribution, linear, sweet, collaboration, king
province | sir, carl, le, tv, el
ship | sources, gradually, dry, competition, evidence
medicine | theme, rule, invented, el, smaller
daughter | talent, expanded, roots, author, bc
speech | contrary, collectively, dry, reference, run
environment | honor, novels, russian, least, book
beyond 

  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 3.0162638267432844
states | slopes, river, carey, converting, demise
language | standing, camps, festival, visited, joy
year | smuggling, artistic, christi, hero, robin
de | intensity, occupied, henri, witnessed, penguin
new | rites, resurrection, scene, fiber, se
system | periodic, nose, embassy, associates, referendum
high | proclaiming, dr, terminated, shores, herd
number | hunter, sick, engraving, sizes, commonly
state | bishops, denounced, kent, successors, fearing
years | caution, beta, chemical, ottawa, starts
appointed | display, triple, dreams, designated, intensely
reading | object, resource, parameter, shepherd, bishop
supported | terrible, gases, mp, gear, franklin
consider | england, quit, heated, laureate, actress
conflict | sir, immense, bottom, hang, suffix
martin | decisive, dr, satellites, mr, joseph
environment | novels, rescue, honor, assets, dioxide
sexual | images, degree, gustave, du, smell
beyond | macmillan, consequences, become, laboratory, circ

  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 2.9416344074863687
common | militias, accent, sino, solstice, downloaded
c | right, carol, proximity, lots, beverly
american | coastline, masterpiece, comprises, eventual, nottingham
would | rape, found, serious, charm, atlantis
state | bishops, tacitus, coincide, fearing, denounced
f | attained, dar, console, eventual, fuller
de | intensity, frankish, curb, occupied, henri
eight | vested, macintosh, revolutionary, violinist, moderate
english | observer, estimation, query, mercy, agents
first | ghost, valuable, cited, marcel, corresponded
temperature | purchases, sees, guided, orthodox, absorption
sold | beaten, employment, fruits, propaganda, trajectory
sexual | images, secondly, degree, gustave, smell
committee | firstly, mellon, bald, heaven, easter
motion | indiana, natalie, grades, simplicity, thriller
consider | england, quit, assorted, dining, peruvian
ed | consistency, minimize, princes, directing, precede
station | lord, heated, neal, rhythm, slashdot
martin | d

  0%|          | 0/3680 [00:00<?, ?it/s]

Training loss: 2.901502042916605
south | uttered, richter, immersion, dod, evangelicalism
game | handicrafts, drought, authorizing, put, interstellar
zero | many, native, political, eating, including
although | mess, pink, hemingway, steadfast, taipei
n | gunfire, really, filmmaking, busiest, youthful
modern | critique, analyst, slash, fermentation, habitats
new | rites, scene, resurrection, palmer, index
number | engraving, lawn, commonly, residue, fertilized
links | outfielder, pornographic, myanmar, vedic, swallow
american | coastline, nottingham, masterpiece, comprises, oats
calendar | prompting, cm, nigeria, kafka, empiricism
ed | consistency, precede, minimize, princes, cigarette
sold | umpires, beaten, josh, trajectory, lung
conflict | mez, hang, titanium, parry, immense
kind | aq, send, frankish, fats, adequately
users | retiring, uv, westminster, xiii, gottfried
purpose | intellectuals, extraordinary, rival, leads, fuselage
carbon | meiji, girolamo, tilted, cock, tactical
inte

  0%|          | 0/3680 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
# Re-run the similarity check on the embeddings left by the training loop.
evaluate(
    embeddings,
    text8_dataset.id_to_tokens,
    nb_words=20,
)

second | phonemic, known, yeast, pitched, cortex
e | faithfulness, textile, force, wide, meditating
form | tended, force, finding, jars, receipt
history | spit, extradition, unofficially, adolescent, romances
many | zero, see, well, three, first
state | influence, tacitus, fearing, bishops, polite
five | yards, people, known, procedures, ut
different | tl, deformed, rfc, utrecht, rejecting
common | militias, solstice, liszt, counteract, sino
eight | every, vested, zero, revolutionary, salinity
environment | dive, chasing, dioxide, honor, assets
noted | testify, conspiring, sensations, spending, unfavorable
temperature | shlomo, purchases, tokugawa, boehm, vicar
performed | verona, white, asimov, repeat, calculus
values | lemma, kt, extravagant, affiliations, surveyor
wars | vegetarian, elders, receives, deputies, avec
miles | owed, nig, frankie, blondie, canine
approximately | proportions, militant, appended, presidential, shinto
province | conspicuous, tooth, qui, informs, stem
speaki

In [None]:
# Deliberate stop marker: halts "Run All" here. The original bare `break` did
# this by triggering a SyntaxError ('break' outside loop); raising explicitly
# keeps the cell valid Python and states the reason.
raise RuntimeError(
    "Intentional stop: the cells below load a previously saved model from disk; "
    "run them manually."
)

## Evaluation on the task of word similarities

In [None]:
# Load a previously saved model file.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
filepath = "../word2vec_eltdm/models/NegWord2Vec_4.753405412967128.p"
with open(filepath, "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# The pickle loaded above stores the model under the "NegWord2Vec" key; unwrap
# it. The guard makes the cell safe to re-run: once `model` is already a
# NegWord2Vec instance, subscripting it again would raise.
if not isinstance(model, NegWord2Vec):
    model = model["NegWord2Vec"]

# Use the best embedding matrix recorded on the model for the evaluation below.
embeddings = model.best_W1

### Evaluate using cosine similarity

In [None]:
# Similarity check on the embeddings taken from the loaded model, on 20 words.
evaluate(
    embeddings,
    text8_dataset.id_to_tokens,
    nb_words=20,
)

### t-SNE embedding visualisation

In [None]:
# t-SNE plot of the embeddings, restricted to 400 words.
visualization_tsne(
    embeddings,
    text8_dataset.id_to_tokens,
    nb_words=400,
)