In [1]:
from pathlib import Path
import sys
project_dir = str( Path().resolve().parent )
sys.path += [ project_dir ]
from examples.embeddings.word2vec import SkipGramReader

In [2]:
reader = SkipGramReader()
text8 = reader.read('https://realworldnlpbook.s3.amazonaws.com/data/text8/text8')

In [3]:
from collections import Counter
 
import torch
import torch.optim as optim
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
#Note type in text 
#from allennlp.training.trainer import GradientDescentTrainer
from allennlp.training import GradientDescentTrainer
from torch.nn import CosineSimilarity
from torch.nn import functional
 
EMBEDDING_DIM = 256
BATCH_SIZE = 256

In [4]:
#vocab = Vocabulary.from_instances(
#    text8, min_count={'token_in': 5, 'token_out': 5})

In [5]:
#data_loader = SimpleDataLoader(text8, batch_size=BATCH_SIZE)
#data_loader.index_with(vocab)

In [6]:
#embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
#                         embedding_dim=EMBEDDING_DIM)

In [7]:
class SkipGramModel(Model):
    def __init__(self, vocab, embedding_in):
        super().__init__(vocab)
        
        self.embedding_in = embedding_in
        self.linear = torch.nn.Linear(
            in_features=EMBEDDING_DIM,
            out_features=vocab.get_vocab_size('token_out'),
            bias=False)
    
    def forward(self, token_in, token_out):
        embedded_in = self.embedding_in(token_in)
        logits = self.linear(embedded_in)
        loss = functional.cross_entropy(logits, token_out)
        return {'loss': loss}

In [8]:
reader = SkipGramReader()
text8 = reader.read("https://realworldnlpbook.s3.amazonaws.com/data/text8/text8")
 
vocab = Vocabulary.from_instances(
    text8, min_count={'token_in': 5, 'token_out': 5})
 
data_loader = SimpleDataLoader(text8, batch_size=BATCH_SIZE)
data_loader.index_with(vocab)
 
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)

model = SkipGramModel(vocab=vocab,
                      embedding_in=embedding_in)
optimizer = optim.Adam(model.parameters())

trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=data_loader,
    num_epochs=5,
#SET cuda_device = -1 to specify no GPU
    cuda_device=-1)
trainer.train()

building vocab: 0it [00:00, ?it/s]

Your label namespace was 'token_in'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.
Your label namespace was 'token_out'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

{'best_epoch': 4,
 'peak_worker_0_memory_MB': 1501.828125,
 'training_duration': '0:00:00.506435',
 'epoch': 4,
 'training_loss': 0.0,
 'training_worker_0_memory_MB': 1501.828125}

3.3. Method to obtain related words using Word embeddings

In [14]:
def get_related(token: str, embedding: Model, vocab: Vocabulary, 
                num_synonyms: int = 10):
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()
 
    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim
 
    return sims.most_common(num_synonyms)


In [19]:
get_related('one', embedding_in, vocab)

[('one', 1.0),
 ('skewed', 0.24608801305294037),
 ('jacque', 0.24332156777381897),
 ('decennia', 0.24042947590351105),
 ('fodder', 0.23265540599822998),
 ('memorandum', 0.22695478796958923),
 ('chibia', 0.22599883377552032),
 ('freeways', 0.22321894764900208),
 ('authority', 0.2220715880393982),
 ('smallest', 0.22094282507896423)]

In [18]:
get_related('chess', embedding_in, vocab)

[('chess', 1.0),
 ('crudus', 0.2711144685745239),
 ('freemasonry', 0.26305800676345825),
 ('pitted', 0.2609158754348755),
 ('directories', 0.23276613652706146),
 ('archives', 0.23150384426116943),
 ('kirillovich', 0.22892335057258606),
 ('takeji', 0.2265753597021103),
 ('junta', 0.22534406185150146),
 ('pluto', 0.2240152657032013)]

In [20]:
get_related('football', embedding_in, vocab)

[('football', 1.0),
 ('ranged', 0.3013802170753479),
 ('irrational', 0.24819530546665192),
 ('sabine', 0.2465231567621231),
 ('countrymen', 0.23291805386543274),
 ('gangadhar', 0.22183726727962494),
 ('petrie', 0.22163599729537964),
 ('scorn', 0.21885591745376587),
 ('specialised', 0.21760006248950958),
 ('hello', 0.21522819995880127)]

Despite what it says in the book, these results seem terrible to me! <br>
There are improvents  that can be applied, but the text goes on to explain that GloVe is better on the whole so probably not worth investigating.