In [1]:
import glob,os
import sys
import pandas as pd
import string
import numpy as np
from tqdm import tqdm
from time import sleep
import sys
from gensim.corpora import WikiCorpus
from multiprocessing import Pool
import json
import pickle
from collections import defaultdict
import gc
import torch
from scipy import sparse
from scipy.sparse import dok_matrix

In [2]:
class Vocab:
    PAD_TOKEN = '<pad>'

    def __init__(self):
        self.w2i = {}
        self.i2w = []
        self.support = []
        self.add_token(Vocab.PAD_TOKEN)
        self.cached_neg_sample_prob = None

    def pad_id(self):
        return self.get_id(Vocab.PAD_TOKEN)

    def add_tokens(self, tokens):
        for tidx, token in enumerate(tokens):
            self.add_token(token)

    def add_token(self, token, token_support=1):
        if token not in self.w2i:
            self.w2i[token] = len(self.i2w)
            self.i2w.append(token)
            self.support.append(0)
        self.support[self.get_id(token)] += token_support

    def neg_sample(self, size=None):
        if self.cached_neg_sample_prob is None:
            support = np.array(self.support)
            support_raised = np.power(support, 0.75)
            support_raised[0] = 0.0  # Never select padding idx
            self.cached_neg_sample_prob = support_raised / support_raised.sum()
        return np.random.choice(np.arange(self.size()), size=size, p=self.cached_neg_sample_prob)

    def get_id(self, token):
        if token in self.w2i:
            return self.w2i[token]
        return -1

    def id_count(self, id):
        return self.support[id]

    def token_count(self, token):
        return self.id_count(self.get_id(token))

    def get_ids(self, tokens):
        return list(map(self.get_id, tokens))

    def get_token(self, id):
        return self.i2w[id]

    def size(self):
        return len(self.i2w)

In [6]:
class extract_data:

    def __init__(self,data_path,path,keep_prob):
        self.X = []
        self.vocab = []
        self.tokens = []
        self.keep_prob = []
        self.subsampled_ids = []
        self.tokenized_subsampled_data = []
        self.data_path = data_path
        self.path = path
        self.keep_prob = keep_prob
        
    def make_corpus(self,in_f, out_f):
        print("Loading data")
        wiki = WikiCorpus(in_f)
        print("Processing...")
        i = 1
        for document in wiki.get_texts():
            out_f_ = ''.join([out_f,'\\wiki_en_',str(i),".txt"])
            output = open(out_f_, 'w',encoding='utf-8')
            output.write(' '.join(document))
            output.close()
            i+=1
            if i%10000 == 0:  
                print(i,"documents procecessed")
        print('Processing complete!')

    def read_data(self):
            X = []
            keep_prob = self.keep_prob
            print("Reading txt data")
            os.chdir(self.data_path)
            N = len(glob.glob("*.txt"))
            #choice = np.random.choice(N, sample_size, replace=False)+1
            for i in tqdm(range(1,N+1), position=0, leave=True):
                if np.random.binomial(1, keep_prob):
                    text = ''.join([self.data_path,"\\wiki_en_",str(i),".txt"])
                    with open(text,'r',encoding='utf-8') as f:
                        X.append(''.join(list(f)).lower())
                else:
                    pass
            print("Saving sampled wiki data, sample rate:",keep_prob)
            with open(self.path + 'Wiki_tokenized{}.json'.format(''), 'w') as fd:
                json.dump(list(zip(range(len(X)),X)), fd)
    
    def token_counts(self):
        print("Generating token counts")
        token_cts = defaultdict(int)
        doc_id = 0
        token_counts_fn = self.path + 'Wiki_tokenized{}.json'.format('')
        with open(token_counts_fn, 'r') as fd:
            X = json.load(fd)
        for doc in X:
            for token in doc[1].split():
                token_cts[token] += 1
                token_cts['__ALL__'] += 1
        print("Saving token counts")
        with open(self.path + 'Wiki_token_counts{}.json'.format(''), 'w') as fd:
            json.dump(token_cts, fd)
    
    def subsample(self):
        print("Sub-sampling tokens...")
        tokenized_fp = self.path + 'Wiki_tokenized'
        token_counts_fp = self.path + 'Wiki_token_counts'
        subsample_param = 0.001
        min_token_count = 5
        debug_str = ''
        tokenized_data_fn = '{}{}.json'.format(tokenized_fp, debug_str)
        with open(tokenized_data_fn, 'r') as fd:
            tokenized_data = json.load(fd)
        token_counts_fn = '{}{}.json'.format(token_counts_fp, debug_str)
        with open(token_counts_fn, 'r') as fd:
            token_counts = json.load(fd)
        N = float(token_counts['__ALL__'])
        # And vocabulary with word counts
        self.vocab = Vocab()
        num_docs = len(tokenized_data)
        for doc_idx in tqdm(range(num_docs), position=0, leave=True):
            category, tokenized_doc_str = tokenized_data[doc_idx]
            subsampled_doc = []
            for token in tokenized_doc_str.split():
                wc = token_counts[token]
                too_sparse = wc <= min_token_count
                if too_sparse:
                    continue
                frac = wc / N
                keep_prob = min((np.sqrt(frac / subsample_param) + 1) * (subsample_param / frac), 1.0)
                should_keep = np.random.binomial(1, keep_prob) == 1
                if should_keep:
                    subsampled_doc.append(token)
                    self.vocab.add_token(token, token_support=1)
            self.tokenized_subsampled_data.append((category-1, ' '.join(subsampled_doc)))
            
    def tokens_to_ids(self):
        print("Converting tokens to ids...")
        for i in tqdm(range(len(self.tokenized_subsampled_data)), position=0, leave=True):
            self.subsampled_ids.append(np.asarray([self.vocab.get_id(x) for x in self.tokenized_subsampled_data[i][1].split()]))
        self.subsampled_ids = np.asarray(self.subsampled_ids)
    
    def token_doc_map(self):
        print("Forming token document matrix... ")
        self.token_doc_matrix = dok_matrix((self.vocab.size(),self.subsampled_ids.shape[0]), dtype=np.int16)
        for i in tqdm(range(self.subsampled_ids.shape[0]), position=0, leave=True):
            for token_id in self.subsampled_ids[i]:
                self.token_doc_matrix[token_id,i] +=1

In [7]:
in_f = "D:\Latent Meaning Cells\simplewiki-latest-pages-articles.xml.bz2"
out_f = 'D:\Latent Meaning Cells\simplewiki'

data_path =  "D:\\Latent Meaning Cells\\simplewiki" #path where you read txt files
path = "D:\\Latent Meaning Cells\\" #path where you output json files

wiki_data = extract_data(data_path,path,keep_prob = 1)
#wiki_data.make_corpus(in_f, out_f)
try:
    p = Pool(processes=10)
    p.apply(wiki_data.read_data()) #run if you want to read txt files
except:
    p.close()
    wiki_data.token_counts() #if you already have json files start from here
    wiki_data.subsample()
    wiki_data.tokens_to_ids()
    wiki_data.token_doc_map()

Reading txt data


100%|████████████████████████████████████████████████████████████████████████| 100527/100527 [00:51<00:00, 1942.64it/s]


Saving sampled wiki data, sample rate: 1
Generating token counts
Saving token counts
Sub-sampling tokens...


100%|█████████████████████████████████████████████████████████████████████████| 100527/100527 [02:32<00:00, 660.44it/s]
  0%|▎                                                                          | 404/100527 [00:00<00:24, 4011.08it/s]

Converting tokens to ids...


100%|████████████████████████████████████████████████████████████████████████| 100527/100527 [00:10<00:00, 9495.61it/s]
  0%|                                                                             | 3/100527 [00:00<1:01:17, 27.34it/s]

Forming token document matrix... 


100%|█████████████████████████████████████████████████████████████████████████| 100527/100527 [11:05<00:00, 150.97it/s]


In [8]:
sub_tokens = wiki_data.subsampled_ids
vocab = wiki_data.vocab
token_doc_matrix = wiki_data.token_doc_matrix

In [9]:
#filename = "D:\Latent Meaning Cells\\vocab.obj" 
#file_pi = open(filename, 'wb')
#pickle.dump(vocab, file_pi)

If already extracted:

In [10]:
#filename = "D:\Latent Meaning Cells\sub_tokens.obj"
#filehandler = open(filename, 'rb')
#sub_tokens = pickle.load(filehandler)

In [11]:
#filename = "D:\Latent Meaning Cells\\vocab.obj"
#filehandler = open(filename, 'rb')
#vocab = pickle.load(filehandler)

In [12]:
class batcher:

    def __init__(self,batch_size,window_size,vocab,sub_tokens):

        self.batch_size = batch_size
        self.window_size = window_size
        self.vocab = vocab
        self.sub_tokens = sub_tokens
        
        self.vocab_tokens = np.linspace(1, vocab.size()-1, num=vocab.size()-1).astype(int)
        #self.prob = np.power(vocab.support[1:], 0.75)
        #self.prob = self.prob/np.sum(self.prob)
        
    def next(self):
        
        sub_tokens = self.sub_tokens
        batch_size = self.batch_size
        window_size = self.window_size
        #prob = self.prob
        
        center_words = np.zeros(batch_size)
        vocab_tokens =self.vocab_tokens
        num_contexts = np.zeros(batch_size)
        
        positive_words = np.zeros((batch_size,window_size*2))
        negative_words = np.zeros((batch_size,window_size*2))
        doc_ids = np.random.choice(len(sub_tokens),batch_size)
        len_docs = np.asarray([x.shape[0] for x in sub_tokens[doc_ids]])
        center_index = np.asarray([np.random.choice(x) for x in len_docs])
        upper_index = np.minimum(center_index+window_size,len_docs-1).astype(int)
        lower_index = np.maximum(center_index-window_size,np.zeros(batch_size)).astype(int)
            
        for i in range(batch_size):
        
            positive_sub_batch = np.linspace(lower_index[i],upper_index[i], num=upper_index[i]-lower_index[i]+1)
            positive_sub_batch = positive_sub_batch[positive_sub_batch != center_index[i]].astype(int)
            
            num_contexts[i] = positive_sub_batch.shape[0]
            
            document = sub_tokens[doc_ids[i]]
            positive_sub_batch = np.asarray([document[x] for x in positive_sub_batch]).astype(int)
            positive_words[i,:positive_sub_batch.shape[0]] = positive_sub_batch

            center_words[i] = document[center_index[i]]
            
            #negative_words_ = vocab_tokens[~np.isin(vocab_tokens, positive_sub_batch)]
            #negative_sampling_probability = prob[~np.isin(vocab_tokens, positive_sub_batch)]
            #negative_sampling_probability = negative_sampling_probability/np.sum(negative_sampling_probability)
            #negative_words[i] = np.random.choice(negative_words_, window_size*2, p=negative_sampling_probability).astype(int)
            
            negative_words[i] = vocab.neg_sample(window_size*2)
            
        return doc_ids.astype(int), center_words.astype(int), positive_words.astype(int), negative_words.astype(int),num_contexts.astype(int)

In [13]:
def mask_2D(target_size, num_contexts):
    mask = torch.BoolTensor(target_size)
    mask.fill_(0)
    for batch_idx, num_c in enumerate(num_contexts):
        if num_c < target_size[1]:
            mask[batch_idx, num_c:] = 1
    return mask

In [14]:
import numpy as np
import torch
import torch.nn as nn


class VAE(nn.Module):
    def __init__(self, device,encoder_input_dim,encoder_hidden_dim,latent_dim, token_vocab_size, section_vocab_size):
        super(VAE, self).__init__()
        self.device = device
        self.encoder = Encoder(encoder_input_dim,encoder_hidden_dim,latent_dim, token_vocab_size, section_vocab_size)
        self.margin = 1.0

    def forward(self, center_ids, section_ids, context_ids, neg_context_ids,num_context_ids):
        """
        :param center_ids: batch_size
        :param section_ids: batch_size
        :param context_ids: batch_size, 2 * context_window
        :param neg_context_ids: batch_size, 2 * context_window
        :param num_contexts: batch_size (how many context words for each center id - necessary for masking padding)
        :return: cost components: KL-Divergence (q(z|w,c) || p(z|w)) and max margin (reconstruction error)
        """
        # Mask padded context ids
        batch_size, num_context_ids = context_ids.size()
        mask_size = torch.Size([batch_size, num_context_ids])
        mask = mask_2D(mask_size, num_contexts).to(self.device)

        # Compute center words
        mu_center, sigma_center = self.encoder(center_ids, section_ids)
        mu_center_tiled = mu_center.unsqueeze(1).repeat(1, num_context_ids, 1)
        sigma_center_tiled = sigma_center.unsqueeze(1).repeat(1, num_context_ids, 1)
        mu_center_flat = mu_center_tiled.view(batch_size * num_context_ids, -1)
        sigma_center_flat = sigma_center_tiled.view(batch_size * num_context_ids, -1)

        # Tile section ids for positive and negative samples
        section_ids_tiled = section_ids.unsqueeze(-1).repeat(1, num_context_ids)

        # Compute positive and negative encoded samples
        mu_pos_context, sigma_pos_context = self.encoder(context_ids, section_ids_tiled)
        mu_neg_context, sigma_neg_context = self.encoder(neg_context_ids, section_ids_tiled)
        
        # Flatten positive context
        mu_pos_context_flat = mu_pos_context.view(batch_size * num_context_ids, -1)
        sigma_pos_context_flat = sigma_pos_context.view(batch_size * num_context_ids, -1)

        # Flatten negative context
        mu_neg_context_flat = mu_neg_context.view(batch_size * num_context_ids, -1)
        sigma_neg_context_flat = sigma_neg_context.view(batch_size * num_context_ids, -1)

        # Compute KL-divergence between center words and negative and reshape
        kl_pos_flat = compute_kl(mu_center_flat, sigma_center_flat, mu_pos_context_flat, sigma_pos_context_flat)
        kl_neg_flat = compute_kl(mu_center_flat, sigma_center_flat, mu_neg_context_flat, sigma_neg_context_flat)
        kl_pos = kl_pos_flat.view(batch_size, num_context_ids)
        kl_neg = kl_neg_flat.view(batch_size, num_context_ids)

        hinge_loss = (kl_pos - kl_neg + self.margin).clamp_min_(0)
        hinge_loss = hinge_loss.masked_fill(mask, 0)
        hinge_loss = hinge_loss.sum(1)
        return hinge_loss.mean()

In [15]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import torch.utils.data
 
class Encoder(nn.Module):
    def __init__(self, encoder_input_dim,encoder_hidden_dim,latent_dim, token_vocab_size, section_vocab_size):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(0.2)
        self.f = nn.Linear(encoder_input_dim * 2, encoder_hidden_dim, bias=True)
        self.u = nn.Linear(encoder_hidden_dim, latent_dim, bias=True)
        self.v = nn.Linear(encoder_hidden_dim, 1, bias=True)
    
        self.token_embeddings = nn.Embedding(token_vocab_size, encoder_input_dim, padding_idx=0)
        self.token_embeddings.weight.data.uniform_(-2, 2)
        self.section_embeddings = nn.Embedding(section_vocab_size, encoder_input_dim)
        self.section_embeddings.weight.data.uniform_(-2, 2)
        
    def forward(self, center_ids, section_ids):
        """
        :param center_ids: LongTensor of batch_size
        :param context_ids: LongTensor of batch_size
        :param mask: BoolTensor of batch_size x 2 * context_window (which context_ids are just the padding idx)
        :return: mu (batch_size, latent_dim), logvar (batch_size, 1)
        """
        center_embedding = self.token_embeddings(center_ids)
        section_embedding = self.section_embeddings(section_ids)
            
        merged_embeds = self.dropout(torch.cat([center_embedding, section_embedding], dim=-1))
            
        h = self.dropout(F.relu(self.f(merged_embeds)))
        var_clamped = self.v(h).exp().clamp_min(1.0)
        return self.u(h), var_clamped

In [16]:
def compute_kl(mu_a, sigma_a, mu_b, sigma_b, device=None):
    """
    :param mu_a: mean vector of batch_size x dim
    :param sigma_a: standard deviation of batch_size x {1, dim}
    :param mu_b: mean vector of batch_size x dim
    :param sigma_b: standard deviation of batch_size x {1, dim}
    :return: computes KL-Divergence between 2 diagonal Gaussian (a||b)
    """
    var_dim = sigma_a.size()[-1]
    assert sigma_b.size()[-1] == var_dim
    if var_dim == 1:
        return kl_spher(mu_a, sigma_a, mu_b, sigma_b)
    return kl_diag(mu_a, sigma_a, mu_b, sigma_b, device=device)

In [17]:
def kl_spher(mu_a, sigma_a, mu_b, sigma_b):
    """
    :param mu_a: mean vector of batch_size x dim
    :param sigma_a: standard deviation of batch_size x 1
    :param mu_b: mean vector of batch_size x dim
    :param sigma_b: standard deviation of batch_size x 1
    :return: computes KL-Divergence between 2 spherical Gaussian (a||b)
    """
    d = mu_a.shape[1]
    sigma_p_inv = 1.0 / sigma_b  # because diagonal
    tra = d * sigma_a * sigma_p_inv
    quadr = sigma_p_inv * torch.pow(mu_b - mu_a, 2).sum(1, keepdim=True)
    log_det = - d * torch.log(sigma_a * sigma_p_inv)
    res = 0.5 * (tra + quadr - d + log_det)
    return res

In [18]:
device="cuda"
encoder_input_dim = 100
encoder_hidden_dim = 64
latent_dim = 100

token_vocab_size = vocab.size()
section_vocab_size = sub_tokens.shape[0]

model = VAE(device,encoder_input_dim,encoder_hidden_dim,latent_dim, token_vocab_size, section_vocab_size).to(device)

trainable_params = filter(lambda x: x.requires_grad, model.parameters())
optimizer = torch.optim.Adam(trainable_params, lr=0.01)
optimizer.zero_grad()

In [19]:
model.load_state_dict(torch.load( "D:\\Latent Meaning Cells\\checkpoint.pth"))

<All keys matched successfully>

In [57]:
window_size = 5
batch_size = 4096*2
num_epoch = 300
num_contexts = batch_size
generator = batcher(batch_size,window_size,vocab,sub_tokens)


for epoch in range(1, num_epoch + 1):
    sleep(0.1)  # Make sure logging is synchronous with tqdm progress bar
    print('Starting Epoch={}'.format(epoch))
    generator = batcher(batch_size,window_size,vocab,sub_tokens)
    num_batches = batch_size
    loss_array = []
    epoc_loss = 0
    for _ in tqdm(range(int(vocab.size()/num_batches)), position=0, leave=True):
        # Reset gradients
        optimizer.zero_grad()

        section_ids,center_ids, context_ids, neg_ids,num_contexts = generator.next()
        
        center_ids_tens = torch.LongTensor(center_ids).to(device)
        context_ids_tens = torch.LongTensor(context_ids).to(device)
        section_ids_tens = torch.LongTensor(section_ids).to(device)
        neg_ids_tens = torch.LongTensor(neg_ids).to(device)

        loss = model(center_ids_tens, section_ids_tens, context_ids_tens, neg_ids_tens,num_contexts)
        loss.backward()  # backpropagate loss
        epoc_loss += loss.item()
    
        

        optimizer.step()
        
    loss_array.append(epoc_loss*num_batches/vocab.size())
    if loss_array[-1]<=min(loss_array):
        print("Saving...")
        torch.save(model.state_dict(), "D:\\Latent Meaning Cells\\checkpoint.pth")
        
    sleep(0.1)
    print('Epoch={}. Loss={}.'.format(epoch, loss_array[-1]))

  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Starting Epoch=1


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:26<00:00, 11.31s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=1. Loss=3.006491052982453.
Starting Epoch=2


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:26<00:00, 11.24s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=2. Loss=2.9857675854485968.
Starting Epoch=3


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:25<00:00, 11.23s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=3. Loss=2.9779395672451217.
Starting Epoch=4


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:29<00:00, 11.50s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=4. Loss=2.9667057744355.
Starting Epoch=5


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:21<00:00, 10.87s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=5. Loss=2.9701183571423697.
Starting Epoch=6


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:27<00:00, 11.33s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=6. Loss=2.96311836752339.
Starting Epoch=7


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:24<00:00, 11.15s/it]


Saving...


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Epoch=7. Loss=2.967714795400042.
Starting Epoch=8


  8%|██████▍                                                                            | 1/13 [00:20<04:06, 20.51s/it]


KeyboardInterrupt: 

In [100]:
vocab.get_id("women")

204

In [20]:
document_embeddings = np.zeros((sub_tokens.shape[0],100))
for i in range(sub_tokens.shape[0]):
    document_embeddings[i] = model.encoder.section_embeddings(torch.Tensor(np.asarray([i])).long().to("cuda")).data.to("cpu").numpy()

In [79]:
doc = document_embeddings[0]
doc = doc/np.sqrt(np.sum(np.dot(doc,doc)))
normalized = np.divide(document_embeddings,np.sqrt(np.sum(np.multiply(document_embeddings,document_embeddings),axis = 1)).reshape(-1,1))
dist = np.matmul(normalized,doc)

In [80]:
np.argsort(-dist)[:50]

array([    0,   337,   193, 78440,     1, 51418,    92,   229, 43836,
       57940,   265, 11100, 51831, 80349, 57861, 15042,  7742, 43432,
       72104,  4787,  5698, 25170,  5837,  5523, 53062,   281, 57939,
       15048, 16437,  6535,  5898, 97910, 57880,  6169, 57903, 56852,
       57937, 24100, 78234, 55724, 47547, 57925,  9370, 51566,  5694,
       57879, 15084, 57875,  4722,   131], dtype=int64)

## Similar articles, April

In [81]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[0]])) 

april fourth month the year and comes between march may one of four months have days april always begins on same day week july and additionally january leap years april always ends same day of week as december april flowers are sweet pea daisy its birthstone diamond meaning diamond is innocence month spring flowers april in northern hemisphere april comes between march and may making it fourth month of year comes first year out of four months that have days as june september november later year april begins on same day week as july every year on same day week as january leap years april ends same day week as december every year as each other last days are exactly weeks days apart common years april starts on the same day week as october of previous year in leap years may of previous year common years april finishes same day week july previous year leap years february and october previous year common years immediately after other common years april starts same day week as january of pre

In [82]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[337]])) 

september sep is ninth month of year gregorian calendar coming between august october it has days its name comes from the latin word sept seven it the seventh month year before january february were added beginning year september always begins same day of week as december but never ends on same day the week any other month month works progress administration poster from september the old roman calendar september was seventh month which where it got its name means seventh the ninth month at time november means ninth julius caesar calendar reform september became ninth month days september comes after august before october september begins same day week as december every year as each other first days are weeks days apart no other month any year common or leap year ends same day week september common years september starts on same day week april july the previous year leap years october previous year in common years september finishes same day week as april december the previous year and 

In [83]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[193]])) 

january jan first month year julian gregorian calendars coming between december previous year february of current year has days january begins same day the week as october common years april july leap years january ends same day week february october in common years july in leap years month snow in january northern hemisphere where winter month january named for janus roman god doors and gates january february were put on calendar after all other months this is because the original roman calendar winter did not have months although march was originally first month january became new first month because was when people chose new consuls roman leaders month has days january winter month northern hemisphere and summer month the southern hemisphere each hemisphere it is seasonal equivalent of july the perihelion point its orbit where earth is closest sun also occurs this month between january january january begins on same day week as october common years on same day the week as april july

## Similar articles, Footbal

In [62]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[9999]])) 

nürnberg is football club nuremberg germany it was founded may manager are andreas sports michael finance coach michael title bundesliga winners the dfb pokal league position season league position bundesliga champions bundesliga th bundesliga th bundesliga champions bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th bundesliga th former position famous players the club max morlock heinrich andreas köpke stefan reuter norbert eder dieter eckstein georg ferdinand heinz franz uli marek tomáš ivan robert vittek jan koller angelos charisteas david stefan kießling hiroshi coaches herbert jeno gunter baumann jeno max merkel robert körner kuno thomas barthel slobodan fritz zlatko hans horst werner kern robert gebhardt jeff robert gebhardt horst fritz popp fred hoffmann heinz udo klug rudi fritz popp heinz hermann gerland dieter arie haan willi dieter renner rainer günter herman

In [68]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[81252]])) 

in blue qualified play at euro these are twenty four national teams who played euro in france every national association must present before uefa list footballers which three of them must be goalkeepers ten days before first match that means th may footballer can be replaced another one if he is injured or sick although must be certified by uefa medical committee medical staff his football club these dispositions applied according art euro regulations teams group albany nationalteam goalkeeper defender midfield forward coach albany hoxha andi armando gianni de france nationalteam goalkeeper defender midfield forward coach france hugo christophe umtiti golo sissoko antoine pierre didier deschamps romania nationalteam goalkeeper defender midfield forward coach romania lung jr alexandru ovidiu popa denis stancu anghel switzerland nationalteam goalkeeper defender midfield forward coach switzerland yann stephan von schär fabian shaqiri breel vladimir petković group england nationalteam goal

In [65]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[34080]])) 

admira football club which plays austria admira wacker mödling formed from merger two football clubs admira wacker wien vfb mödling in admira wacker wien formed from merger two football clubs admira wien wacker wien name vfb admira wacker mödling present fc admira team left first row michael horvath markus patrick hans peter berger thomas benjamin günter christoph second row alexander friedl hans werner weiss georg georg goalkeeper coach ivan bernhard morgenthaler christopher captain team markus daniel drescher manuel paul gernot rene armin schiller manager richard chairman third row dietmar kühbauer rene max sax daniel toth richard ken noel bernhard schachner stephan palla patrik manfred assistance coach goalkeeper patrick hans peter berger thomas defence richard stephan palla gernot daniel drescher markus bernhard morgenthaler markus christopher michael horvath midfield paul patrik manu daniel wolf rene christoph ivan stefan schwab bernhard schachner daniel toth forward marcus benjam

## Similar articles weapons:

In [53]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[5192]])) 

nuclear missiles are missiles rockets that land earth or hit something air water or whatever with atomic bombs inside time cold war usa soviet union were enemies thought other side was going attack them to dissuade other side attacking both sides built many nuclear missiles to ensure mutual assured destruction this meant country that attacked would be destroyed treaties were made reduce the numbers missiles to make everyone safer


In [74]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[46029]]))

bushmaster acr the adaptive combat rifle acr is fully automatic assault rifle it designed magpul industries erie colorado late january bushmaster made licensing agreement magpul this agreement bushmaster would take over making development sales the acr the rifle developed over five months was supposed replace rifle completely it also supposed to do without money government prototypes were shown the shot show orlando florida it was supposed be released some time however bushmaster said may that acr would not be released to public until this because bushmaster were focusing more military projects november bushmaster released statement saying acr being redesigned be superior offering compete for next generation us army infantry carbine subcompact weapon requirement will be available to select customers acr one weapons shown army during an industry day on november this industry day happened so that modern carbines could be looked at before it decided what weapon that would replace carbine 

In [77]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[2141]]))

world war ii submarine uss sea owl submarine or sub vessel that goes underwater most large submarines war vessels some small ones used scientific or business purposes these are often called cannot go far or stay long away base some people buy their own explore under the sea submarine actually boat not ship history early submarines were often powered by hand this because boat engines had not been invented yet they were almost always made war submarines would try sink enemy ships crude methods these included drilling screws into their wooden hulls some tried blow up ship this often destroyed submarine too at end the th century whitehead torpedo electrical systems allowed much better submarines these better submarines were much used during first second world wars many these better submarines were created germany called boats submarines were powered by diesel electric system diesel motor would be used turn generator generator would charge large battery while submarine surface power battery

## Similar articles, Cities/towns of Germany

In [84]:
doc = document_embeddings[7889]
doc = doc/np.sqrt(np.sum(np.dot(doc,doc)))
normalized = np.divide(document_embeddings,np.sqrt(np.sum(np.multiply(document_embeddings,document_embeddings),axis = 1)).reshape(-1,1))
dist = np.matmul(normalized,doc)
np.argsort(-dist)[:50]

array([ 7889, 91626, 67366, 33432,  9092, 67530, 52788, 91283, 21207,
       20206,  4143, 94223, 83212,  3540, 18059, 19851, 91274, 21036,
       82880, 95643, 25365, 54547, 13134, 19940, 70292, 65995, 17897,
        7282, 17898, 86015, 25640, 18927, 97092, 80547, 17893, 84478,
       94987, 98830, 87368, 17895, 17201, 80206, 74150, 24563, 97230,
       19205, 80735, 76798, 95529, 17969], dtype=int64)

In [85]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[7889]]))

frankfurt oder or frankfurt an der oder is town in germany lies on oder river which marks current border between germany poland so called oder neisse line it east state brandenburg since january town added prefix refer heinrich von kleist who was born there today about people are living frankfurt an der oder well known people manuela schwesig born politician references


In [88]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[91626]]))

rüti can refer rüti glarus rüti zürich rüti bei büren berne rüti bei lyssach berne rüti bei riggisberg berne names the following places have the same origin municipality appenzell innerrhoden municipality st gallen municipality solothurn


In [89]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[33432]]))

dillingen district rural district swabia southwest bavaria germany neighbor districts are donau ries augsburg district günzburg district and baden württemberg heidenheim district dillingen capital dillingen district towns municipalities district landkreis dillingen numbers in map see also opposite list towns municipalities dillingen der donau an der donau an der donau bissingen schwenningen


In [90]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[9092]]))

weißenfels capital city old weißenfels rural district in saxony anhalt germany until district changes king saxony lived here at one time it is about km southwest leipzig population growth in number people over years data source from statistic national office saxony anhalt october august october references other websites


In [91]:
print(' '.join([vocab.get_token(x) for x in sub_tokens[67530]]))

state as shown map the deutsches reich the arms grand duchy grand duchy of oldenburg state europe it in area today is germany it created out smaller duchy of oldenburg the state created at congress vienna but ceased exist after german revolution in capital state oldenburg grand ducal family lived schloss oldenburg in capital


**Word embeddings:**

In [21]:
word_embeddings = np.zeros((vocab.size(),100))
for i in range(vocab.size()):
    word_embeddings[i] = model.encoder.token_embeddings(torch.Tensor(np.asarray([i])).long().to("cuda")).data.to("cpu").numpy()

In [69]:
word =  word_embeddings[1]
dist = word_embeddings - word.reshape(1,-1)
dist = np.multiply(dist,dist)
dist = np.sqrt(np.sum(dist,axis = 1))

In [None]:
for x in np.argsort(dist)[:50]:
    print(vocab.get_token(x))

In [70]:
word = word_embeddings[1]
word = word/np.sqrt(np.sum(np.dot(word,word)))
normalized = np.divide(word_embeddings,np.sqrt(np.sum(np.multiply(word_embeddings,word_embeddings),axis = 1)).reshape(-1,1))
dist = np.matmul(normalized,word)

In [71]:
for x in np.argsort(-dist)[:50]:
    print(vocab.get_token(x))

april
september
march
october
february
august
flowers
week
november
june
additionally
born
former
was
half
scientist
age
member
pancreatic
career
references
websites
director
received
central
she
kon
retired
land
worked
until
professionally
as
biography
complications
won
body
acted
award
actor
known
january
outlived
it
to
profile
nominated
started
best
film


**Expected Meaning: Test Zone**

In [48]:
wiki_data.token_doc_matrix[1].todense()

matrix([[228,   0,   0, ...,   0,   0,   0]], dtype=int16)

In [61]:
sparse.diags(1/token_doc_matrix.sum(axis=1).A.ravel())

  """Entry point for launching an IPython kernel.


<109876x109876 sparse matrix of type '<class 'numpy.float64'>'
	with 109876 stored elements (1 diagonals) in DIAgonal format>

In [24]:
token_doc_matrix = sparse.diags(1/token_doc_matrix.sum(axis=1).A.ravel()).dot(token_doc_matrix)

  """Entry point for launching an IPython kernel.


In [55]:
token_doc_matrix[3,0]

0.000552791597567717

In [58]:
token_doc_matrix[(1,0)]

0.010929485643066008

In [15]:
token_doc_matrix = dok_matrix(token_doc_matrix)

In [16]:
word_meaning = np.zeros((token_doc_matrix.shape[0],100))
for key in tqdm(token_doc_matrix.keys()):
    word_meaning[key[0]] += token_doc_matrix[key]*model.eval().encoder.forward(torch.Tensor(np.asarray([int(key[0])])).long().to("cuda"),torch.Tensor(np.asarray([int(key[1])])).long().to("cuda"))[0].data.to("cpu").numpy()[0]

100%|███████████████████████████████████████████████████████████████████| 11710802/11710802 [5:08:13<00:00, 633.24it/s]


In [147]:
vocab.get_id("anarchism")

10661

In [154]:
[vocab.get_token(x) for x in sub_tokens[0]]

['april',
 'fourth',
 'month',
 'year',
 'and',
 'comes',
 'between',
 'march',
 'and',
 'may',
 'it',
 'one',
 'four',
 'months',
 'have',
 'days',
 'april',
 'always',
 'begins',
 'same',
 'day',
 'week',
 'as',
 'july',
 'additionally',
 'january',
 'in',
 'leap',
 'years',
 'april',
 'always',
 'ends',
 'same',
 'day',
 'of',
 'week',
 'as',
 'december',
 'april',
 'flowers',
 'are',
 'sweet',
 'pea',
 'daisy',
 'its',
 'birthstone',
 'diamond',
 'meaning',
 'diamond',
 'innocence',
 'month',
 'spring',
 'flowers',
 'april',
 'northern',
 'hemisphere',
 'april',
 'comes',
 'between',
 'march',
 'and',
 'may',
 'making',
 'it',
 'fourth',
 'month',
 'year',
 'also',
 'comes',
 'first',
 'the',
 'year',
 'out',
 'four',
 'months',
 'that',
 'have',
 'days',
 'june',
 'september',
 'november',
 'are',
 'later',
 'year',
 'april',
 'begins',
 'on',
 'same',
 'day',
 'week',
 'july',
 'every',
 'year',
 'same',
 'day',
 'week',
 'january',
 'leap',
 'years',
 'april',
 'ends',
 'same',


In [148]:
np.argsort(token_doc_matrix[10661].todense())

matrix([[    0, 67017, 67016, ..., 12819, 48338, 94804]], dtype=int64)

In [165]:
word = model.eval().encoder.forward(torch.Tensor(np.asarray([1])).long().to("cuda"),torch.Tensor(np.asarray([0])).long().to("cuda"))[0].data.to("cpu").numpy()[0]
word = word/np.sqrt(np.sum(np.dot(word,word)))
normalized = np.divide(word_meaning,np.sqrt(np.sum(np.multiply(word_meaning,word_meaning),axis = 1)).reshape(-1,1))
dist = np.matmul(normalized,word)

  This is separate from the ipykernel package so we can avoid doing imports until


In [166]:
for x in np.argsort(-dist)[:50]:
    print(vocab.get_token(x))

professionally
underdog
samantha
sportswriters
narcos
kardashian
shocker
weppes
glima
congestive
ría
diski
nominations
appearances
rafinha
jon
role
paige
ostrevent
ejaculating
lung
unforgettable
sandler
tshabalala
datagrams
avn
wayans
sexay
goldblum
stroke
illness
maxine
kris
courtney
achtung
rubin
fleance
psb
sundre
donny
diagnosed
moir
janner
wilhelmus
gordy
edvard
djawadi
kateřina
wynonna
liza
