In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.autograd import Variable
import numpy as np

In [4]:
# Multi-Component word embeddings

class MultiComp(nn.Module):
    """Multi-component word embeddings with a per-word linear attention.

    Every word owns ``n_comp`` component vectors that start out as noisy
    copies of its original embedding. ``forward`` scores the components of
    one word against the mean context vector with a per-word linear layer,
    sums the components weighted by those scores and returns the dot product
    of that sum with the mean context.

    Args:
        word_embeddings_to_comp: array-like of shape
            ``(n_words, embedding_size)`` holding the initial embeddings.
            Any numeric dtype is accepted; it is converted to float.
        n_comp: number of components created per word.

    Attributes:
        words_comps: parameter of shape ``(n_words, n_comp, embedding_size)``.
        att_w: parameter of shape ``(n_words, 2 * embedding_size, 1)``.
        att_b: parameter of shape ``(n_words, n_comp, 1)``, initialised to 0.
    """

    def __init__(self, word_embeddings_to_comp, n_comp=3):
        super(MultiComp, self).__init__()
        self.n_comp = n_comp

        # Convert to float up front: adding float noise in place to an
        # integer array would raise a casting error.
        base_embeddings = np.asarray(word_embeddings_to_comp, dtype=np.float64)
        n_words, embedding_size = base_embeddings.shape

        # Duplicate every embedding n_comp times -> (n_words, n_comp, emb).
        comp_embeddings = np.tile(
            np.expand_dims(base_embeddings, 1), [1, n_comp, 1])

        # Perturb each copy with Gaussian noise scaled to 1/3 of the
        # per-word standard deviation so the components start out distinct.
        stds = base_embeddings.std(axis=1, keepdims=True).reshape(
            [n_words, 1, 1])
        comp_embeddings += (
            np.random.randn(n_words, n_comp, embedding_size) * stds / 3)
        comp_embeddings = comp_embeddings.astype(np.float32)

        # Registered as a parameter so autograd/optimizers update it.
        self.words_comps = nn.Parameter(torch.from_numpy(comp_embeddings))

        # Per-word attention weights; the input is the concatenation of the
        # context vector and a component, hence 2 * embedding_size rows.
        weights = np.random.randn(n_words, embedding_size * 2, 1)
        # Xavier-style scaling, using (embedding_size + n_comp) as the
        # fan term exactly as the original code did.
        weights = weights * np.sqrt(2 / (embedding_size + n_comp))
        weights = weights.astype(np.float32)
        self.att_w = nn.Parameter(torch.from_numpy(weights))
        self.att_b = nn.Parameter(torch.zeros(n_words, n_comp, 1))

    def forward(self, context_embeddigs, word_n):
        """Score one word against a context.

        Args:
            context_embeddigs: 2-D tensor whose second dimension is
                ``embedding_size``; it is averaged over dimension 0.
            word_n: index of the word whose components and attention
                weights are used.

        Returns:
            Dot product between the attention-weighted sum of the word's
            components and the mean context vector.
        """
        # Pick this word's components and linear-layer weights.
        w_comps = self.words_comps[word_n]
        att_w = self.att_w[word_n]
        att_b = self.att_b[word_n]

        # Average (not sum) the context across its first dimension.
        cont_mean = torch.mean(context_embeddigs, 0, keepdim=True)
        cont_mean_repeated = cont_mean.repeat(self.n_comp, 1)

        att_input = torch.cat([cont_mean_repeated, w_comps], dim=1)
        # NOTE(review): the scores are used raw; no softmax or other
        # normalisation is applied to them.
        att = torch.matmul(att_input, att_w) + att_b
        comps_sum = torch.sum(w_comps * att, 0)

        # squeeze(0) drops only the kept mean dimension; a bare squeeze()
        # would also drop the embedding dimension when embedding_size == 1.
        dot_prod = torch.matmul(comps_sum, cont_mean.squeeze(0))
        return dot_prod

# Simple test

In [5]:
# Smoke test: maximise the model output for word 0 on random contexts,
# just to check that the forward/backward pass and optimizer step run.
n_words = 3
n_samples = 10
n_context = 20  # number of words in the context
emb_dim = 100   # single source of truth for the embedding size

w_emb = np.random.randn(n_words, emb_dim)
net = MultiComp(w_emb)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for sample in np.random.randn(n_samples, n_context, emb_dim):
    # Variable wrap is kept for old PyTorch versions; on current versions
    # it simply returns a tensor.
    sample = Variable(torch.from_numpy(sample.astype(np.float32)))
    net.zero_grad()
    # Call the module itself rather than .forward() so that nn.Module's
    # __call__ machinery (hooks) is not bypassed.
    dot_prod = net(sample, 0)
    # Minimising the negative dot product maximises the dot product.
    loss = -dot_prod
    loss.backward()
    opt.step()
    print(loss)

Variable containing:
-0.8111
[torch.FloatTensor of size 1]

Variable containing:
-9.0926
[torch.FloatTensor of size 1]

Variable containing:
 10.3019
[torch.FloatTensor of size 1]

Variable containing:
 23.8313
[torch.FloatTensor of size 1]

Variable containing:
-1.6813
[torch.FloatTensor of size 1]

Variable containing:
 33.3113
[torch.FloatTensor of size 1]

Variable containing:
 17.5252
[torch.FloatTensor of size 1]

Variable containing:
-11.2283
[torch.FloatTensor of size 1]

Variable containing:
 20.7560
[torch.FloatTensor of size 1]

Variable containing:
-19.4437
[torch.FloatTensor of size 1]



# Create dataset

In [7]:
ls -lh ../data/my_data/

total 13G
-rw-r--r-- 1 fogside fogside  8,1G янв 21 18:08 [0m[00mbig_one_file.txt[0m
-rw-r--r-- 1 fogside fogside   40M янв 21 18:22 [00mdict.txt[0m
drwxr-xr-x 3 fogside fogside  4,0K янв 21 18:05 [01;34mlibru[0m/
-rw-r--r-- 1 fogside fogside  3,0M янв 17 17:42 [00mmain_contexts_and_test.txt[0m
-rw-r--r-- 1 fogside fogside  623M янв 17 17:43 [00mmain_wiki_and_contexts.txt[0m
-rw-r--r-- 1 fogside fogside  620M янв 14 19:34 [00mmain_words_wiki_normalized_no_punct.txt[0m
-rw-r--r-- 1 fogside fogside  732M янв 14 18:56 [00mmain_words_wiki.txt[0m
-rw-r--r-- 1 fogside fogside 1019M окт 20 00:10 [00mruwiki_00.txt[0m
-rw-r--r-- 1 fogside fogside  1,1G янв 13 15:06 [00mruwiki_tokenized.txt[0m
drwxrwxr-x 4 fogside fogside  4,0K янв 19 18:03 [01;34mНКРЯ[0m/


In [8]:
ls -lh ../models/

total 3,0G
-rw-r--r-- 1 fogside fogside 1,3G дек  8 17:42 [0m[00mfast_text_model.bin[0m
-rw-r--r-- 1 fogside fogside 587M дек  8 17:42 [00mfast_text_model.vec[0m
-rw-r--r-- 1 fogside fogside 923M янв 22 04:51 [00mmodel_big_one.bin[0m
-rw-r--r-- 1 fogside fogside 171M янв 22 04:51 [00mmodel_big_one.vec[0m


In [None]:
# Target word; note that the make_dataset call further down passes the same
# literal directly rather than reading this variable.
word = 'замок'

In [10]:
from pymystem3 import Mystem
# Shared Mystem instance; make_dataset uses it to lemmatize the target word.
stemmer = Mystem()

In [21]:
def get_all_indexes(lst, word):
    """Return every position at which ``word`` occurs in ``lst``.

    Args:
        lst: a sequence exposing ``index(value, start)`` (e.g. a list of
            tokens, or a string, in which case overlapping substring
            matches are reported).
        word: the value to look for.

    Returns:
        List of indexes in increasing order; empty if there is no match.
    """
    res = []
    start = 0
    while True:
        try:
            start = lst.index(word, start)
        except ValueError:
            # .index signals "no further occurrence" with ValueError; any
            # other exception is a real error and must propagate.
            break
        res.append(start)
        # Resume the search just past the match that was found.
        start += 1
    return res

In [26]:
from tqdm import tqdm

In [41]:
def make_dataset(word, window=5, total_lines=1669868):
    """Extract context windows around a word from the big corpus file.

    The word is lemmatized with the module-level ``stemmer`` and every line
    of ``../data/my_data/big_one_file.txt`` is split on whitespace. For each
    line containing the lemma, the output file
    ``../data/my_data/<word>_out.txt`` receives:

    * a header line ``> i j ...`` listing the positions of the lemma, then
    * one line per position with the tokens from ``window`` before to
      ``window`` after that position (clipped at the line boundaries).

    Args:
        word: target word; also used verbatim in the output file name.
        window: number of tokens kept on each side of every occurrence.
        total_lines: expected number of lines in the corpus, used only to
            size the progress bar.

    Returns:
        Number of corpus lines that contain the lemma.
    """
    lemma = stemmer.lemmatize(word)[0]
    n_matching_lines = 0

    # Explicit encoding: the corpus is Cyrillic text, so do not depend on
    # the platform default.
    # NOTE(review): the output is opened in append mode, so re-running the
    # function adds to an existing file instead of replacing it.
    with open("../data/my_data/big_one_file.txt", 'r',
              encoding='utf-8') as bigf, \
            open("../data/my_data/{}_out.txt".format(word), 'a',
                 encoding='utf-8') as fout:
        # Iterate the file directly instead of calling readline() a fixed
        # number of times, so the whole file is read whatever its length.
        for raw_line in tqdm(bigf, total=total_lines):
            tokens = raw_line.split()
            if lemma not in tokens:
                continue
            n_matching_lines += 1
            positions = get_all_indexes(tokens, lemma)
            fout.write("> " + " ".join(str(pos) for pos in positions) + '\n')
            for pos in positions:
                # Clip the left edge at 0; a negative start would wrap
                # around to the end of the list.
                left = max(0, pos - window)
                fout.write(" ".join(tokens[left:pos + window + 1]) + '\n')
    return n_matching_lines

In [42]:
# make_dataset requires a context window size; 5 tokens per side is a
# placeholder value to be tuned.
make_dataset(word='замок', window=5)

100%|██████████| 1669868/1669868 [01:34<00:00, 17613.25it/s] 


17379

In [9]:
from gensim.models import KeyedVectors
# Load the vectors from the word2vec text-format (.vec, binary=False) file.
wv = KeyedVectors.load_word2vec_format("../models/model_big_one.vec", binary=False)

In [None]:
def generate_batch(word):
    """Placeholder for batch generation for ``word``; not implemented yet.

    Args:
        word: the target word a batch would be generated for.

    Raises:
        NotImplementedError: always, until the function is written.
    """
    # An empty function body is a syntax error; fail explicitly instead.
    raise NotImplementedError("generate_batch is not implemented yet")

In [37]:
4 in [1,2,3,4]

True

In [38]:
'p' in ['y', 'p']

True