In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.autograd import Variable
import numpy as np

In [2]:
from tqdm import tqdm_notebook

In [424]:
import gc

In [3]:
# Multi-Component word embeddings

class MultiComp(nn.Module):
    def __init__(self, word_embeddings_to_comp, n_comp=3):
        """
        word_embeddings_to_comp: word embeddings of the target word;
                                 shape = (n_words,100);
        
        """
        super(MultiComp, self).__init__()
        self.n_comp = n_comp
        
        n_words, embedding_size = word_embeddings_to_comp.shape
        # expand to duplicate embeddings n_comp times
        expanded_word_embeddings = np.expand_dims(word_embeddings_to_comp, 1)
        comp_embeddings = np.tile(expanded_word_embeddings, [1, n_comp, 1])
        
        # add some noise to components (1/3 of its standard deviations)
        stds = word_embeddings_to_comp.std(axis=1, keepdims=True).reshape([n_words, 1, 1])
        comp_embeddings += np.random.randn(n_words, n_comp, embedding_size)  * stds  / 10
        comp_embeddings = np.random.randn(n_words, n_comp, embedding_size)  * stds  / 10
        comp_embeddings = comp_embeddings.astype(np.float32)
        
        self.n_centroids = np.random.randn(n_words, n_comp, embedding_size)  * stds  / 10
        self.n_centroids = self.n_centroids.astype(np.float32)
        
        # create variable to use autograd
        self.words_comps = nn.Parameter(torch.from_numpy(comp_embeddings))
        
#         # weight matrices for attention (times 2 because concat context and comp)
#         weights = np.random.randn(n_words, embedding_size * 2, 1)
#         # xavier
#         weights = weights * np.sqrt(2 / (embedding_size + n_comp))
#         weights = weights.astype(np.float32)
#         self.att_w = nn.Parameter(torch.from_numpy(weights))
#         self.att_b = nn.Parameter(torch.zeros(n_words, n_comp, 1))
                
    def forward(self, context_embeddigs, word_n):
        """
        context_embeddings: shape = (n_context_words, 100);
        word_n: number of main_word for what the forward is running;
        
        """
        
        # pick word_embeddings and linear layer weights
        w_comps = self.words_comps[word_n]
#         att_w = self.att_w[word_n]
#         att_b = self.att_b[word_n]
        # sum the context across words dim 
        cont_sum = np.mean(context_embeddigs, axis=0)
        cont_sum = cont_sum.astype(np.float32)
#         print(cont_sum.shape)
#         print(self.n_centroids[word_n].shape)
#         cont_sum_repeated = cont_sum.repeat(self.n_comp, 1)
        scores_tmp = np.matmul(self.n_centroids[word_n], cont_sum)
        scores = Variable(torch.from_numpy(scores_tmp))
        scores = fn.softmax(scores, 0)
        values, indices = scores.max(0)
        
        for n in range(self.n_comp):
            self.n_centroids[word_n][n]+=cont_sum*scores[n].data
            self.n_centroids[word_n][n]/=np.sqrt(self.n_centroids[word_n][n]**2)
        
#         self.n_centroids[word_n][indices.data]+=cont_sum
#         self.n_centroids[word_n][indices.data]/=np.sqrt(self.n_centroids[word_n][indices.data]**2)
            
#         print(scores)
#         print(values, indices)

#         att_input = torch.cat([cont_sum_repeated, w_comps], dim=1)
#         att = torch.matmul(att_input, att_w) + att_b
#         att = fn.softmax(att, 0)
#         comps_sum = torch.sum(w_comps * scores.expand_as(w_comps), 0)
        comps_sum = torch.matmul(scores, w_comps)
        cont_sum = Variable(torch.from_numpy(cont_sum))
        dot_prod = torch.matmul(comps_sum, cont_sum.squeeze())/(torch.norm(comps_sum, p=1) * torch.norm(cont_sum, p=1))
#         print(dot_prod)
        #         dot_prod = fn.cosine_embedding_loss(w_comps[indices.data].double(), cont_sum.squeeze(), 1, 0, True)
        return dot_prod, scores

In [504]:
net.n_centroids[0][0].shape

(100,)

# Test simple

In [4]:
w_emb = np.random.randn(3, 100)
net = MultiComp(w_emb)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)


n_samples = 100
n_context = 20 # number of words in the context
emb_dim = 100
for sample in np.random.randn(n_samples, n_context, emb_dim):
    # Prepare sample with Variable wrap
#     sample = Variable(torch.from_numpy(sample.astype(np.float32)))
    net.zero_grad()
    dot_prod, att = net.forward(sample, 0)
    loss = dot_prod
    loss.backward()
    opt.step()
    print(loss)
    print("attn: ", att)

Variable containing:
1.00000e-03 *
  1.8303
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.2837
 0.3223
 0.3940
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-03 *
 -1.2066
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.0018
 0.8423
 0.1559
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-03 *
  2.8472
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.0260
 0.8805
 0.0936
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-03 *
  3.2664
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.9533
 0.0415
 0.0052
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-04 *
  9.9802
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.6413
 0.0459
 0.3128
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-04 *
  9.9665
[torch.FloatTensor of size 1]

attn:  Variable containing:
 0.1220
 0.8632
 0.0148
[torch.FloatTensor of size 3]

Variable containing:
1.00000e-03 *
  2.1980
[torch.F

# Create dataset

In [7]:
ls -lh ../data/my_data/

total 13G
-rw-r--r-- 1 fogside fogside  8,1G янв 21 18:08 [0m[00mbig_one_file.txt[0m
-rw-r--r-- 1 fogside fogside   40M янв 21 18:22 [00mdict.txt[0m
drwxr-xr-x 3 fogside fogside  4,0K янв 21 18:05 [01;34mlibru[0m/
-rw-r--r-- 1 fogside fogside  3,0M янв 17 17:42 [00mmain_contexts_and_test.txt[0m
-rw-r--r-- 1 fogside fogside  623M янв 17 17:43 [00mmain_wiki_and_contexts.txt[0m
-rw-r--r-- 1 fogside fogside  620M янв 14 19:34 [00mmain_words_wiki_normalized_no_punct.txt[0m
-rw-r--r-- 1 fogside fogside  732M янв 14 18:56 [00mmain_words_wiki.txt[0m
-rw-r--r-- 1 fogside fogside 1019M окт 20 00:10 [00mruwiki_00.txt[0m
-rw-r--r-- 1 fogside fogside  1,1G янв 13 15:06 [00mruwiki_tokenized.txt[0m
drwxrwxr-x 4 fogside fogside  4,0K янв 19 18:03 [01;34mНКРЯ[0m/


In [8]:
ls -lh ../models/

total 3,0G
-rw-r--r-- 1 fogside fogside 1,3G дек  8 17:42 [0m[00mfast_text_model.bin[0m
-rw-r--r-- 1 fogside fogside 587M дек  8 17:42 [00mfast_text_model.vec[0m
-rw-r--r-- 1 fogside fogside 923M янв 22 04:51 [00mmodel_big_one.bin[0m
-rw-r--r-- 1 fogside fogside 171M янв 22 04:51 [00mmodel_big_one.vec[0m


In [5]:
from pymystem3 import Mystem
stemmer = Mystem()

In [6]:
def get_all_indexes(lst, word):
    res = []
    i = 0
    while(True):
        try:
            i = lst.index(word, i)
            res.append(i)
            i+=1
        except:
            break
    return res

In [7]:
def make_dataset(word, window):
    N = 1669868
    w = stemmer.lemmatize(word)[0]
    counter = 0

    with open("../data/my_data/big_one_file.txt", 'r') as bigf,\
    open("../data/my_data/{}_out.txt".format(word), 'a') as fout:
        for i in tqdm_notebook(range(N)):
            line = bigf.readline().split()
            if w in line:
                idxs = get_all_indexes(line, w)
                for i in idxs:
                    counter+=1
                    # each line is a group of neighbour words with length = 3*window
                    start = max(0, i-1-window) # if 0 is max then all before main word will be selected
                    fout.write(" ".join(line[start:i-1])+" "+" ".join(line[i:i+window])+'\n')
    return counter

In [60]:
make_dataset(word='замок', window=10)

100%|██████████| 1669868/1669868 [01:11<00:00, 23263.62it/s] 


111462

In [8]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format("../models/model_big_one.vec", binary=False)

In [9]:
with open("../data/my_data/{}_out.txt".format('замок'), 'r') as f:
    lines = f.readlines()

In [10]:
def generate_batch(lines, context_max_len):
    for line in lines:
        line = line.split()
        embedd = []
        for i, w in enumerate(line[:context_max_len]):
            try:
                embedd.append(wv[w])
            except KeyError:
                continue
        yield np.array(embedd)

In [None]:
w_emb = wv['замок'].reshape((1,100))
net = MultiComp(w_emb, 3)
opt = torch.optim.Adam(net.parameters(), lr=1e-4)


n_samples = len(lines)
n_context = 20 # number of words in the context
emb_dim = 100
atts = list()
epoch_num = 3
for epoch in range(epoch_num):
    batch_gen = generate_batch(lines, context_max_len=20)
    pbar = tqdm_notebook(batch_gen, total = n_samples)
    print("epoch_num: ", epoch)
    for n, sample in enumerate(pbar):
        # Prepare sample with Variable wrap
    #     sample = Variable(torch.from_numpy(sample.astype(np.float32)))
        net.zero_grad()
        dot_prod, att = net.forward(sample, 0)
        atts.append(att.data.numpy())
        loss = -dot_prod 
        loss.backward()
        opt.step()
        if n % 100 == 99:
            pbar.set_description("loss {:.3f}".format(float(loss.data.numpy())))

epoch_num:  0



Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/fogside/virtualenvs/py3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/fogside/virtualenvs/py3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






epoch_num:  1



epoch_num:  2


In [502]:
for f in net.parameters():
#     print('data is')
#     print(f.data)
    print('grad is')
    print(f.grad.data.numpy())

grad is
[[[-3.49486129e-07 -3.32365261e-07  7.03760350e-07 -2.97755946e-07
    3.56533036e-07 -5.35514573e-07 -1.10389169e-06 -4.66065700e-07
   -5.82917210e-07 -7.27230713e-07 -2.60619970e-07  8.72017154e-07
   -8.08192112e-07 -1.07066981e-07 -1.75066830e-07 -6.79283175e-07
   -9.17846492e-08  1.09717249e-07 -8.09712787e-07 -2.83074058e-07
   -1.10584521e-10 -3.18478584e-07  1.89186693e-07 -7.15673593e-07
   -6.53958921e-07 -4.84765849e-07  5.89442948e-07  4.24560113e-07
    6.29939223e-07  1.76845361e-07 -1.02231468e-06  4.03644037e-07
   -4.14105955e-07 -6.08837638e-07 -8.30337115e-07  2.56671797e-07
    4.60513974e-07 -4.49053999e-07  9.50191861e-07 -3.75076752e-07
    1.45197220e-07 -7.48199170e-07 -4.16173862e-07 -8.43637110e-07
   -7.23692779e-07 -6.04928402e-07  6.47262880e-07 -5.54983956e-07
    9.16100305e-07 -8.20186301e-07  1.35696598e-06  3.22151266e-07
   -2.76605135e-07 -8.56098382e-07  1.24399762e-07  5.11047290e-07
   -4.81783104e-07 -1.36427346e-07 -3.60368801e-07  6.

In [486]:
atts = np.array(atts)

In [488]:
atts[:20]

array([[2.2929372e-01, 4.3153498e-01, 3.3917129e-01],
       [1.2789839e-03, 9.7458726e-01, 2.4133760e-02],
       [8.6986611e-04, 9.7325903e-01, 2.5871109e-02],
       [1.2564140e-03, 9.4226527e-01, 5.6478307e-02],
       [1.8663254e-03, 9.5647675e-01, 4.1656919e-02],
       [9.5462325e-05, 9.7865313e-01, 2.1251412e-02],
       [8.5233944e-04, 9.6639657e-01, 3.2751065e-02],
       [4.7745951e-04, 9.5825988e-01, 4.1262675e-02],
       [4.7086633e-04, 9.5452482e-01, 4.5004334e-02],
       [8.8591996e-04, 9.5081198e-01, 4.8302125e-02],
       [4.3654311e-04, 9.7933054e-01, 2.0232894e-02],
       [1.1286370e-03, 9.7844708e-01, 2.0424303e-02],
       [1.2178138e-03, 9.4687504e-01, 5.1907163e-02],
       [1.1501533e-03, 9.8326588e-01, 1.5583949e-02],
       [5.2133296e-04, 9.8449069e-01, 1.4987965e-02],
       [1.9799334e-04, 9.6790385e-01, 3.1898163e-02],
       [8.5847487e-04, 9.8333067e-01, 1.5810886e-02],
       [2.6864768e-03, 9.5997661e-01, 3.7336886e-02],
       [2.6363318e-04, 9.820

In [495]:
wv.similar_by_vector(net.n_centroids[0][2], 30)

[('ослица', 0.5227832794189453),
 ('ниневия', 0.509689211845398),
 ('калант', 0.505122721195221),
 ('телец', 0.5043264627456665),
 ('невероятие', 0.4914008677005768),
 ('слоновник', 0.488711953163147),
 ('невон', 0.48820817470550537),
 ('горошина', 0.4879819452762604),
 ('савский', 0.48740366101264954),
 ('одногорбый', 0.48712512850761414),
 ('невероятно', 0.48405176401138306),
 ('бегемотовый', 0.4807841181755066),
 ('порфировый', 0.48065459728240967),
 ('минерал', 0.4805746078491211),
 ('безобразный', 0.48055922985076904),
 ('слон', 0.4802359938621521),
 ('орода', 0.47562527656555176),
 ('дромадер', 0.47427913546562195),
 ('крокодиль', 0.47348499298095703),
 ('бороздчатый', 0.47335511445999146),
 ('мира', 0.47249507904052734),
 ('нанда', 0.47133535146713257),
 ('гладенький', 0.46967971324920654),
 ('слизень', 0.4691134989261627),
 ('подий', 0.46752920746803284),
 ('пластрон', 0.46675676107406616),
 ('грушевидный', 0.466746985912323),
 ('монструозный', 0.4666191637516022),
 ('ниневий',

In [499]:
wv.similar_by_vector(net.words_comps[0].data.numpy()[0], 30)

[('либуша', 0.49343714118003845),
 ('мечькин', 0.49061936140060425),
 ('альхен', 0.4888174533843994),
 ('бонтон', 0.4863034784793854),
 ('тришатов', 0.48624271154403687),
 ('бертольди', 0.48591262102127075),
 ('маллинер', 0.48258763551712036),
 ('полчасика', 0.48102128505706787),
 ('часок', 0.4807071089744568),
 ('еспер', 0.47727519273757935),
 ('зеленуда', 0.4760782718658447),
 ('часик', 0.4747023284435272),
 ('мамзель', 0.47147342562675476),
 ('марихен', 0.4693729281425476),
 ('алексевна', 0.4686300456523895),
 ('польди', 0.4684569835662842),
 ('даровщинка', 0.4683907628059387),
 ('пользительно', 0.4682873785495758),
 ('пересаливать', 0.46739619970321655),
 ('пунтило', 0.466889351606369),
 ('крыжовенный', 0.4637065529823303),
 ('мараскин', 0.46335655450820923),
 ('максинька', 0.4618507921695709),
 ('фреди', 0.4617885649204254),
 ('дюбона', 0.46126455068588257),
 ('ленивица', 0.45895352959632874),
 ('подразнить', 0.4586748480796814),
 ('покушать', 0.45825403928756714),
 ('вотренный', 