In [1]:
from tqdm import tqdm
import random
import math
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset
from fastai import DataBunch
from fastai import Learner
from fastai import Callback
from fastai.callback import Stepper
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.backends.cudnn.enabled = False 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### process data

In [2]:
data_path = 'DATA/'

In [3]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path + 'rus.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[:-1]:
    input_text, target_text = line.split('\t')
    # We use "_bos_" as the "start of sequence" token
    # for the targets, and "_eos_" as "end of sequence" token
    input_text = '_bos_ ' + input_text + ' _eos_'
    target_text = '_bos_ ' + target_text + ' _eos_'
    input_texts.append(input_text)
    target_texts.append(target_text)

num_samples = len(input_texts)
vocab_size = 50000

from itertools import chain
max_len = max(list(chain.from_iterable((len(x.split(' ')), len(y.split(' '))) for x, y in zip(input_texts, target_texts))))

In [4]:
print('Number of samples:', num_samples)
print('Max sequence length for inputs:', max([len(txt.split(' ')) for txt in input_texts]))
print('Max sequence length for outputs:', max([len(txt.split(' ')) for txt in target_texts]))
print('Median sequence length for inputs:', np.median([len(txt.split(' ')) for txt in input_texts]))
print('Median sequence length for outputs:', np.median([len(txt.split(' ')) for txt in target_texts]))

Number of samples: 304513
Max sequence length for inputs: 45
Max sequence length for outputs: 42
Median sequence length for inputs: 8.0
Median sequence length for outputs: 7.0


#### Tokenize

In [6]:
en_tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='OOV', filters='')
ru_tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='OOV', filters='')
en_tokenizer.fit_on_texts(input_texts)
ru_tokenizer.fit_on_texts(target_texts)

x_t = np.asarray(en_tokenizer.texts_to_sequences(input_texts))
y_t = np.asarray(ru_tokenizer.texts_to_sequences(target_texts))
print(en_tokenizer.word_index['coffee'], en_tokenizer.word_index['OOV'])
print(ru_tokenizer.word_index['кофе'], ru_tokenizer.word_index['OOV'])
print(en_tokenizer.word_index['_bos_'], en_tokenizer.word_index['_bos_'])
print(ru_tokenizer.word_index['_eos_'], ru_tokenizer.word_index['_eos_'])

x_t = pad_sequences(x_t, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)
y_t = pad_sequences(y_t, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)

761 1
711 1
2 2
3 3


In [7]:
input_texts[0], target_texts[0]

('_bos_ Go. _eos_', '_bos_ Иди. _eos_')

In [8]:
x_t[0], y_t[0], x_t.shape

(array([  2, 205,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]),
 array([    2, 11701,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0]),
 (304513, 45))

### Dataloader

In [9]:
# Split
#crop = 40000
#x_t = x_t[0:crop]
#y_t = y_t[0:crop]
x_trn, x_val, y_trn, y_val = train_test_split(x_t, y_t, test_size=0.1, random_state=42)

In [26]:
bs=16

class TokDataset(Dataset):
    def __init__(self, x, y):
        self.x = x; self.y = y
        self.len = len(self.x)
        self.x_data = torch.from_numpy(self.x); self.x_data = self.x_data.long()
        self.y_data = torch.from_numpy(self.y); self.y_data = self.y_data.long()
        print('x shape', self.x_data.shape)
        print('y shape', self.y_data.shape)
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
ds = TokDataset(x_trn, y_trn)
ds_val = TokDataset(x_val, y_val)
dl = torch.utils.data.DataLoader(dataset=ds, batch_size=bs, shuffle=True, num_workers=0)
dl_val = torch.utils.data.DataLoader(dataset=ds_val, batch_size=bs, shuffle=True, num_workers=0)

x shape torch.Size([274061, 45])
y shape torch.Size([274061, 45])
x shape torch.Size([30452, 45])
y shape torch.Size([30452, 45])


In [27]:
x_trn.shape

(274061, 45)

In [28]:
x_trn.transpose().shape

(45, 274061)

In [29]:
# Values for testing
test_values = iter(dl)
xs, ys = next(test_values)
xs, ys = xs.t(), ys.t()
print(xs.shape, ys.shape)

torch.Size([45, 16]) torch.Size([45, 16])


In [30]:
# Make a random variable, as a Parameter (Parameter tells PyTorch to learn the weights for that tensor)
def rand_p(*sz): return nn.Parameter(torch.randn(sz)/math.sqrt(sz[0]))

In [31]:
class AttentionRNN(nn.Module):
    def __init__(self, inp_sz, out_sz, em_sz, h_sz, n_l, voc_sz, t_frc=0.95):
        super().__init__()
        self.em_sz, self.h_sz, self.n_l, self.inp_sz, self.out_sz, self.voc_sz, self.t_frc = em_sz, h_sz, n_l, inp_sz, out_sz, voc_sz, t_frc
        # Encoder
        self.enc_em = nn.Embedding(voc_sz, em_sz)
        self.em_drp = nn.Dropout(0.15)
        self.enc_gru = nn.GRU(em_sz, h_sz, num_layers=n_l, dropout=0.2, bidirectional=True)
        # h_sz*2 because of bidir
        self.enc_out = nn.Linear(h_sz*2, em_sz, bias=False)
        self.enc_drp = nn.Dropout(0.3)
        # Decoder
        self.dec_em = nn.Embedding(voc_sz, em_sz)
        self.dec_gru = nn.GRU(em_sz, em_sz, num_layers=n_l, dropout=0.2)
        self.dec_out = nn.Linear(em_sz, voc_sz)
        
        # Attention weights
        # Initialize a random weight matrix, without bias (hidden layer), *2 because bidir
        self.W1 = rand_p(h_sz*2, em_sz)
        # Linear layer with bias
        self.l2 = nn.Linear(em_sz, em_sz)
        # Linear layer with bias with the last layer concatenated to the embeddings 
        self.l3 = nn.Linear(em_sz+h_sz*2, em_sz)
        # Initialize a random weight matrix for the output, without bias (hidden layer)
        self.V = rand_p(em_sz)
        
        # If using pretrained word embeddings:
        #self.out.weight.data = self.enc_em.weight.data
        
    def forward(self, inp, y=None, ret_attn=False):
        # seq_len, bs
        sl, bs = inp.shape
        h = self.initHidden(bs)
        x = self.em_drp(self.enc_em(inp))
        enc_outp, h = self.enc_gru(x, h)
        # Reshape, to bidir, n_l x bs x -1
        # Permute using indexes to change order of shape (n_l x bs x bidir(2) x n_h)
        # Contiguous to copy the tensor
        # Reshape to flatten the bidir layers, final shape n_l x bs x n_h*2
        h = h.view(2,self.n_l,bs,-1).permute(0,2,1,3).contiguous().view(2,bs,-1)
        # Is it really supposed to be dropout on h instead of enc_out?
        h = self.enc_out(self.enc_drp(h))
        
        dec_inp = torch.zeros(bs).long().cuda()
        result, attentions = [], []
        # Matrix multiply the encoder output with the W1 weight matrix
        # This is basically a linear layer pass through, without the bias
        w1e = enc_outp @ self.W1
        # Loop over max output size
        for i in range(self.out_sz):
            # Pass the hidden state through a hidden layer with bias
            w2h = self.l2(h[-1])
            # Add the attention weights of all the outputs from enc_out (w1e), and the last hidden state (w2h)
            # And pass through a non linear activation function
            u = torch.tanh(w1e + w2h)
            # Pass the activations through the last layer and activate with a softmax
            a = F.softmax(u @ self.V, dim=0)
            # Store attentions, for returning them for visualization
            attentions.append(a)
            # Instead of just using the last hidden state (like if we don't use attention), 
            # multiply attention with the decoder output and sum it
            Xa = (a.unsqueeze(2) * enc_outp).sum(0)
            
            # Get embedding
            emb = self.dec_em(dec_inp)
            # Pass the concat the decoder embeddings with the attentions
            enc_wgt = self.l3(torch.cat([emb, Xa], dim=1))
            
            # Pass hidden state from (initially) self.enc_out and attentioned embedding to self.dec_gru
            outp, h = self.dec_gru(enc_wgt.unsqueeze(0), h)
            # Pass decoder output to dense, get probabilities of words in len of vocab_size
            outp = self.dec_out(self.enc_drp(outp[0]))
            # Teacher forcing
            
            # Append the output to result
            result.append(outp)
            # Get new input
            # Get ind of biggest value for each batch 
            dec_inp = outp.data.max(1)[1]
            # If all are padding break and return
            if (dec_inp==0).all():
                break
            # Teacher forcing (replaces predicted value with ground truth)
            if (y is not None and self.t_frc > 0) and (random.random() < self.t_frc): 
                # If the len of i >= len of y, break and return
                if i >= len(y): break
                # else change the next dec inp to be the ground truth y
                dec_inp = y[i]
            
        # Turn list of tensors into tensor
        result = torch.stack(result)
        # If returning attentions
        if ret_attn: result = res, torch.stack(attentions)
        return result
    
    def initHidden(self, bs): 
        # Num_layers, batch size, num hidden
        # n_l*2 because of bidir
        return torch.zeros(self.n_l*2, bs, self.h_sz).cuda()

In [32]:
em_sz = 150
n_h = 64
n_l = 2

inp_sz = max_len
model = AttentionRNN(inp_sz, max_len, em_sz, n_h, n_l, vocab_size).cuda()

In [33]:
test_out = model(xs.cuda())

In [34]:
print(test_out.shape)
# Padding example
print(torch.nn.functional.pad(test_out, (0,0,0,0,0,2)).shape)
# Last seq_len value is padded, so the sum of the vocab_size probs is 0
print(int(torch.sum(torch.nn.functional.pad(test_out, (0,0,0,0,0,2))[-1].view(-1)).data))

torch.Size([45, 16, 50000])
torch.Size([47, 16, 50000])
0


In [35]:
# LOSS FUNCTION
def seq2seq_loss(input, target):
    # seq_len, bs
    sl,bs = target.size()
    # seq_len, bs, num_channels (vocab size)
    sl_in,bs_in,nc = input.size()
    # If the output sl is smaller than the actual sl, pad it.
    # The shape of input is 'sl x bs x vocab_sz'
    # The reason why have more 0's for padding, is because in PyTorch
    # You have padding on the left, and right, so none on sl or bs or nc left side, but pad on right
    # In this case, pad seq_len on the right by the difference between grount truth sl and input sl
    if sl>sl_in: input = F.pad(input, (0,0,0,0,0,sl-sl_in))
    # If its too long, just crop it
    input = input[:sl]
    # Cross entropy expects a rank 2 tensor, so flatten the seq_len
    return F.cross_entropy(input.view(-1,nc), target.view(-1))#, ignore_index=1)

### fastai DataBunch and fit

In [None]:
d_bunch = DataBunch(dl, dl_val)
learn = Learner(d_bunch, model, loss_fn=seq2seq_loss)

In [None]:
learn.lr_find()
learn.recorder.plot()

In [36]:
lr=1e-3

In [None]:
# fast.ai custom step function callback
class Seq2seqStepper(Stepper):
    def step(self, xs, y, t_frc):
        super().step()
        t_frc = (10 - epoch)*0.1 if epoch>0 else 0

In [None]:
class Seq2seqCallback(Callback):
    def step(self, xs, y, t_frc):
        super().step()
        t_frc = (10 - epoch)*0.1 if epoch>0 else 0

In [None]:
def step(self, xs, y, t_frc):
    super().step()
    t_frc = (10 - epoch)*0.1 if epoch>0 else 0

In [None]:
stepper = Stepper([0, 10], 1, step)

In [30]:
# TODO use clr 
# fit_one_cycle fits a model following the 1cycle policy.
#learn.fit_one_cycle(2, lr)
#learn.fit(2, lr, callbacks=[stepper])
learn.fit(2, lr)

NameError: name 'learn' is not defined

In [29]:
learn.save('initial')

NameError: name 'learn' is not defined

In [None]:
learn.load('initial')

### Non fastai fit

In [37]:
def fit(epochs, model, train_dl, crit, opt, verb=200):
    for ep in range(epochs):
        model.train()
        for i, data in tqdm(enumerate(train_dl)):
            x, y = data
            x = x.t().cuda(); y = y.t().cuda()

            y_h = model(x, y=y)#, y=y)
            loss = crit(y_h, y)

            if i % verb == 0:
                print(f' Epoch: {ep} | b_loss: {loss.item():.{4}f}')

            opt.zero_grad()
            loss.backward()
            opt.step()

In [38]:
opt = torch.optim.Adam(model.parameters(), 1e-3)
fit(20, model, dl, seq2seq_loss, opt)

0it [00:00, ?it/s]

 Epoch: 0 | b_loss: 10.8848


200it [01:01,  3.05it/s]

 Epoch: 0 | b_loss: 8.7994


400it [02:07,  3.20it/s]

 Epoch: 0 | b_loss: 9.2658


600it [03:14,  2.88it/s]

 Epoch: 0 | b_loss: 8.7658


800it [04:23,  3.03it/s]

 Epoch: 0 | b_loss: 8.7543


1000it [05:33,  2.95it/s]

 Epoch: 0 | b_loss: 8.5093


1200it [06:43,  2.99it/s]

 Epoch: 0 | b_loss: 8.7677


1400it [07:52,  3.00it/s]

 Epoch: 0 | b_loss: 8.8840


1600it [09:02,  2.93it/s]

 Epoch: 0 | b_loss: 9.1878


1800it [10:11,  2.66it/s]

 Epoch: 0 | b_loss: 7.4450


2000it [11:22,  2.86it/s]

 Epoch: 0 | b_loss: 8.7255


2200it [12:33,  2.93it/s]

 Epoch: 0 | b_loss: 8.8403


2400it [13:42,  3.05it/s]

 Epoch: 0 | b_loss: 8.7780


2600it [14:49,  3.00it/s]

 Epoch: 0 | b_loss: 8.1577


2800it [15:56,  2.91it/s]

 Epoch: 0 | b_loss: 8.7477


3000it [17:03,  3.04it/s]

 Epoch: 0 | b_loss: 7.0339


3200it [18:09,  2.95it/s]

 Epoch: 0 | b_loss: 7.8781


3400it [19:17,  2.82it/s]

 Epoch: 0 | b_loss: 8.4510


3600it [20:28,  2.84it/s]

 Epoch: 0 | b_loss: 8.4181


3800it [21:38,  2.74it/s]

 Epoch: 0 | b_loss: 7.9696


4000it [22:49,  2.79it/s]

 Epoch: 0 | b_loss: 9.1460


4200it [23:58,  2.93it/s]

 Epoch: 0 | b_loss: 8.2328


4400it [25:08,  2.90it/s]

 Epoch: 0 | b_loss: 8.1694


4600it [26:17,  2.95it/s]

 Epoch: 0 | b_loss: 8.4985


4800it [27:27,  2.79it/s]

 Epoch: 0 | b_loss: 8.8607


5000it [28:39,  2.80it/s]

 Epoch: 0 | b_loss: 8.3815


5200it [29:47,  2.84it/s]

 Epoch: 0 | b_loss: 8.7620


5400it [30:59,  3.07it/s]

 Epoch: 0 | b_loss: 8.9617


5600it [32:10,  2.85it/s]

 Epoch: 0 | b_loss: 8.9983


5800it [33:20,  3.08it/s]

 Epoch: 0 | b_loss: 7.5862


6000it [34:30,  3.12it/s]

 Epoch: 0 | b_loss: 8.2139


6200it [35:41,  2.77it/s]

 Epoch: 0 | b_loss: 8.4291


6400it [36:51,  3.18it/s]

 Epoch: 0 | b_loss: 9.1630


6600it [38:03,  2.83it/s]

 Epoch: 0 | b_loss: 8.6863


6800it [39:14,  2.84it/s]

 Epoch: 0 | b_loss: 8.4808


7000it [40:25,  2.91it/s]

 Epoch: 0 | b_loss: 8.8717


7200it [41:37,  2.98it/s]

 Epoch: 0 | b_loss: 7.9806


7400it [42:48,  2.78it/s]

 Epoch: 0 | b_loss: 7.4051


7600it [43:58,  2.93it/s]

 Epoch: 0 | b_loss: 8.1614


7800it [45:08,  2.76it/s]

 Epoch: 0 | b_loss: 7.7845


8000it [46:19,  2.81it/s]

 Epoch: 0 | b_loss: 9.0159


8200it [47:29,  2.83it/s]

 Epoch: 0 | b_loss: 8.7711


8400it [48:38,  2.85it/s]

 Epoch: 0 | b_loss: 8.2862


8600it [49:48,  2.86it/s]

 Epoch: 0 | b_loss: 8.7885


8800it [50:56,  2.87it/s]

 Epoch: 0 | b_loss: 8.2622


9000it [52:03,  3.06it/s]

 Epoch: 0 | b_loss: 9.0250


9200it [53:10,  2.92it/s]

 Epoch: 0 | b_loss: 8.8666


9400it [54:19,  2.91it/s]

 Epoch: 0 | b_loss: 8.9326


9600it [55:31,  2.66it/s]

 Epoch: 0 | b_loss: 7.9884


9800it [56:39,  2.79it/s]

 Epoch: 0 | b_loss: 9.1204


10000it [57:49,  2.99it/s]

 Epoch: 0 | b_loss: 8.5295


10200it [59:00,  2.72it/s]

 Epoch: 0 | b_loss: 7.6409


10400it [1:00:11,  2.68it/s]

 Epoch: 0 | b_loss: 8.4928


10600it [1:01:21,  2.87it/s]

 Epoch: 0 | b_loss: 8.4206


10800it [1:02:31,  2.87it/s]

 Epoch: 0 | b_loss: 9.0280


11000it [1:03:42,  2.93it/s]

 Epoch: 0 | b_loss: 8.9795


11200it [1:04:52,  2.75it/s]

 Epoch: 0 | b_loss: 8.6418


11400it [1:06:01,  2.89it/s]

 Epoch: 0 | b_loss: 8.1720


11600it [1:07:12,  2.75it/s]

 Epoch: 0 | b_loss: 6.7015


11800it [1:08:23,  2.79it/s]

 Epoch: 0 | b_loss: 8.0279


12000it [1:09:33,  3.08it/s]

 Epoch: 0 | b_loss: 6.3488


12200it [1:10:43,  2.63it/s]

 Epoch: 0 | b_loss: 8.3256


12400it [1:11:53,  2.85it/s]

 Epoch: 0 | b_loss: 8.3146


12557it [1:12:47,  2.91it/s]

KeyboardInterrupt: 

# TODO

In [None]:

###RUN THROUGH DL2 TRANSLATE NOTEBOOK AND ANSWER THESE QUESTIONS
#########################################
## WHAT IS 

#vecs_enc - 
    # Dict of words, with embedding vectors values
    # https://i.imgur.com/nIELpdY.png
    # https://i.imgur.com/RgBnu4O.png
#itos_enc - 
    # Index to string
    # List of strings, of which the list index is pointing to a word
    # https://i.imgur.com/oq6Kcv1.png
    # https://i.imgur.com/dXSh60V.png
#vecs_dec - Same as vecs_enc, but for dec
#itos_dec - Same as itos_enc but for dec
#########################################
##WHAT DOES create_emb DO
    # Makes an embedding with wiki vectors weights tripled 
## WHAT IS THE sl,bs IN inp.size()
    # bs is batch size
    # sl is seq_len https://i.imgur.com/icfxqv9.png

##WHAT DOES THE FOR LOOP IN FORWARD DO
    # Get encoder input values from embeddings
    # Send enc hidden and embedding to decoder gru
    # Get highest value per patch
    # Append to result

##WHY DO YOU TAKE WEIGHT DATA OF OUTPUT EMBEDDINGS (IS THAT RELATED TO THE RETURN?)
# if you aren't using pre-trained embeddings then it's not really doing anything, all it's doing is making sure the weights of the embedding layer and the weights of the output layer are initialized to the same values
# it's sometimes referred to as "weight tying"
# https://arxiv.org/pdf/1611.01462.pdf
# however, in this case they're only initialized to the same value and will probably end up  diverging to different sets of weights

##########################################
###FIGURE OUT THE LOSS FUNCTION
    # Commented
    ## WHY DO YOU PAD THE INPUT LIKE THAT
        # Commented
    ## WHY DO YOU SLICE THE INPUT
        # Commented
##########################################

In [None]:
# CONVERT IT TO BIDIR
    # Done

In [None]:
# TEACHER FORCING
    # Done

In [None]:
# ATTENTION

In [None]:
# ALL

In [None]:
# TRAIN PROPERLY

In [None]:
# CONVERT MODEL TO PRDOCTION

### Inference

In [39]:
en_itos = dict(map(reversed, en_tokenizer.word_index.items()))
ru_itos = dict(map(reversed, ru_tokenizer.word_index.items()))

In [40]:
x,y = next(iter(dl_val))
probs = model(x.cuda())
preds = probs.max(2)[1].cpu().numpy()
x,y = x.cpu().numpy(), y.cpu().numpy()

for i in range(1,10):
    print(' '.join([en_itos[o] for o in x[i, :] if o != 0]))
    print(' '.join([ru_itos[o] for o in y[i, :] if o != 0]))
    print(' '.join([ru_itos[o] for o in preds[i,:] if o!=0]))
    print()

_bos_ what happened to the other one? _eos_
_bos_ а что с другой? _eos_
OOV надеюсь, OOV OOV OOV OOV в когда это я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я я

_bos_ do you miss boston? _eos_
_bos_ скучаете по бостону? _eos_
OOV что OOV OOV OOV OOV школе у OOV в думал, тебя буду не не не не не не не не буду не хотел не не не не не не не хотел не не не не хочу стал не не что-то бы не хотел не

_bos_ how many schools are there in your city? _eos_
_bos_ в твоем городе сколько OOV _eos_
OOV я чтобы вам в OOV месте меня от этом _eos_ скрывает. сделал. сказал? прав. ел. хотел знаю. знаю. прав. хотел найдёт. прав. сказать. хотел буду собираюсь буду буду хотел буду да. хотел прав. прав. счастлив. помочь. согласен. могу буду нравишься. счастлив. сказал? сказать. видел.

_bos_ it became dark before i knew it. _eos_
_bos_ я и заметить не OOV как стемнело. _eos_
OOV сделал бы слишком последний меня не есть OOV _eos_ _eos_ _eos_ _eos_ _eos_ _eos_ сделать. _eos_ _eos_ _eos_