In [2]:
import torch
from torch import nn
import numpy as np
import fasttext as ft
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### process data

In [20]:
data_path = 'DATA/'

In [31]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path + 'rus.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[:-1]:
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t ' + target_text + ' \n'
    input_texts.append(input_text)
    target_texts.append(target_text)

num_samples = len(input_texts)
vocab_size = 50000

from itertools import chain
max_len = max(list(chain.from_iterable((len(x.split(' ')), len(y.split(' '))) for x, y in zip(input_texts, target_texts))))

In [33]:
print('Number of samples:', num_samples)
print('Max sequence length for inputs:', max([len(txt.split(' ')) for txt in input_texts]))
print('Max sequence length for outputs:', max([len(txt.split(' ')) for txt in target_texts]))
print('Median sequence length for inputs:', np.median([len(txt.split(' ')) for txt in input_texts]))
print('Median sequence length for outputs:', np.median([len(txt.split(' ')) for txt in target_texts]))

Number of samples: 304513
Max sequence length for inputs: 43
Max sequence length for outputs: 42
Median sequence length for inputs: 6.0
Median sequence length for outputs: 7.0


#### Tokenize

In [36]:
en_tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='OOV')
ru_tokenizer = Tokenizer(num_words=vocab_size, lower=True, split=' ', oov_token='OOV')
en_tokenizer.fit_on_texts(input_texts)
ru_tokenizer.fit_on_texts(target_texts)

x_t = np.asarray(en_tokenizer.texts_to_sequences(input_texts))
y_t = np.asarray(ru_tokenizer.texts_to_sequences(target_texts))
print(en_tokenizer.word_index['coffee'], en_tokenizer.word_index['OOV'])
print(ru_tokenizer.word_index['кофе'], ru_tokenizer.word_index['OOV'])

x_t = pad_sequences(x_t, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)
y_t = pad_sequences(y_t, maxlen=max_len, dtype='int32', padding='post', truncating='post', value=0)

316 1
239 1


In [41]:
input_texts[0], target_texts[0]

('Go.', '\t Иди. \n')

In [43]:
x_t[0], y_t[0], x_t.shape

(array([37,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0]),
 array([722,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]),
 (304513, 43))

In [53]:
class AttentionRNN(nn.Module):
    def __init__(self, inp_sz, out_sz, em_sz, h_sz, n_l, voc_sz):
        super().__init__()
        self.em_sz, self.h_sz, self.n_l, self.inp_sz, self.out_sz, self.voc_sz = em_sz, h_sz, n_l, inp_sz, out_sz, voc_sz
        # Encoder
        self.enc_em = nn.Embedding(self.voc_sz, self.em_sz)
        self.em_drp = nn.Dropout(0.15)
        self.enc_gru = nn.GRU(self.em_sz, self.h_sz, num_layers=self.n_l, dropout=0.2)
        self.enc_drp = nn.Dropout(0.3)
        self.dec_out = nn.Linear(self.h_sz, self.em_sz, bias=False)
        # Decoder
        self.dec_em = nn.Embedding(self.voc_sz, self.em_sz)
        self.dec_gru = nn.GRU(self.em_sz, self.h_sz, num_layers=self.n_l, dropout=0.2)
        self.dec_drp = nn.Dropout(0.3)
        self.dec_out = nn.Linear(self.em_sz, self.out_sz)
        #self.out.weight.data = self.enc_em.weight.data
        
    def forward(self, inp):
        sl, bs = inp.shape
        h = self.initHidden(bs)
        
        x = self.em_drp(self.em_sz(inp))
        enc_out, h = self.enc_drp(self.enc_em(x, h))
        h = self.out_enc(h)
        
        dec_inp = torch.zeros(bs).long()
        result = []
        # What is this?
        for i in range(self.out_sz):
            emb = self.dec_emb(dec_inp).unsqueeze(0)
            outp, h = self.dec_gru(emb, h)
            outp = self.out(self.dec_drp(outp[0]))
            result.append(outp)
            dec_inp = outp.data.max(1)[1]
            if (dec_inp==1).all():
                break
        return torch.stack(res)
    
    def initHidden(self, bs): 
        # Num_layers, batch size, num hidden
        return torch.zeros(self.nl, bs, self.nh)

In [54]:
em_sz = 300
n_h = 128
n_l = 2

inp_sz = max_len
model = AttentionRNN(inp_sz, vocab_size, em_sz, n_h, n_l, vocab_size)

# TODO

In [9]:

###RUN THROUGH DL2 TRANSLATE NOTEBOOK AND ANSWER THESE QUESTIONS
#########################################
## WHAT IS 

#vecs_enc - 
    # Dict of words, with embedding vectors values
    # https://i.imgur.com/nIELpdY.png
    # https://i.imgur.com/RgBnu4O.png
#itos_enc - 
    # Index to string
    # List of strings, of which the list index is pointing to a word
    # https://i.imgur.com/oq6Kcv1.png
    # https://i.imgur.com/dXSh60V.png
#vecs_dec - Same as vecs_enc, but for dec
#itos_dec - Same as itos_enc but for dec
#########################################
##WHAT DOES create_emb DO
    # Makes an embedding with wiki vectors weights tripled 
## WHAT IS THE sl,bs IN inp.size()
    # bs is batch size
    # sl is seq_len https://i.imgur.com/icfxqv9.png

##WHAT DOES THE FOR LOOP IN FORWARD DO

##WHY DO YOU TAKE WEIGHT DATA OF OUTPUT EMBEDDINGS (IS THAT RELATED TO THE RETURN?)


##########################################
###FIGURE OUT THE LOSS FUNCTION

## WHY DO YOU PAD THE INPUT LIKE THAT

## WHY DO YOU SLICE THE INPUT
##########################################

In [19]:
# TRY TO TRAIN IT, WRITE OWN TRAIN LOOP

In [17]:
# CONVERT IT TO BIDIR

In [15]:
# TEACHER FORCING

In [16]:
# ATTENTION

In [18]:
# ALL

In [20]:
# CONVERT MODEL TO PRDOCTION