In [1]:
from fastai.text.all import *
import os
import pandas as pd
%config Completer.use_jedi = False
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator
import sys
import spacy
import random
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")

debug_print = False

In [2]:
# Spacy tokenizer
spacy_eng = spacy.load('en')

def tokenizer_eng(text):
    return[tok.text for tok in spacy_eng.tokenizer(text)]
tokenizer = tokenizer_eng

In [3]:
# If there is a GPU available we will use that for the computations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Specifies how the source text be processed in Torchtext
src_txt = Field(sequential=True,
                use_vocab=True,
                tokenize=tokenizer,
                init_token='<sos>',
                eos_token='<eos>',
                lower=False)

In [5]:
# Specifies how each target text should be processed in Torchtext
trg_txt = Field(sequential=True,
                     use_vocab=True,
                     tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=False)

In [6]:
fields = {'src_english': ('src_seq', src_txt), 'trg_french': ('trg_seq', trg_txt)}

train_data, valid_data = TabularDataset.splits(path='data/',
                                              train='train_en_fr.csv',
                                              validation='valid_en_fr.csv',
                                              format='csv',
                                              fields=fields)

In [7]:
# Build the vocab using our custom character embeddings
src_txt.build_vocab(train_data,max_size=10000, min_freq=3)

In [8]:
# Build the vocab using our custom character embeddings
trg_txt.build_vocab(train_data,max_size=10000, min_freq=3)

In [9]:
batch_size = 64
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src_seq),
    device=device
)

In [15]:

dls = DataLoaders(train_iterator, valid_iterator)


In [50]:
train_iterator??

In [35]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
        super(Encoder,self).__init__()
        '''
            input_size:     size of the input vocabulary
            embedding_size: size of each word embedding
            hidden_size:    size of the hidden layer
            num_layers:     number of layers in our encoder lstm
            dropout:        the probability for our dropout
        '''
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout_p)
        
    def forward(self,x):

        embedding = self.dropout(self.embedding(x))
        
        outputs, (hidden, cell) = self.rnn(embedding)
   
        return hidden,cell


In [36]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p):
        super(Decoder,self).__init__()
        '''
            input_size:     size of the input vocabulary
            embedding_size: size of each word embedding
            hidden_size:    size of the hidden layer
            output_size:    size of the output vocabulary
            num_layers:     number of layers in our encoder lstm
            dropout:        the probability for our dropout
        '''
        
        self.output_size = output_size
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout=nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout_p)
        self.fc = nn.Linear(hidden_size,output_size)
    
    def forward(self,x,hidden,cell):
        # shape of x: (N) but we want (1,N)
        x = x.unsqueeze(0)
        
        # embedding shape: (1, N, embedding_size)
        embedding = self.dropout(self.embedding(x))
        
        # shape of outputs: (1, N, hidden_size)
        outputs, (hidden,cell) = self.rnn(embedding,(hidden,cell))
        
        # shape of predictions: (1, N, length_of_vocab)
        predictions = self.fc(outputs)
        
        predictions = predictions.squeeze(0)
        
        return predictions, hidden, cell
        

In [37]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder,decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        
        target_len = target.shape[0]
        
        target_vocab_size = self.decoder.output_size
  

        outputs = torch.zeros(target_len,batch_size,target_vocab_size).to(device)

        hidden, cell = self.encoder(source)
        
        # Grab the start token
        x = target[0]
        
        for t in range(1,target_len):
 
            output, hidden, cell = self.decoder(x,hidden,cell)

            outputs[t] = output
    
            best_guess = output.argmax(1)
            
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        
        return outputs

In [38]:
class TeacherForcingCallback(Callback):
    def __init__(self, teacher_forcing_ratio=0):
        self.teacher_forcing_ratio = teacher_forcing_ratio
    """
    Callback that sends the y's to the model too
    """
    def before_batch(self):
        # print('before_batch 2')
        x,y = self.x, self.y
        self.learn.xb = (x,y,self.teacher_forcing_ratio)
        

In [39]:
class LearningRatePrinter(Callback):
    
    def before_train(self):
        lr = self.opt.hypers
        print(f' before_train learning_rate: {lr}')


In [40]:
class DebuggerCallBack(Callback):
    def on_train_begin(self):
        import pdb
        pdb.set_trace()

In [41]:
def seq2seq_acc(out, targ, pad_idx=1):
    out = out.argmax(2)
    targ_len = targ.size()
    out_len= out.size()
    if targ_len>out_len: out  = F.pad(out,  (0,0,0,targ_len-out_len,0,0), value=pad_idx)
    if out_len>targ_len: targ = F.pad(targ, (0,out_len-targ_len,0,0), value=pad_idx)
    return (out==targ).float().mean()

In [42]:
# Training hyperparameters
num_epoc = 10
learning_rate = 0.0001
batch_size = 64

In [43]:
# Model hyperparameters

#INPUT_DIM = len(dls.train.vocab[0])
INPUT_DIM = len(src_txt.vocab)
#OUTPUT_DIM = len(dls.train.vocab[1])
OUTPUT_DIM = len(trg_txt.vocab)

ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
HID_DIM = 1024
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [44]:
enc = Encoder(INPUT_DIM,  ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM, N_LAYERS, DEC_DROPOUT)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec).to(device)

In [47]:
criterion = CrossEntropyLossFlat(ignore_index=1)

In [48]:

learn = Learner(dls, model, loss_func=criterion, metrics=[seq2seq_acc]).to_fp16()
#learn = Learner(dls, model, loss_func=criterion, metrics=[seq2seq_acc],cbs=TeacherForcingCallback(teacher_forcing_ratio=0.4)).to_fp16()

In [49]:
learn.fit(
    lr=0.0001,
    n_epoch=10
)

epoch,train_loss,valid_loss,seq2seq_acc,time


TypeError: 'Batch' object is not subscriptable

In [None]:
model.eval()

In [None]:
translate_me = 'A group of men are loading cotton onto a truck'

In [None]:
dl = learn.dls.test_dl([translate_me],batch_size=1)

In [None]:
dl.show_batch()

In [None]:
sentence_tensor = torch.LongTensor(dl.dataset[0][0]).unsqueeze(1).to(device)

In [None]:
sentence_tensor.shape

In [None]:
with torch.no_grad():
    hidden, cell = model.encoder(sentence_tensor)

max_length=100
# Add xxbos
outputs = [2]

for _ in range(max_length):
    previous_word = torch.LongTensor([outputs[-1]]).to(device)

    with torch.no_grad():
        output, hidden, cell = model.decoder(previous_word, hidden, cell)
        best_guess = output.argmax(1).item()
        print(best_guess)
        outputs.append(best_guess)
    print(outputs)
    # Model predicts it's the end of the sentence
    if output.argmax(1).item() == 3:
        break

In [None]:
learn.dls.decode(outputs)

# learn.dls.vocab