In [1]:
import pandas as pd
import os
%config Completer.use_jedi = False

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator
import sys
import spacy
import random

In [3]:
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")

debug_print = False

In [4]:
# Spacy tokenizer
spacy_eng = spacy.load('en')

def tokenizer_eng(text):
    return[tok.text for tok in spacy_eng.tokenizer(text)]

In [5]:
tokenizer = tokenizer_eng

In [6]:
# If there is a GPU available we will use that for the computations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Specifies how the source text be processed in Torchtext
src_txt = Field(sequential=True,
                use_vocab=True,
                tokenize=tokenizer,
                init_token='<sos>',
                eos_token='<eos>',
                lower=False)

In [8]:
# Specifies how each target text should be processed in Torchtext
trg_txt = Field(sequential=True,
                     use_vocab=True,
                     tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=False)

In [9]:
fields = {'src_english': ('src_seq', src_txt), 'trg_french': ('trg_seq', trg_txt)}

train_data, valid_data = TabularDataset.splits(path='data/',
                                              train='train_en_fr.csv',
                                              validation='valid_en_fr.csv',
                                              format='csv',
                                              fields=fields)

In [10]:
# Build the vocab using our custom character embeddings
src_txt.build_vocab(train_data,max_size=10000, min_freq=3)

In [11]:
# Build the vocab using our custom character embeddings
trg_txt.build_vocab(train_data,max_size=10000, min_freq=3)

In [12]:
len(src_txt.vocab)

4727

In [13]:
len(trg_txt.vocab)

5032

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
        super(Encoder,self).__init__()
        '''
            input_size:     size of the input vocabulary
            embedding_size: size of each word embedding
            hidden_size:    size of the hidden layer
            num_layers:     number of layers in our encoder lstm
            dropout:        the probability for our dropout
        '''
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout_p)
        
    def forward(self,x):
        # x shape: (seq_len, batch_size)    
        embedding = self.dropout(self.embedding(x))
        
        # embedding shape: (seq_length, batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)
        
        return hidden,cell


In [15]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p):
        super(Decoder,self).__init__()
        '''
            input_size:     size of the input vocabulary
            embedding_size: size of each word embedding
            hidden_size:    size of the hidden layer
            output_size:    size of the output vocabulary
            num_layers:     number of layers in our encoder lstm
            dropout:        the probability for our dropout
        '''
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout=nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=dropout_p)
        self.fc = nn.Linear(hidden_size,output_size)
    
    def forward(self,x,hidden,cell):
        # shape of x: (N) but we want (1,N)
        x = x.unsqueeze(0)
        
        # embedding shape: (1, N, embedding_size)
        embedding = self.dropout(self.embedding(x))
        
        # shape of outputs: (1, N, hidden_size)
        outputs, (hidden,cell) = self.rnn(embedding,(hidden,cell))
        
        # shape of predictions: (1, N, length_of_vocab)
        predictions = self.fc(outputs)
        
        predictions = predictions.squeeze(0)
        
        return predictions, hidden, cell
        

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder,decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
   
        target_len = target.shape[0]
   
        target_vocab_size = len(trg_txt.vocab)
 
        outputs = torch.zeros(target_len,batch_size,target_vocab_size).to(device)

        hidden, cell = self.encoder(source)
        
        # Grab the start token
        x = target[0]
        
        for t in range(1,target_len):

            output, hidden, cell = self.decoder(x,hidden,cell)
            
            outputs[t] = output
            
            best_guess = output.argmax(1)
            
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        
        return outputs

Now we are ready to do the training!

In [17]:
# Training hyperparameters
num_epoc = 10
learning_rate = 0.0001
batch_size = 64

In [18]:
# Model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(src_txt.vocab)
input_size_decoder = len(trg_txt.vocab)
output_size = len(trg_txt.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

In [19]:
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src_seq),
    device=device
)

In [20]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, encoder_dropout).to(device)

In [21]:
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, decoder_dropout).to(device)

In [22]:
model = Seq2Seq(encoder_net,decoder_net).to(device)

In [23]:
pad_idx = trg_txt.vocab.stoi['<pad>']

In [24]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(),lr=learning_rate)

In [25]:
def save_checkpoint(state, filename='my_checkpoint_log_pai_templates_char.pth.tar'):
    print('saving checkpoint')
    torch.save(state,filename)

In [26]:
def load_checkpoint(checkpoint,model,optimizer):
    print('loading checkpoint:')
    #model.load_state_dict(checkpoint['state_dict'])
    #optimizer.load_state_dict(checkpoint['optimizer'])

In [27]:
def seq2seq_acc(out, targ, pad_idx=1):
    out = out.argmax(2)
    targ_len = targ.size()
    out_len= out.size()
    if targ_len>out_len: out  = F.pad(out,  (0,0,0,targ_len-out_len,0,0), value=pad_idx)
    if out_len>targ_len: targ = F.pad(targ, (0,out_len-targ_len,0,0), value=pad_idx)
    return (out==targ).float().mean()

In [None]:
import torch.nn.functional as F

max_length =50
best_valid_acc = 0

for epoch in range(num_epoc):
    print(f'Epoch [{epoch +1} / {num_epoc}]')
    
    model.eval()
    acc = []
    for batch_idx, batch in enumerate(valid_iterator):
        inp_data = batch.src_seq.to(device)
 
        target = batch.trg_seq.to(device)

        output = model(inp_data, target)
        
        accuracy = seq2seq_acc(output,target)
    
        acc.append(float(accuracy))
    
    model.train()
    
    valid_accuracy = sum(acc)/len(acc)
    print(f'valid_accuracy: {valid_accuracy}')
    
    if valid_accuracy > best_valid_acc:
        save_checkpoint(model)
        print(f'Saving Checkpoint with acc: {valid_accuracy}')
        best_valid_acc = valid_accuracy
        
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src_seq.to(device)
       
        target = batch.trg_seq.to(device)

        
        # output shape: (trg_len,batch_size,output_dim)
        output = model(inp_data, target)
        
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        
        optimizer.zero_grad()
        loss = criterion(output,target)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=1)
        optimizer.step()
        
        if debug_print: print(f'Training Loss: {loss}')
        
        #step +=1
    

        
        
        
        
        
        
    

Epoch [1 / 10]
valid_accuracy: 0.0
Epoch [2 / 10]
valid_accuracy: 0.16044573858380318
saving checkpoint
Saving Checkpoint with acc: 0.16044573858380318
Epoch [3 / 10]
valid_accuracy: 0.1945576909929514
saving checkpoint
Saving Checkpoint with acc: 0.1945576909929514
Epoch [4 / 10]
valid_accuracy: 0.2208641618490219
saving checkpoint
Saving Checkpoint with acc: 0.2208641618490219
Epoch [5 / 10]
valid_accuracy: 0.23837747983634472
saving checkpoint
Saving Checkpoint with acc: 0.23837747983634472
Epoch [6 / 10]
valid_accuracy: 0.2515052296221256
saving checkpoint
Saving Checkpoint with acc: 0.2515052296221256
Epoch [7 / 10]
valid_accuracy: 0.2680730801075697
saving checkpoint
Saving Checkpoint with acc: 0.2680730801075697
Epoch [8 / 10]
valid_accuracy: 0.27716164384037256
saving checkpoint
Saving Checkpoint with acc: 0.27716164384037256
Epoch [9 / 10]
valid_accuracy: 0.29550633672624826
saving checkpoint
Saving Checkpoint with acc: 0.29550633672624826


In [None]:
# Set to evaluation mode for inference
model.eval()

In [None]:
# Handy translate/inference function
def translate_sentence(model, sentence, src_vocab, trg_vocab, device, max_length=50):

    # Load src_vocab tokenizer
    spacy_ger = spacy.load("en")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, src_vocab.init_token)
    tokens.append(src_vocab.eos_token)

    # Go through each src_vocab token and convert to an index
    text_to_indices = [src_vocab.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
    
    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [src_txt.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            print(best_guess)

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == trg_vocab.vocab.stoi["<eos>"]:
            break

    translated_sentence = [trg_vocab.vocab.itos[idx] for idx in outputs]

    # remove start token
    return [translated_sentence[1:]][:1]

In [None]:
# Sample sentance to translate
translate_me = 'A group of men are loading cotton onto a truck'

In [None]:
tok_list_trans = translate_sentence(model,translate_me,src_txt,trg_txt,device,max_length)

In [None]:
decoded_translation = ' '.join(tok_list_trans[0])
decoded_translation = lt.replace('<unk>','')
decoded_translation =lt.replace('<eos>','')


In [None]:
print(decoded_translation)
