<a href="https://colab.research.google.com/github/dsgiitr/reading-group/blob/master/May2020/Discussion3/MixedPrecisionTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import numpy as np
import random
import math
import time

from torch.nn.utils.rnn import pad_sequence

In [0]:
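# Dataset: Multi30k (https://github.com/multi30k/dataset), task 1 raw training split.
#
# Download and decompress the two files below into ./WMT/ as train.de and train.en:
# https://github.com/multi30k/dataset/blob/master/data/task1/raw/train.de.gz
# https://github.com/multi30k/dataset/blob/master/data/task1/raw/train.en.gz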

In [0]:
# NOTE(review): these two handles are not read or closed in any cell shown here
# (the corpora are loaded through read_linewise_dataset) -- confirm they are still needed.
# UTF-8 is requested explicitly so the German text decodes the same way on every platform.
file1=open('./WMT/train.en','r',encoding='utf-8')
file2=open('./WMT/train.de','r',encoding='utf-8')

In [0]:
def read_linewise_dataset(path, encoding='utf-8'):
    """Read a text file and return its contents as a list of lines.

    Parameters
    ----------
    path : str
        Location of the text file.
    encoding : str, optional
        Encoding used to decode the file (default ``'utf-8'``), so non-ASCII
        text is read identically regardless of the platform default.

    Returns
    -------
    list of str
        The file contents split on the newline character. A file that ends
        with a newline therefore yields a trailing empty string.
    """
    # The context manager closes the handle as soon as the text has been read.
    with open(path, encoding=encoding) as file:
        return file.read().split('\n')

In [0]:
# Locations of the two sides of the parallel corpus.
english_path = './WMT/train.en'
german_path = './WMT/train.de'

# Load both files with the same line-wise reader.
english_dataset, german_dataset = (
    read_linewise_dataset(corpus_path) for corpus_path in (english_path, german_path)
)

In [0]:
# Peek at the first English/German sentence pair.
(english_dataset[0], german_dataset[0])

In [0]:
# Number of entries on each side of the corpus, as an (english, german) pair.
tuple(len(corpus) for corpus in (english_dataset, german_dataset))

In [0]:
# Uncomment to install the tokenizer libraries imported below:
# %pip install tokenizers transformers

In [0]:
from tokenizers import BertWordPieceTokenizer

In [0]:
from transformers import BertTokenizer
# English-side tokenizer, loaded under the "bert-base-uncased" name.
tokenizer_eng = BertTokenizer.from_pretrained("bert-base-uncased")

# Round trip a sample string: text -> token ids -> text.
enc = tokenizer_eng.encode("Hello There")
dec = tokenizer_eng.decode(enc)


In [0]:
from transformers import BertTokenizer
# German-side tokenizer, loaded under the "bert-base-german-cased" name.
# NOTE(review): `special_tokens` does not look like a documented `from_pretrained`
# parameter -- confirm it has any effect (e.g. that "[SOS]"/"[EOS]" end up in the vocab).
tokenizer_ger=BertTokenizer.from_pretrained("bert-base-german-cased",
                                       special_tokens=["[PAD]", "[UNK]", "[SOS]", "[EOS]", "[MASK]"])
# Encode a sample German sentence to token ids ...
enc=tokenizer_ger.encode("Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.")

# ... and decode the ids back to text.
dec=tokenizer_ger.decode(enc)

In [0]:
class Dataset:
    """Parallel English/German corpus, tokenized once at construction time.

    Parameters
    ----------
    sent_eng, sent_ger : sequence of str
        Aligned sentences; entry ``i`` of one is the translation of entry ``i``
        of the other, so both must have the same length.
    tokenizer_eng, tokenizer_ger : object
        Tokenizers exposing ``encode(text)`` that returns a list of token ids.

    Raises
    ------
    ValueError
        If the two sentence lists differ in length. Previously a shorter German
        list failed with a late ``IndexError`` and a longer one was silently
        truncated.

    Notes
    -----
    A later cell runs ``from torch.utils.data import Dataset``, which rebinds
    this class name; build the dataset before that import executes.
    """

    def __init__(self,sent_eng,sent_ger,tokenizer_eng,tokenizer_ger):
        if len(sent_eng) != len(sent_ger):
            raise ValueError(
                "parallel corpora must have the same number of sentences, "
                f"got {len(sent_eng)} English and {len(sent_ger)} German"
            )

        self.sent_eng=sent_eng
        self.sent_ger=sent_ger
        self.tokenize_eng=tokenizer_eng
        self.tokenize_ger=tokenizer_ger

        # Token ids are computed once here so __getitem__ only wraps them in tensors.
        self.processed_data_eng=[self.tokenize_eng.encode(sentence) for sentence in self.sent_eng]
        self.processed_data_ger=[self.tokenize_ger.encode(sentence) for sentence in self.sent_ger]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.sent_eng)

    def __getitem__(self,item):
        """Return ``[english_ids, german_ids]`` for pair ``item`` as two tensors."""
        sample=[torch.tensor(self.processed_data_eng[item]),torch.tensor(self.processed_data_ger[item])]
        return sample

In [0]:
# Tokenize the whole parallel corpus up front.
dataset = Dataset(
    sent_eng=english_dataset,
    sent_ger=german_dataset,
    tokenizer_eng=tokenizer_eng,
    tokenizer_ger=tokenizer_ger,
)

In [0]:
def pad_collate(batch):
    """Collate (source, target) tensor pairs into zero-padded, batch-first tensors.

    Parameters
    ----------
    batch : sequence of pairs
        Each item holds a 1-D source tensor and a 1-D target tensor.

    Returns
    -------
    tuple
        ``(padded_sources, padded_targets, source_lengths, target_lengths)``;
        the padded tensors are batch-first and filled with 0, the lengths are
        plain Python lists of the unpadded sequence lengths.
    """
    sources, targets = zip(*batch)

    # Remember the true lengths before padding hides them.
    source_lengths = list(map(len, sources))
    target_lengths = list(map(len, targets))

    padded_sources = pad_sequence(sources, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)

    return padded_sources, padded_targets, source_lengths, target_lengths

In [0]:
from torch.utils.data import Dataset, DataLoader
# Wrap the `dataset` object built earlier (the previous `dataset_` name was never defined)
# and pad each shuffled batch of 128 pairs with pad_collate.
data_loader = DataLoader(dataset=dataset, batch_size=128, shuffle=True, collate_fn=pad_collate)

In [0]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder operating on batch-first token-id tensors.

    Parameters
    ----------
    input_dim : int
        Number of rows in the embedding table.
    emb_dim : int
        Embedding size.
    enc_hid_dim : int
        Hidden size of each GRU direction.
    dec_hid_dim : int
        Size of the hidden vector returned by ``forward``.
    dropout : float
        Dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=enc_hid_dim,
                          batch_first=True, bidirectional=True)
        self.fc = nn.Linear(in_features=enc_hid_dim * 2, out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src, src_len):
        """Encode a padded batch.

        ``src`` holds token ids and ``src_len`` the unpadded length of each row.
        Returns ``(outputs, hidden)``: the padded per-position GRU outputs and a
        tanh-squashed linear projection of the last two final-state slices.
        """
        token_vectors = self.dropout(self.embedding(src))

        # Pack so the GRU skips padded positions; rows need not be length-sorted.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            token_vectors, src_len, batch_first=True, enforce_sorted=False
        )
        packed_states, final_states = self.rnn(packed_input)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_states, batch_first=True)

        # Join the last two final-state slices and project them to dec_hid_dim.
        joined = torch.cat((final_states[-2], final_states[-1]), dim=1)
        hidden = torch.tanh(self.fc(joined))
        return outputs, hidden

In [0]:
class Attention(nn.Module):
    """Additive attention over batch-first encoder outputs.

    Parameters
    ----------
    enc_hid_dim : int
        Hidden size of one encoder direction (encoder outputs carry twice this).
    dec_hid_dim : int
        Size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):
        """Return attention weights of shape ``[batch, src_len]``.

        ``hidden`` is ``[batch, dec_hid_dim]``, ``encoder_outputs`` is
        ``[batch, src_len, 2 * enc_hid_dim]`` and ``mask`` is ``[batch, src_len]``
        with zeros/False at positions that must receive no attention.
        """
        src_len = encoder_outputs.shape[1]

        # Repeat the decoder state once per source position so it can be concatenated.
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))

        attention = self.v(energy).squeeze(2)

        # Use the dtype's own minimum rather than -1e10, which is not representable
        # in float16 and makes masked_fill fail under half precision.
        attention = attention.masked_fill(mask == 0, torch.finfo(attention.dtype).min)

        # torch.softmax avoids relying on the alias `F`, which this notebook never imports.
        return torch.softmax(attention, dim = 1)

In [0]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention, using batch-first tensors.

    Parameters
    ----------
    output_dim : int
        Target vocabulary size (embedding rows and number of output logits).
    emb_dim : int
        Embedding size.
    enc_hid_dim : int
        Hidden size of one encoder direction (encoder outputs carry twice this).
    dec_hid_dim : int
        Decoder hidden size.
    dropout : float
        Dropout probability applied to the embeddings.
    attention : nn.Module
        Called as ``attention(hidden, encoder_outputs, mask)``; must return
        weights of shape ``[batch, src_len]``.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim,batch_first=True)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """Decode one time step.

        ``input`` is ``[batch]`` token ids, ``hidden`` is ``[batch, dec_hid_dim]``,
        ``encoder_outputs`` is ``[batch, src_len, 2 * enc_hid_dim]``.
        Returns ``(prediction, hidden, attention)`` with shapes
        ``[batch, output_dim]``, ``[batch, dec_hid_dim]`` and ``[batch, src_len]``.
        """
        # The GRU is batch_first, so the length-1 time axis goes in position 1.
        # (It used to be put in position 0, which only worked for batch size 1.)
        input = input.unsqueeze(1)                          # [batch, 1]

        embedded = self.dropout(self.embedding(input))      # [batch, 1, emb_dim]

        a = self.attention(hidden, encoder_outputs,mask)    # [batch, src_len]

        a = a.unsqueeze(1)                                  # [batch, 1, src_len]

        # Attention-weighted sum of encoder outputs; already batch-first, no permute needed.
        weighted = torch.bmm(a, encoder_outputs)            # [batch, 1, 2 * enc_hid_dim]

        rnn_input = torch.cat((embedded, weighted), dim = 2)

        # The initial state must be [num_layers, batch, dec_hid_dim] even with batch_first.
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        output = output.squeeze(1)                          # [batch, dec_hid_dim]
        hidden = hidden.squeeze(0)                          # [batch, dec_hid_dim]

        # One layer, one direction, one step: the step output is the final state.
        assert (output == hidden).all()

        embedded = embedded.squeeze(1)
        weighted = weighted.squeeze(1)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))

        return prediction, hidden, a.squeeze(1)

In [0]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes one target step at a time.

    Parameters
    ----------
    encoder : nn.Module
        Called as ``encoder(src, src_len)``; returns ``(encoder_outputs, hidden)``.
    decoder : nn.Module
        Exposes ``output_dim`` and is called as
        ``decoder(input, hidden, encoder_outputs, mask)``.
    src_pad_idx : int
        Token id treated as padding when the source mask is built.
    device : torch.device
        Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()

        self.encoder, self.decoder = encoder, decoder
        self.src_pad_idx, self.device = src_pad_idx, device

    def create_mask(self, src):
        """Return a boolean tensor that is True wherever ``src`` is not padding."""
        return src != self.src_pad_idx

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
        """Return logits of shape ``[batch, trg_len, vocab]``.

        Step 0 of the result is left as zeros. At every later step the next
        decoder input is the gold token with probability
        ``teacher_forcing_ratio``, otherwise the highest-scoring prediction.
        """
        n_sequences, n_steps = src.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(n_sequences, n_steps, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src, src_len)

        # mask = [batch size, src len]
        mask = self.create_mask(src)

        # The first decoder input is the first target token of every sequence.
        decoder_input = trg[:, 0]

        for step in range(1, n_steps):
            logits, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs, mask)
            outputs[:, step, :] = logits

            use_gold = random.random() < teacher_forcing_ratio
            _, greedy = torch.max(logits, 1)

            decoder_input = trg[:, step] if use_gold else greedy

        return outputs

In [0]:
# Vocabulary sizes come straight from the two tokenizers.
input_dim=len(tokenizer_eng.vocab)
output_dim=len(tokenizer_ger.vocab)

enc_emb_dim=256
dec_emb_dim=256

enc_hid_dim=200
dec_hid_dim=200

enc_dropout=0.3
dec_dropout=0.3

# Padding id: the same 0 that pad_collate uses as padding_value. It is passed to
# Seq2Seq as the source pad index and, in a later cell, to the loss as ignore_index.
output_pad=0

# `device` was used below without being defined in any earlier cell;
# use the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

attention = Attention(enc_hid_dim, dec_hid_dim)
enc = Encoder(input_dim, enc_emb_dim, enc_hid_dim, dec_hid_dim, enc_dropout)
dec = Decoder(output_dim, dec_emb_dim, enc_hid_dim, dec_hid_dim, dec_dropout, attention)


model = Seq2Seq(enc, dec, output_pad, device).to(device)

In [0]:
def init_weights(m):
    """Re-initialise every parameter reachable from module ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all others are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)
            
# Module.apply calls init_weights on every submodule as well as on the model itself.
model.apply(init_weights)

In [0]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [0]:
# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(params=model.parameters())

# Cross-entropy that skips target positions equal to `output_pad`.
criterion = nn.CrossEntropyLoss(ignore_index=output_pad)

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(src, src_len, trg)``; expected to return batch-first
        logits of shape ``[batch, trg_len, vocab]``, as the Seq2Seq model in
        this notebook does.
    iterator : sized iterable
        Yields batches laid out as ``(src, trg, src_len, trg_len)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.
    criterion : callable
        Loss taking ``(logits [N, vocab], targets [N])``.
    clip : float
        Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns
    -------
    float
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()

    # Batches arrive on the CPU; move them to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0

    for batch in iterator:
        src, trg, src_len = batch[0], batch[1], batch[2]
        if device is not None:
            src, trg = src.to(device), trg.to(device)
        # src_len stays a CPU-side list of lengths and is passed to the model unchanged.

        optimizer.zero_grad()

        output = model(src, src_len, trg)

        output_dim = output.shape[-1]

        # Drop the first TIME STEP (dim 1), which the model never predicts.
        # `output[1:]` dropped the first batch element instead on batch-first tensors.
        # The slices are non-contiguous, hence reshape rather than view.
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [0]:
# One pass over the training data with gradients clipped to norm 1.
train_loss = train(
    model=model,
    iterator=data_loader,
    optimizer=optimizer,
    criterion=criterion,
    clip=1,
)
print(train_loss)