<a href="https://colab.research.google.com/github/deepesh321/Transformer/blob/master/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import math
import spacy
import re
from torch.autograd import Variable
import pandas as pd
import torchtext
from torchtext import data
import os
import dill as pickle
import numpy as np
import torch.nn.functional as F
import copy
from nltk.corpus import wordnet

In [None]:
# Colab-only: mount Google Drive so the corpus files referenced by Args
# (under /content/drive/...) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root of the mounted Drive.
# NOTE(review): nothing in the visible notebook reads BASE_DIR; Args spells out
# the full paths itself. Confirm whether this constant is still needed.
BASE_DIR = '/content/drive/My Drive/'

In [None]:
# Fetch the spaCy English and French models; tokenize() later loads them by
# the short names 'en' / 'fr' through spacy.load(lang).
# NOTE(review): the recorded output shows these short names being linked to
# en_core_web_sm / fr_core_news_sm - confirm the installed spaCy still
# supports shortcut links before re-running.
!spacy download en && spacy download fr

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/fr
You can now load the model via spacy.load('fr')


In [None]:
# Sanity check: list the Drive folder that Args expects english.txt and
# french.txt to live in.
!ls /content/drive/'My Drive'/'Colab Notebooks'

In [None]:
class Args:
  """Configuration shared by the data, model and training cells.

  The attributes are plain class-level defaults; later cells read them from
  the `opt` instance and also attach derived values to it (for example the
  loaded sentences, pad indices, the iterator and the optimizer).
  """
  # Paths of the parallel corpus; read_data() replaces each path with the
  # list of lines read from that file.
  src_data = '/content/drive/My Drive/Colab Notebooks/english.txt'
  trg_data = '/content/drive/My Drive/Colab Notebooks/french.txt'
  # Language codes handed to tokenize(), which passes them to spacy.load().
  src_lang='en'
  trg_lang='fr'
  # NOTE(review): no_cuda and SGDR are not read anywhere in the visible
  # notebook - confirm whether they are still needed.
  no_cuda=False
  SGDR=False
  epochs=20          # number of passes over the iterator in the training loop
  d_model=256        # model width; get_model() asserts d_model % heads == 0
  n_layers=3         # encoder / decoder layers passed to Transformer
  heads=4            # attention heads per layer
  dropout=0.1        # dropout probability; get_model() asserts it is < 1
  batchsize=100      # batch_size handed to MyIterator
  printevery=100     # the training loop reports the mean loss every N batches
  lr=0.0001          # Adam learning rate
  max_strlen=80      # sentence pairs with this many spaces or more are dropped
  load_weights=None  # folder holding SRC.pkl / TRG.pkl / model_weights, or None
  # The two settings below are read by create_dataset(), get_model() and
  # nopeak_mask(); declaring them here keeps those functions usable without a
  # later cell having to patch the attributes in first.
  device=-1          # 0 moves the model and masks to CUDA; any other value does not
  checkpoint=0       # > 0 makes create_dataset() pickle the fields to weights/

opt=Args()

In [None]:
class embedding(nn.Module):
  """Token-id to vector lookup table of width d_model."""

  def __init__(self,vocab_size,d_model):
    super().__init__()
    self.d_model=d_model
    # One learnable row of size d_model per vocabulary entry.
    self.embed=nn.Embedding(num_embeddings=vocab_size,embedding_dim=d_model)

  def forward(self,x):
    """Return the embedding vectors for the integer ids in x."""
    vectors=self.embed(x)
    return vectors

class positional_encoding(nn.Module):
  """Adds fixed sinusoidal position information to scaled embeddings.

  Args:
    d_model: width of the embeddings.
    max_len: number of positions precomputed in the table.
    dropout: dropout probability applied to the sum.

  The table is stored as the buffer `pe` of shape (1, max_len, d_model), so it
  is saved in the state dict and moves with the module.
  """

  def __init__(self,d_model,max_len=200,dropout=0.1):
    super().__init__()
    self.d_model=d_model
    self.dropout=nn.Dropout(dropout)

    # Each (even, odd) column pair shares one frequency:
    #   pe[pos, 2k]   = sin(pos / 10000**(2k / d_model))
    #   pe[pos, 2k+1] = cos(pos / 10000**(2k / d_model))
    # `even_dims` already holds 2k, so the exponent is even_dims / d_model.
    # (The previous loop doubled it again and gave the cos column a different
    # frequency from its sin partner.)
    position=torch.arange(max_len,dtype=torch.float).unsqueeze(1)
    even_dims=torch.arange(0,d_model,2,dtype=torch.float)
    angle=position/(10000.0**(even_dims/d_model))

    pe=torch.zeros(max_len,d_model)  #positional matrix
    pe[:,0::2]=torch.sin(angle)
    # With an odd d_model the last sin column has no cos partner.
    pe[:,1::2]=torch.cos(angle[:,:d_model//2])

    pe=pe.unsqueeze(0)
    self.register_buffer('pe',pe)

  def forward(self,x):
    """Scale x by sqrt(d_model), add the first x.size(1) positions, apply dropout."""
    x=x*math.sqrt(self.d_model)  #make embedding relatively larger
    seq_len=x.size(1)
    # Buffers never require grad; `.to` returns the moved tensor, so its result
    # must be used (the old `pe.to('cuda')` call discarded it).
    pe=self.pe[:,:seq_len].to(x.device)
    return self.dropout(x+pe)

In [None]:
class tokenize(object):
  """Sentence cleaner and word splitter backed by a spaCy pipeline."""

  def __init__(self,lang):
    # `lang` is whatever name spacy.load accepts (the notebook passes 'en' / 'fr').
    self.nlp=spacy.load(lang)

  def tokenizer(self,sentence):
    """Return the lower-cased tokens of `sentence` after punctuation clean-up.

    The substitutions run in the order listed: selected punctuation becomes a
    space, runs of spaces collapse, then repeated '!', ',' and '?' collapse.
    Tokens whose text is exactly one space are dropped.
    """
    substitutions = (
      (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
      (r"[ ]+", " "),
      (r"\!+", "!"),
      (r"\,+", ","),
      (r"\?+", "?"),
    )
    cleaned = str(sentence)
    for pattern, replacement in substitutions:
      cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.lower()

    tokens = []
    for tok in self.nlp.tokenizer(cleaned):
      if tok.text != " ":
        tokens.append(tok.text)
    return tokens

In [None]:
def read_data(opt):
  """Load the source and target corpora named on `opt`, in place.

  For each of `opt.src_data` and `opt.trg_data` that holds a file path, the
  file is read as UTF-8, stripped, split on newlines, and the resulting list
  of lines replaces the path on `opt`.

  Values that are None (not provided) or that are no longer strings (already
  loaded by an earlier call) are left alone, so re-running the cell is safe.

  Raises:
    SystemExit: if a file cannot be opened; the original OSError is chained.
  """
  for attr in ('src_data', 'trg_data'):
    path = getattr(opt, attr)
    if not isinstance(path, str):
      continue
    try:
      # `with` closes the handle; only I/O failures are reported as a missing
      # file, anything else (e.g. a decoding error) propagates unchanged.
      with open(path, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    except OSError as err:
      print("error: '" + path + "' file not found")
      # Raised explicitly rather than via the interactive quit() helper.
      raise SystemExit(1) from err
    setattr(opt, attr, lines)


def create_fields(opt):
  """Build (or load) the torchtext fields for the source and target language.

  Args:
    opt: options object; reads `src_lang`, `trg_lang` and `load_weights`.

  Returns:
    (SRC, TRG). When `opt.load_weights` is set, the fields are unpickled from
    `<load_weights>/SRC.pkl` and `<load_weights>/TRG.pkl` instead of being
    freshly constructed.

  Raises:
    SystemExit: if the pickled field files cannot be opened.
  """
  spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
  supported = ', '.join(spacy_langs)  # a list cannot be concatenated to a str
  if opt.src_lang not in spacy_langs:
    print('invalid src language: ' + opt.src_lang + ' supported languages : ' + supported)
  if opt.trg_lang not in spacy_langs:
    print('invalid trg language: ' + opt.trg_lang + ' supported languages : ' + supported)

  print('loading spacy tokenizer')
  t_src=tokenize(opt.src_lang)
  t_trg=tokenize(opt.trg_lang)

  # Only the target side gets start/end markers.
  TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
  SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

  if opt.load_weights is not None:
    print("loading presaved fields...")
    try:
      # SECURITY: unpickling executes code from the file; only point
      # load_weights at folders you created yourself.
      with open(f'{opt.load_weights}/SRC.pkl', 'rb') as src_file:
        SRC = pickle.load(src_file)
      with open(f'{opt.load_weights}/TRG.pkl', 'rb') as trg_file:
        TRG = pickle.load(trg_file)
    except OSError as err:
      print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
      raise SystemExit(1) from err

  return(SRC, TRG)


def create_dataset(opt,SRC,TRG):
  """Build the training iterator and record vocabulary-derived values on opt.

  Args:
    opt: options object. Reads `src_data`, `trg_data` (lists of sentences),
      `max_strlen`, `batchsize`, `device`, `load_weights` and `checkpoint`.
      Sets `src_pad`, `trg_pad` and `train_len`.
    SRC, TRG: torchtext fields for the source and target columns.

  Returns:
    The MyIterator over the filtered sentence pairs.

  Side effects:
    Writes `temp.csv` in the working directory (left in place) and, when
    `opt.checkpoint > 0` and no weights are being loaded, creates `weights/`
    holding the pickled fields.

  Raises:
    SystemExit: if `weights/` already exists when it is about to be created.
  """
  print('Create dataset and iterator...')
  raw_data={'src':list(opt.src_data),'trg':list(opt.trg_data)}
  df=pd.DataFrame(raw_data,columns=['src','trg'])

  # Keep only pairs where both sides contain fewer than max_strlen spaces.
  mask=(df['src'].str.count(' ')< opt.max_strlen) & (df['trg'].str.count(' ')< opt.max_strlen)
  df=df.loc[mask]

  df.to_csv('temp.csv',index=False)
  data_fields=[('src',SRC),('trg',TRG)]

  # to_csv wrote a "src,trg" header line; skip it so it is not ingested as a
  # sentence pair.
  train = data.TabularDataset('temp.csv', format='csv', fields=data_fields, skip_header=True)

  train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True, shuffle=True)

  if opt.load_weights is None:
    SRC.build_vocab(train)
    TRG.build_vocab(train)
    if opt.checkpoint > 0:
      try:
        os.mkdir("weights")
      except FileExistsError as err:
        print("weights folder already exists, run program with -load_weights weights to load them")
        raise SystemExit(1) from err
      with open('weights/SRC.pkl', 'wb') as src_file:
        pickle.dump(SRC, src_file)
      with open('weights/TRG.pkl', 'wb') as trg_file:
        pickle.dump(TRG, trg_file)

  # These must run whether the vocab was just built or came from loaded
  # fields; previously they were nested under the branch above, so the
  # function returned None whenever load_weights was set.
  opt.src_pad = SRC.vocab.stoi['<pad>']
  opt.trg_pad = TRG.vocab.stoi['<pad>']

  opt.train_len = get_len(train_iter)

  return train_iter

def get_len(train):
  """Return how many items (batches) one full pass over `train` yields.

  The iterable is consumed once. An empty iterable gives 0; the previous
  version returned the last index (one less than the count) and raised
  UnboundLocalError when there was nothing to iterate.
  """
  count = 0
  for _ in train:
    count += 1
  return count

In [None]:
class Norm(nn.Module):
  """Normalises the last dimension, then applies a learned scale and shift.

  `alpha` (initialised to ones) and `bias` (initialised to zeros) are
  parameters of size d_model; `eps` is added to the standard deviation to
  avoid division by zero.
  """

  def __init__(self,d_model,eps=1e-6):
    super().__init__()

    self.size=d_model
    self.alpha=nn.Parameter(torch.ones(self.size))
    self.bias=nn.Parameter(torch.zeros(self.size))

    self.eps=eps

  def forward(self, x):
    """Return alpha * (x - mean) / (std + eps) + bias over the last dimension."""
    centered = x - x.mean(dim=-1, keepdim=True)
    spread = x.std(dim=-1, keepdim=True) + self.eps
    scaled = self.alpha * centered
    return scaled / spread + self.bias


def attention(q,k,v,d_k,mask=None,dropout=None):
  """Scaled dot-product attention.

  Args:
    q, k, v: query, key and value tensors; scores are formed from the last two
      dimensions of q and k (q @ k^T), so the last axis of the scores indexes
      the keys.
    d_k: per-head width; scores are divided by sqrt(d_k).
    mask: optional tensor; positions where `mask == 0` are excluded. A head
      axis is inserted at dim 1 so it broadcasts against the scores.
    dropout: optional callable applied to the attention weights.

  Returns:
    The attention-weighted combination of `v`.
  """
  scores= torch.matmul(q,k.transpose(-2,-1))/math.sqrt(d_k)

  if mask is not None:
    mask=mask.unsqueeze(1)
    # A large negative score becomes (effectively) zero weight after softmax.
    scores=scores.masked_fill(mask==0,-1e9)

  # Normalise over the key axis so each query's weights sum to one. The
  # previous dim=1 normalised across heads instead.
  weights=F.softmax(scores,dim=-1)

  if dropout is not None:
    weights=dropout(weights)

  return torch.matmul(weights,v)

class MultiHeadAttention(nn.Module):
  """Multi-head attention: project, split into heads, attend, merge, project.

  `d_model` is divided evenly across `heads` (d_k = d_model // heads).
  """

  def __init__(self,heads,d_model,dropout=0.1):
    super().__init__()

    self.d_model=d_model
    self.d_k=d_model//heads
    self.h=heads

    # Creation order is kept (q, v, k, dropout, out) so parameter ordering and
    # random initialisation are unchanged.
    self.q_linear=nn.Linear(d_model,d_model)
    self.v_linear=nn.Linear(d_model,d_model)
    self.k_linear=nn.Linear(d_model,d_model)

    self.dropout=nn.Dropout(dropout)
    self.out=nn.Linear(d_model,d_model)

  def _split_heads(self,projected,batch):
    """Reshape (batch, len, d_model) to (batch, heads, len, d_k)."""
    return projected.view(batch, -1, self.h, self.d_k).transpose(1,2)

  def forward(self,q,k,v,mask=None):
    """Attend from q over k/v and return a tensor of width d_model."""
    batch=q.size(0)

    keys = self._split_heads(self.k_linear(k), batch)
    queries = self._split_heads(self.q_linear(q), batch)
    values = self._split_heads(self.v_linear(v), batch)

    attended=attention(queries,keys,values,self.d_k,mask,self.dropout)

    # Put the head axis back next to d_k and flatten the two into d_model.
    merged=attended.transpose(1,2).contiguous().view(batch,-1,self.d_model)
    return self.out(merged)


class FeedForward(nn.Module):
  """Two-layer position-wise network: d_model -> d_ff -> d_model."""

  def __init__(self,d_model,d_ff=2048,dropout=0.1):
    super().__init__()

    self.linear_1=nn.Linear(d_model,d_ff)
    self.dropout=nn.Dropout(dropout)
    self.linear_2=nn.Linear(d_ff,d_model)

  def forward(self, x):
    """Apply linear_1, ReLU, dropout and linear_2 in that order."""
    hidden = F.relu(self.linear_1(x))
    hidden = self.dropout(hidden)
    return self.linear_2(hidden)


In [None]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention then feed-forward.

  Each sub-layer is applied to a normalised copy of its input and its
  (dropped-out) output is added back to the un-normalised input.
  """

  def __init__(self,d_model,heads,dropout=0.1):
    super().__init__()
    self.norm_1=Norm(d_model)
    self.norm_2=Norm(d_model)
    self.attn=MultiHeadAttention(heads,d_model,dropout=dropout)
    self.ff=FeedForward(d_model,dropout=dropout)
    self.dropout_1=nn.Dropout(dropout)
    self.dropout_2=nn.Dropout(dropout)

  def forward(self,x,mask):
    """Return x after the attention and feed-forward residual updates."""
    # Self-attention: query, key and value are all the normalised input.
    normed=self.norm_1(x)
    attended=self.attn(normed,normed,normed,mask)
    x=x+self.dropout_1(attended)

    transformed=self.ff(self.norm_2(x))
    x=x+self.dropout_2(transformed)
    return x

class DecoderLayer(nn.Module):
  """One decoder block: self-attention, encoder attention, feed-forward.

  Each sub-layer is applied to a normalised copy of its input and its
  (dropped-out) output is added back to the un-normalised input.
  """

  def __init__(self,d_model,heads,dropout=0.1):
    super().__init__()

    self.norm_1=Norm(d_model)
    self.norm_2=Norm(d_model)
    self.norm_3=Norm(d_model)

    self.dropout_1=nn.Dropout(dropout)
    self.dropout_2=nn.Dropout(dropout)
    self.dropout_3=nn.Dropout(dropout)

    self.attn_1=MultiHeadAttention(heads,d_model,dropout=dropout)
    self.attn_2=MultiHeadAttention(heads,d_model,dropout=dropout)
    self.ff=FeedForward(d_model,dropout=dropout)

  def forward(self,x,e_outputs,src_mask,trg_mask):
    """Return x after the three residual sub-layer updates."""
    # 1) Self-attention over the target, restricted by trg_mask.
    normed=self.norm_1(x)
    self_attended=self.attn_1(normed,normed,normed,trg_mask)
    x=x+self.dropout_1(self_attended)

    # 2) Attention over the encoder outputs, restricted by src_mask.
    normed=self.norm_2(x)
    cross_attended=self.attn_2(normed,e_outputs,e_outputs,src_mask)
    x=x+self.dropout_2(cross_attended)

    # 3) Position-wise feed-forward.
    transformed=self.ff(self.norm_3(x))
    x=x+self.dropout_3(transformed)
    return x

In [None]:
def get_clones(module,N):
  """Return an nn.ModuleList holding N independent deep copies of `module`."""
  clones = nn.ModuleList()
  for _ in range(N):
    clones.append(copy.deepcopy(module))
  return clones

class Encoder(nn.Module):
  """Embedding + positional encoding followed by N encoder layers and a Norm."""

  def __init__(self,vocab_size,d_model,N,heads,dropout):
    super().__init__()

    self.N=N
    self.embed=embedding(vocab_size,d_model)
    self.pe=positional_encoding(d_model,dropout=dropout)
    # N deep copies of one freshly built layer.
    self.layers=get_clones(EncoderLayer(d_model,heads,dropout),N)
    self.norm=Norm(d_model)

  def forward(self,src,mask):
    """Encode the token ids in `src`, passing `mask` to every layer."""
    hidden=self.pe(self.embed(src))
    for layer in self.layers:
      hidden=layer(hidden,mask)
    return self.norm(hidden)

class Decoder(nn.Module):
  """Embedding + positional encoding followed by N decoder layers and a Norm."""

  def __init__(self,vocab_size,d_model,N,heads,dropout):
    super().__init__()

    self.N=N
    self.embed=embedding(vocab_size,d_model)
    self.pe=positional_encoding(d_model,dropout=dropout)
    # N deep copies of one freshly built layer.
    self.layers=get_clones(DecoderLayer(d_model,heads,dropout),N)
    self.norm=Norm(d_model)

  def forward(self,trg,e_outputs,src_mask,trg_mask):
    """Decode the token ids in `trg` against the encoder outputs."""
    hidden=self.pe(self.embed(trg))
    for layer in self.layers:
      hidden=layer(hidden,e_outputs,src_mask,trg_mask)
    return self.norm(hidden)

class Transformer(nn.Module):
  """Encoder-decoder model with a linear projection onto the target vocabulary."""

  def __init__(self,src_vocab,trg_vocab,d_model,N,heads,dropout):
    super().__init__()

    self.encoder=Encoder(src_vocab,d_model,N,heads,dropout)
    self.decoder=Decoder(trg_vocab,d_model,N,heads,dropout)
    # Maps each decoder position to one score per target-vocabulary entry.
    self.out=nn.Linear(d_model,trg_vocab)

  def forward(self,src,trg,src_mask,trg_mask):
    """Return the output-layer scores for `trg` given `src`."""
    memory=self.encoder(src,src_mask)
    decoded=self.decoder(trg,memory,src_mask,trg_mask)
    return self.out(decoded)

def get_model(opt, src_vocab, trg_vocab):
  """Build the Transformer and either load saved weights or initialise it.

  Args:
    opt: options object; reads `d_model`, `heads`, `dropout`, `n_layers`,
      `load_weights` and `device`.
    src_vocab: source vocabulary size.
    trg_vocab: target vocabulary size.

  Returns:
    The model, moved to CUDA when `opt.device == 0`.

  Raises:
    AssertionError: if d_model is not divisible by heads, or dropout >= 1.
  """
  assert opt.d_model%opt.heads==0
  assert opt.dropout < 1

  model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout)

  if opt.load_weights is not None:
    print('Loading pretrained weights')
    # Map to CPU first so a checkpoint saved from a GPU also loads on a
    # CPU-only runtime; the model is moved to CUDA below when requested.
    state = torch.load(f'{opt.load_weights}/model_weights', map_location='cpu')
    model.load_state_dict(state)

  else:
    # Xavier-initialise every weight matrix; 1-D parameters keep their defaults.
    for p in model.parameters():
      if p.dim()>1:
        nn.init.xavier_uniform_(p)

  if opt.device==0:
    model=model.cuda()

  return model

In [None]:
def nopeak_mask(size,opt):
  """Return a (1, size, size) mask that is true on and below the diagonal.

  Entries strictly above the diagonal are false. The mask is moved to CUDA
  when `opt.device == 0`.
  """
  # 1s strictly above the diagonal mark the blocked positions ...
  blocked=np.triu(np.ones((1,size,size)),k=1).astype('uint8')
  # ... so comparing with 0 yields true exactly where attention is allowed.
  allowed=Variable(torch.from_numpy(blocked)==0)
  return allowed.cuda() if opt.device==0 else allowed

def create_masks(src,trg,opt):
  """Build the padding mask for `src` and the combined mask for `trg`.

  Args:
    src: source token ids.
    trg: target token ids, or None when only the source mask is needed.
    opt: options object; reads `src_pad`, `trg_pad` and (via nopeak_mask)
      `device`.

  Returns:
    (src_mask, trg_mask). `src_mask` is true where `src` is not padding, with
    an extra axis inserted before the last one. `trg_mask` is the target
    padding mask AND-ed with the no-peek mask, or None when `trg` is None.
  """
  src_mask=(src!=opt.src_pad).unsqueeze(-2)

  if trg is None:
    return src_mask,None

  trg_mask=(trg!=opt.trg_pad).unsqueeze(-2)
  # Keep the no-peek mask on the same device as the target batch. The old
  # `np_mask.cuda()` call threw its result away, leaving the mask wherever
  # nopeak_mask had put it.
  np_mask=nopeak_mask(trg.size(1),opt).to(trg.device)
  trg_mask=trg_mask & np_mask
  return src_mask,trg_mask

class MyIterator(data.Iterator):
  """Iterator whose training batches are drawn from length-sorted pools."""

  def create_batches(self):
    """Populate self.batches.

    Training: examples are taken in pools of batch_size * 100, each pool is
    sorted with sort_key and cut into batches, and the batches of a pool are
    yielded in the order returned by random_shuffler. self.batches is a lazy
    generator in this mode.

    Otherwise: self.batches is a list of batches, each sorted with sort_key.
    """
    if not self.train:
      self.batches = [
        sorted(chunk, key=self.sort_key)
        for chunk in data.batch(self.data(), self.batch_size,self.batch_size_fn)
      ]
      return

    def pool(examples, random_shuffler):
      for chunk in data.batch(examples, self.batch_size * 100):
        ordered = sorted(chunk, key=self.sort_key)
        pooled = data.batch(ordered, self.batch_size, self.batch_size_fn)
        yield from random_shuffler(list(pooled))

    self.batches = pool(self.data(), self.random_shuffler)

global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
  """Return the padded size of a batch once `new` becomes its `count`-th example.

  The running maxima live in module-level globals and are reset whenever
  `count` is 1. Target lengths are counted with 2 extra tokens. `sofar` is
  accepted but not used.
  """
  global max_src_in_batch, max_tgt_in_batch
  if count == 1:
    # First example of a new batch: forget the previous batch's maxima.
    max_src_in_batch = max_tgt_in_batch = 0

  src_len = len(new.src)
  tgt_len = len(new.trg) + 2
  if src_len > max_src_in_batch:
    max_src_in_batch = src_len
  if tgt_len > max_tgt_in_batch:
    max_tgt_in_batch = tgt_len

  padded_src = count * max_src_in_batch
  padded_tgt = count * max_tgt_in_batch
  return padded_src if padded_src >= padded_tgt else padded_tgt

In [None]:
# Wire everything together: options -> data -> fields -> iterator -> model -> optimizer.
# get_model() and nopeak_mask() only move tensors to CUDA when opt.device == 0,
# so -1 keeps this run off the GPU.
opt.device=-1
# create_dataset() only pickles the fields to weights/ when checkpoint > 0.
opt.checkpoint=0
# Replaces opt.src_data / opt.trg_data (file paths) with lists of sentences.
read_data(opt)
SRC, TRG = create_fields(opt)
# Also sets opt.src_pad, opt.trg_pad and opt.train_len as a side effect.
opt.train = create_dataset(opt, SRC, TRG)
model = get_model(opt, len(SRC.vocab), len(TRG.vocab))
opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

loading spacy tokenizer
Create dataset and iterator...


The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [None]:
# Training loop: opt.epochs passes over opt.train, one optimizer step per batch,
# printing the mean loss every opt.printevery batches.
import time
model.train()

start = time.time()  # start of the whole run
temp = start         # start of the current reporting window

# Loss summed since the last report. It is reset only when a report is printed,
# so it is not cleared at epoch boundaries.
total_loss = 0

for epoch in range(opt.epochs):

  for i, batch in enumerate(opt.train):
    # presumably torchtext yields (seq_len, batch); the transpose would make
    # these (batch, seq_len) - TODO confirm
    src = batch.src.transpose(0,1)
    trg = batch.trg.transpose(0,1)

    # The decoder input drops the last token; the targets drop the first one,
    # so position t is trained to predict token t+1.
    trg_input = trg[:, :-1]
    targets = trg[:, 1:].contiguous().view(-1)


    # Build the source padding mask and the target padding + no-peek mask.

    src_mask, trg_mask = create_masks(src, trg_input,opt)
    preds = model(src, trg_input, src_mask, trg_mask)

    opt.optimizer.zero_grad()

    # Flatten to (positions, vocab) vs (positions,); pad targets are ignored.
    loss = F.cross_entropy(preds.view(-1, preds.size(-1)),targets, ignore_index=opt.trg_pad)
    loss.backward()
    opt.optimizer.step()

    total_loss += loss.item()
    if (i + 1) % opt.printevery == 0:
      loss_avg = total_loss / opt.printevery
      print("time = %dm, epoch %d, iter = %d, loss = %.3f,%ds per %d iters" % ((time.time() - start) // 60,epoch + 1, i + 1, loss_avg, time.time() - temp,opt.printevery))
      total_loss = 0
      temp = time.time()

In [None]:
def translate(model, src, max_len = 80, src_field=None, trg_field=None):
  """Greedily translate one source sentence.

  Args:
    model: trained model exposing `encoder`, `decoder` and `out`.
    src: the source sentence as a string.
    max_len: size of the output buffer, including the leading '<sos>'; at most
      max_len - 1 tokens are generated.
    src_field: field used to tokenise `src` and map tokens to ids. Defaults to
      the notebook-level `SRC`.
    trg_field: field used to map ids back to tokens. Defaults to the
      notebook-level `TRG`.

  Returns:
    The generated tokens joined by single spaces, without the '<sos>' and
    '<eos>' markers. An empty string if `src` yields no tokens.
  """
  # Fall back to the fields created by create_fields() in the setup cell.
  if src_field is None:
    src_field = SRC
  if trg_field is None:
    trg_field = TRG

  model.eval()
  # Run on whatever device the model lives on instead of assuming CUDA.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  tokens = src_field.preprocess(src)
  if not tokens:
    return ''
  src_ids = torch.LongTensor([[src_field.vocab.stoi[tok] for tok in tokens]]).to(device)
  src_mask = (src_ids != src_field.vocab.stoi['<pad>']).unsqueeze(-2)

  sos = trg_field.vocab.stoi['<sos>']
  eos = trg_field.vocab.stoi['<eos>']
  outputs = torch.full((max(max_len, 1),), sos, dtype=torch.long, device=device)
  end = 1  # outputs[1:end] holds the tokens generated so far

  with torch.no_grad():
    e_outputs = model.encoder(src_ids, src_mask)
    for i in range(1, max_len):
      # No-peek mask over the i tokens decoded so far.
      trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
      trg_mask = (torch.from_numpy(trg_mask) == 0).to(device)

      out = model.out(model.decoder(outputs[:i].unsqueeze(0), e_outputs, src_mask, trg_mask))
      # Greedy choice for the newest position; softmax would not change the argmax.
      next_id = int(out[0, -1].argmax())
      if next_id == eos:
        break
      outputs[i] = next_id
      end = i + 1

  return ' '.join(trg_field.vocab.itos[ix] for ix in outputs[1:end].tolist())