In [1]:
#Pre-Processing
import os
import re
import torch
import random
import itertools

#Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np

# For visualising metrics
from visdom import Visdom

# For visualising gradients plot
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import copy
import math
import time

In [2]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("The device found: "+str(device))

The device found: cpu


In [3]:
class VisdomLinePlotter(object):
    """Plots to Visdom"""
    
    def __init__(self, env_name='main'):
        self.viz = Visdom()
        self.env = env_name
        self.plots = {}
    def plot(self, var_name, split_name, title_name, x, y):
        if var_name not in self.plots:
            self.plots[var_name] = self.viz.line(X=np.array([x,x]), Y=np.array([y,y]), env=self.env, opts=dict(
                legend=[split_name],
                title=title_name,
                xlabel='Epochs',
                ylabel=var_name
            ))
        else:
            self.viz.line(X=np.array([x]), Y=np.array([y]), env=self.env, win=self.plots[var_name], name=split_name, update = 'append')

In [4]:
def plot_grad_flow(named_parameters):
    """
        Plotting gradient flow across various layers
        Thanks to: https://discuss.pytorch.org/t/check-gradient-flow-in-network/15063/2
    """   
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)

# Preprocessing

In [5]:
path='C:\\Users\\deepa\\Conversational Agents\\Datasets'
dataset='cornell movie-dialogs corpus'

data_folder=os.path.join(path,dataset)

print("The final data corpus folder: "+str(data_folder))

The final data corpus folder: C:\Users\deepa\Conversational Agents\Datasets\cornell movie-dialogs corpus


In [6]:
def get_lines_conversations():
    """
    Loads movie lines and conversations from the dataset.
    
    data_folder: Destination where conversations and lines are stored.
    
    movie_lines: Consist of movie lines as given by the dataset.
    movie_conversations: Consist of movie conversations as given by the dataset.
    
    """
    f=open(os.path.join(data_folder,'movie_lines.txt'),'r')
    movie_lines=f.read().splitlines()
    f.close()
    
    f=open(os.path.join(data_folder,'movie_conversations.txt'),'r')
    movie_conversations=f.read().splitlines()
    f.close()
    
    return movie_lines,movie_conversations


In [7]:
t1=time.time()
print("Extracting movie lines and movie conversations...")
movie_lines,movie_conversations=get_lines_conversations()

print("Number of distinct lines: "+str(len(movie_lines)))
print("Number of conversations: "+str(len(movie_conversations)))
print("Average Number of lines per conversations: "+str(len(movie_lines)/len(movie_conversations)))

print(movie_lines[0])
print(movie_conversations[0])

print("Extracting took place in: "+str(time.time()-t1))

Extracting movie lines and movie conversations...
Number of distinct lines: 304713
Number of conversations: 83097
Average Number of lines per conversations: 3.6669554857576085
L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
Extracting took place in: 0.47766828536987305


In [8]:
def loadLines(movie_lines,fields):
    lines={}
    for line in movie_lines:
        values=line.split(" +++$+++ ")
        
        lineVals={}
        
#         print("values"+str(len(values)))
#         print("fields"+str(len(fields)))
              
        for i,field in enumerate(fields):
            lineVals[field]=values[i]
        
        lines[lineVals['lineID']]=lineVals
    
    return lines

def loadConversations(movie_conversations,lines,fields):
    conversations=[]
    
    for convo in movie_conversations:
        values=convo.split(" +++$+++ ")
        conVals={}
       
        for i,field in enumerate(fields):
            conVals[field]=values[i]
        
        lineIDs=eval(conVals["utteranceIDs"])
        
        conVals["lines"]=[]
        
        for lineID in lineIDs:
            conVals["lines"].append(lines[lineID])
        conversations.append(conVals)
        
    return conversations

def sentencePairs(conversations):
    qr_pairs=[]
    
    for conversation in conversations:
        for i in range(len(conversation["lines"])-1):
            query=conversation["lines"][i]["text"].strip()
            response=conversation["lines"][i+1]["text"].strip()
            
            if query and response:
                qr_pairs.append([query,response])
        
    return qr_pairs

In [9]:
t1=time.time()
print("Separating meaningfull information for our model...")

lines={}
conversations=[]
qr_pairs=[]

movie_lines_fields=["lineID","characterID","movieID","character","text"]
movie_convo_fields=["charcaterID","character2ID","movieID","utteranceIDs"]

lines=loadLines(movie_lines,movie_lines_fields)
conversations=loadConversations(movie_conversations,lines,movie_convo_fields)
qr_pairs=sentencePairs(conversations)

print("The number of query-response pairs are: "+str(len(qr_pairs)))
print("Separation took place in: "+str(time.time()-t1))


Separating meaningfull information for our model...
The number of query-response pairs are: 221282
Separation took place in: 5.064160585403442


In [10]:
PAD_Token=0
START_Token=1
END_Token=2

class Vocabulary:
    def __init__(self):
        self.trimmed=False
        self.word2count={}
        self.index2word={PAD_Token:"PAD",START_Token:"SOS",END_Token:"EOS"}
        self.word2index={"PAD":PAD_Token,"SOS":START_Token,"EOS":END_Token}
        self.num_words=3
        
    def addSentence(self,sentence):
        for word in sentence.split(" "):
            self.addWord(word)
    def addWord(self,word):
        if word not in self.word2index:
            self.word2index[word]=self.num_words
            self.index2word[self.num_words]=word
            self.word2count[word]=1
            self.num_words=self.num_words+1
        else:
            self.word2count[word]+=1
            
    def trim(self,min_count):
        
        if self.trimmed:
            return
        self.trimmed=True
        
        keep_words=[]
        
        for word,freq in self.word2count.items():
            if freq>=min_count:
                keep_words.append(word)
        
        self.word2count={}
        self.index2word={PAD_Token:"PAD",START_Token:"SOS",END_Token:"EOS"}
        self.word2index={"PAD":PAD_Token,"SOS":START_Token,"EOS":END_Token}
        self.num_words=3
        
        for word in keep_words:
            self.addWord(word)

In [11]:
Max_Length=10

def normalizeString(s):
    s=s.lower().strip()
    s=re.sub(r"([.!?])", r" \1", s)
    s=re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s=re.sub(r"\s+", r" ", s).strip()
    return s

def readVocs(qr_pairs):
    
    for qr_pair in qr_pairs:
        qr_pair[0]=normalizeString(qr_pair[0])
        qr_pair[1]=normalizeString(qr_pair[1])
    
    voc=Vocabulary()
    return voc,qr_pairs

def filterPair(pair):
    return len(pair[0].split(" "))<Max_Length and len(pair[1].split(" "))<Max_Length

def filterPairs(qr_pairs):
    return [pair for pair in qr_pairs if filterPair(pair)]

def prepareDataset(qr_pairs):
    voc, qr_pairs=readVocs(qr_pairs)
    qr_pairs=filterPairs(qr_pairs)
       
    for pair in qr_pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
#     print("Number"+str(voc.num_words))
    return voc,qr_pairs

t1=time.time()
print("Preparing dataset and corresponding vocabulary...")
voc, pairs=prepareDataset(qr_pairs)
print("Preparation took place in: "+str(time.time()-t1))

Preparing dataset and corresponding vocabulary...
Preparation took place in: 17.518337965011597


In [12]:
Min_Count=3

def trimRareWords(voc,qr_pairs):
    
    voc.trim(Min_Count)
    keep_pairs=[]
    
    for pair in qr_pairs:
        input_sentence=pair[0]
        output_sentence=pair[1]
        
        keep_input=True
        keep_output=True
        
        for word in input_sentence.split(" "):
            if word not in voc.word2index:
                keep_input=False
                break
        
        for word in output_sentence.split(" "):
            if word not in voc.word2index:
                keep_output=False
                break
                
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    return keep_pairs

t1=time.time()
print("Trimming rare words from vocabulary and dataset..")

pairs=trimRareWords(voc,pairs)

print("Trimming took place in: "+str(time.time()-t1))


Trimming rare words from vocabulary and dataset..
Trimming took place in: 0.39127397537231445


In [13]:
def indexesFromSentence(voc,sentence):
    tokenised_sentence=[]
    tokenised_sentence.append(START_Token)
    
    for word in sentence.split(" "):
        tokenised_sentence.append(voc.word2index[word])
        
    tokenised_sentence.append(END_Token)
    
    assert len(tokenised_sentence)<=Max_Length+2
    for _ in range(Max_Length+2-len(tokenised_sentence)):
        tokenised_sentence.append(PAD_Token)
        
    return tokenised_sentence

def binaryMatrix(l,value=PAD_Token):
    m=[]
    for i,seq in enumerate(l):
        m.append([])
        for token in seq:
            if token==value:
                m[i].append(0)
            else:
                m[i].append(1)
        
    return m

def inputVar(voc,l):
    
    indexes_batch=[indexesFromSentence(voc,sentence) for sentence in l]
    input_lengths=torch.tensor([len(index) for index in indexes_batch])
    padVar=torch.LongTensor(indexes_batch)
    return input_lengths,padVar

def outputVar(voc,l):
    indexes_batch=[indexesFromSentence(voc,sentence) for sentence in l]
    max_target_len=torch.tensor([len(index) for index in indexes_batch])
    mask=binaryMatrix(indexes_batch)
    mask=torch.ByteTensor(mask)
    padVar=torch.LongTensor(indexes_batch)
    return max_target_len, mask, padVar

def batch2TrainData(voc,pair_batch):
    #sort function see 
    input_batch=[]
    output_batch=[]

    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
                                  
    
    input_lengths,tokenised_input=inputVar(voc,input_batch)
    max_out_length,mask,tokenised_output=outputVar(voc,output_batch)
    return input_lengths,tokenised_input,max_out_length,mask,tokenised_output



In [14]:
print("Number of query-response pairs after all the preprocessing: "+str(len(pairs)))

#Sample batch
batch=[random.choice(pairs) for _ in range(5)]
input_lengths,tokenised_input,max_out_length,mask,tokenised_output=batch2TrainData(voc,batch)

print("Input length: "+str(input_lengths)+" Size: "+str(input_lengths.shape))
print("-"*80)
print("Tokenised Input: "+str(tokenised_input)+" Size: "+str(tokenised_input.shape))
print("-"*80)
print("Max out length: "+str(max_out_length)+" Size: "+str(max_out_length.shape))
print("-"*80)
print("Mask: "+str(mask)+" Size: "+str(mask.shape))
print("-"*80)
print("Tokenised Output: "+str(tokenised_output)+" Size: "+str(tokenised_output.shape))
print("-"*80)

Number of query-response pairs after all the preprocessing: 53113
Input length: tensor([12, 12, 12, 12, 12]) Size: torch.Size([5])
--------------------------------------------------------------------------------
Tokenised Input: tensor([[   1,   33,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   1,   76,   37,   67,   12,  465, 1300,    4,    2,    0,    0,    0],
        [   1, 1425,   66,    2,    0,    0,    0,    0,    0,    0,    0,    0],
        [   1,  562,    4,    7, 1095,   45,  129,    6,    2,    0,    0,    0],
        [   1, 5302,    4,    2,    0,    0,    0,    0,    0,    0,    0,    0]]) Size: torch.Size([5, 12])
--------------------------------------------------------------------------------
Max out length: tensor([12, 12, 12, 12, 12]) Size: torch.Size([5])
--------------------------------------------------------------------------------
Mask: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       

# Model 1


In [15]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [16]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [17]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [18]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [19]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [20]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [21]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [22]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [23]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [24]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [25]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [26]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [27]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [28]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [29]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)

In [30]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

# Model 2

In [106]:
class EncoderDecoder(nn.Module):
    
    def __init__(self,encoder,decoder,source_embed,target_embed,generator):
        super().__init__()
        
        self.encoder=encoder
        self.decoder=decoder
        
        self.source_embed=source_embed
        self.target_embed=target_embed
        
        self.generator=generator # Linear + Log_softmax
        
    def forward(self,source,target,source_mask,target_mask):
        return self.decode(self.encode(source,source_mask),source_mask,target,target_mask)
    
    def encode(self,source,source_mask):
        return self.encoder(self.source_embed(source),source_mask)
    
    def decode(self,memory, source_mask,target,target_mask):
        return self.decoder(self.target_embed(target),memory,source_mask,target_mask)
    

In [107]:
class Generator(nn.Module):
    
    def __init__(self,d_model,vocab_size):
        super().__init__()
        self.projection=nn.Linear(d_model,vocab_size)
        
    def forward(self,decoder_output):
        return F.log_softmax(self.projection(decoder_output),dim=-1)
    

In [108]:
def clones(module,N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [109]:
class Encoder(nn.Module):
    
    def __init__(self,layer,N):
        super().__init__()
        
        self.layers=clones(layer,N)
        self.norm=LayerNorm(layer.size)
    
    def forward(self,x,mask):
        
        for layer in self.layers:
            x=layer(x,mask)
        
        return self.norm(x)

In [110]:
class EncoderLayer(nn.Module):
    def __init__(self,size,self_attn,feed_forward,dropout):
        super().__init__()
        
        self.attn=self_attn
        self.feed_forward=feed_forward
        self.sublayer=clones(SublayerConnection(size,dropout),2)
        self.size=size
        
    def forward(self,x,mask):
        
        x=self.sublayer[0](x,lambda x: self.attn(x,x,x,mask))
        return self.sublayer[1](x,self.feed_forward)
        

In [111]:
class LayerNorm(nn.Module):
    
    def __init__(self,features,eps=1e-6):
        super().__init__()
        self.a_2=nn.Parameter(torch.ones(features))
        self.b_2=nn.Parameter(torch.zeros(features))
        self.eps=eps
        
    def forward(self,x):
        mean=x.mean(-1,keepdim=True)
        std=x.std(-1,keepdim=True)
        return self.a_2*(x-mean)/(x+std)+self.b_2

In [112]:
class SublayerConnection(nn.Module):
    
    def __init__(self,size,dropout):
        super().__init__()
        
        self.dropout=nn.Dropout(dropout)
        self.norm=LayerNorm(size)
        
    def forward(self,x,sublayer):
        return x+self.dropout(sublayer(self.norm(x)))
    

In [113]:
class Decoder(nn.Module):
    
    def __init__(self,layer,N):
        super().__init__()
        
        self.layers=clones(layer,N)
        self.norm=LayerNorm(layer.size)
    
    def forward(self,x,memory,curr_mask,tgt_mask):
        
        for layer in self.layers:
            x=layer(x,memory,curr_mask,tgt_mask)
            
        return self.norm(x)
    

In [114]:
class DecoderLayer(nn.Module):
    
    def __init__(self,size,self_attn,src_attn,feed_forward,dropout):
        super().__init__()
        
        self.size=size
        self.self_attn=self_attn
        self.src_attn=src_attn
        self.feed_forward=feed_forward
        
        self.sublayer=clones(SublayerConnection(size,dropout),3)
        
    def forward(self,x,memory,src_mask,tgt_mask):
        
        m=memory
        x=self.sublayer[0](x,lambda x:self.self_attn(x,x,x,tgt_mask))
        x=self.sublayer[1](x,lambda x: self.src_attn(x,m,m,src_mask))
        return self.sublayer[2](x,self.feed_forward)
        

In [115]:
class MultiHeadedAttention(nn.Module):
    
    def __init__(self,h,d_model,dropout=0.1):
        super().__init__()
        
        assert d_model%h==0
        
        self.d_k=d_model//h
        self.h=h
        self.linears=clones(nn.Linear(d_model,d_model),4)
        self.attn=None
        self.dropout=nn.Dropout(dropout)
        
    def forward(self,query,key,values,mask=None):
        
        if mask is not None:
            mask=mask.unsqueeze(1)
            
        nbatches=query.size(0)
        
        query,key,values=[l(x).view(nbatches,-1,self.h,self.d_k).transpose(1,2) for l, x in zip(self.linears,(query,key,values))]
        
        x,self.attn=attention(query,key,values,mask=mask,dropout=self.dropout)
        
        x=x.transpose(1,2).contiguous().view(nbatches,-1,self.h*self.d_k)
        
        return self.linears[-1](x)
        

In [116]:
def attention(query,key,value,mask=None,dropout=None):
    
    d_k=query.size(-1)

    scores=torch.matmul(query,key.transpose(-2,-1))/math.sqrt(d_k)
    
    if mask is not None:
        scores=scores.masked_fill(mask==0,-1e9)
        
    p_attn=F.softmax(scores,dim=-1)
    
    if dropout is not None:
        p_attn=dropout(p_attn)
        
    return torch.matmul(p_attn,value),p_attn
    

In [117]:
class PositionwiseFeedForward(nn.Module):
    
    def __init__(self,d_model,d_ff,dropout=0.1):
        super().__init__()
        
        self.w_1=nn.Linear(d_model,d_ff)
        self.w_2=nn.Linear(d_ff,d_model)
        self.dropout=nn.Dropout(dropout)
        
    def forward(self,x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
    

In [118]:
class Embeddings(nn.Module):
    
    def __init__(self,d_model,vocab):
        super().__init__()
        
        self.embed=nn.Embedding(vocab,d_model)
        self.d_model=d_model
    
    def forward(self,x):
        return self.embed(x)*math.sqrt(self.d_model)


In [119]:
class PositionalEncoding(nn.Module):
    
    def __init__(self,d_model,dropout,max_len=5000):
        super().__init__()
        
        self.dropout=nn.Dropout(dropout)
        pe=torch.zeros(max_len,d_model,dtype=torch.float)
        position=torch.arange(0.,max_len).unsqueeze(1)
        div_term=torch.exp(torch.arange(0.,d_model,2)*-(math.log(10000.0)/d_model))
        
        pe[:,0::2]=torch.sin(position*div_term)
        pe[:,1::2]=torch.cos(position*div_term)
        
        pe=pe.unsqueeze(0)
        self.register_buffer('pe',pe)
        
    def forward(self,x):
        
        x=x+Variable(self.pe[:,:x.size(1)],requires_grad=False)
        return self.dropout(x)
        

In [120]:
def make_model2(src_vocab,tgt_vocab,N=6,d_model=512,d_ff=2048,h=8,dropout=0.1):
    
    c=copy.deepcopy
    attn=MultiHeadedAttention(h,d_model)
    ff=PositionwiseFeedForward(d_model,d_ff,dropout)
    position=PositionalEncoding(d_model,dropout)
    model=EncoderDecoder(Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),N),
                        Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),N),
                        nn.Sequential(Embeddings(d_model,src_vocab),c(position)),
                        nn.Sequential(Embeddings(d_model,tgt_vocab),c(position)),
                        Generator(d_model,tgt_vocab))
    
    for p in model.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    return model

In [121]:
sample_model=make_model(voc.num_words,voc.num_words,1,512,2048,8,0.1)
# print(sample_model)



In [35]:
#Sample Run
source=torch.ones(5,12,dtype=torch.long)
target=torch.ones(5,12,dtype=torch.long)
source_mask=None
target_mask=torch.ones(5,12,12,dtype=torch.long)
out=sample_model(source,target,source_mask,target_mask)
print("-"*80)
print("Output size: "+str(out.shape))
print("-"*80)

--------------------------------------------------------------------------------
Output size: torch.Size([5, 12, 512])
--------------------------------------------------------------------------------


In [45]:
"""
triu function generates a copy of matrix with elemens below kth diagonal zeroed.
The main diagonal is zeroeth diagonal above is first(k=1) and so on.

Eg:
A=[[1,2,3],[4,5,6],[7,8,9]]
for above matrix:
triu(A,k=1)
will give [[0,2,3],[0,0,6],[0,0,0]]
"""

def subsequent_mask(size):
    attn_shape=(1,size,size)
    mask=np.triu(np.ones(attn_shape),k=1).astype('uint8')
    
    return torch.from_numpy(mask)==0

# Training

In [32]:
def data_generation(pairs,batch_size,n_batches):
    
    sample_batches=[batch2TrainData(voc,[random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_batches)]
    batches=[]
    
    for i in range(n_batches):
        batches.append(Batch(sample_batches[i],PAD_Token))
    
    return batches

In [75]:
class Batch:
    
    def __init__(self,sample_batch,pad):
        
        self.src=sample_batch[1]
        self.src_mask=self.make_src_mask(self.src,pad)
        self.trg=sample_batch[-1][:,:-1]
        self.trg_mask=self.make_trg_mask(self.trg,pad)
        self.trg_y=sample_batch[-1][:,1:]
        self.ntokens=(self.trg_y!=pad).data.sum()
        
    @staticmethod
    def make_src_mask(src,pad):
        return (src!=pad).unsqueeze(-2)
    @staticmethod    
    def make_trg_mask(trg,pad):
        trg_mask=(trg!=pad).unsqueeze(-2)
        trg_mask=trg_mask&Variable(subsequent_mask(trg.size(-1)).type_as(trg_mask.data))
        return trg_mask
        

In [76]:
def run_epoch(data,model,loss_compute):
    
    start_time=time.time()
    total_tokens=0
    total_loss=0
    tokens=0
    
    out=model(data.src,data.trg,data.src_mask,data.trg_mask)
    loss=loss_compute(out,data.trg_y,data.ntokens)
    
    return loss
  

In [77]:
x=torch.tensor([[[4,2],[3,2]],[[2,4],[5,7]]],dtype=torch.float)
trg=torch.tensor([[1,1],[0,0]])
print(x.shape)
print(trg.shape)
print(F.softmax(x,dim=-1))
y=F.softmax(x,dim=-1)
loss=0
for i in range(trg.size()[0]):
    for j in range(trg.size()[1]):
        target=trg[i][j]
        currVal=-torch.log(y[i][j][target])
        loss+=currVal
        print(currVal)
print(loss)
    
    


torch.Size([2, 2, 2])
torch.Size([2, 2])
tensor([[[0.8808, 0.1192],
         [0.7311, 0.2689]],

        [[0.1192, 0.8808],
         [0.1192, 0.8808]]])
tensor(2.1269)
tensor(1.3133)
tensor(2.1269)
tensor(2.1269)
tensor(7.6940)


In [78]:
def customLossFunction(outputs,target):
    batch_size=outputs.size()[0]
    numberOfWords=outputs.size()[1]
    outputs=F.softmax(outputs,dim=-1)
    loss=0
    normalisingVal=0
#     print(outputs)
#     print(target)
    for i in range(batch_size):
        for j in range(numberOfWords):
            trg=target[i][j]
            if trg!=0:
                
                currLoss=-(outputs[i][j][trg]+5)
                loss+=currLoss
                normalisingVal+=1
    return loss/normalisingVal

In [79]:
x=torch.tensor([[[4,2],[3,2]],[[2,4],[5,7]]],dtype=torch.float)
trg=torch.tensor([[1,1],[0,0]])
print(customLossFunction(x,trg))

tensor(-5.1941)


In [80]:
# class LabelSmoothing(nn.Module):
    
#     def __init__(self):
#         super().__init__()
#         self.criteria=customLossFunction()
#     def forward(self,x,target):
#         return self.criteria(x,target)
# class LabelSmoothing(nn.Module):
#     "Implement label smoothing."
#     def __init__(self, size, padding_idx, smoothing=0.0):
#         super(LabelSmoothing, self).__init__()
#         self.criterion = nn.KLDivLoss(size_average=False)
#         self.padding_idx = padding_idx
#         self.confidence = 1.0 - smoothing
#         self.smoothing = smoothing
#         self.size = size
#         self.true_dist = None
        
#     def forward(self, x, target):
#         assert x.size(1) == self.size
#         true_dist = x.data.clone()
#         true_dist.fill_(self.smoothing / (self.size - 2))
#         true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
#         true_dist[:, self.padding_idx] = 0
#         mask = torch.nonzero(target.data == self.padding_idx)
#         if mask.dim() > 0:
#             true_dist.index_fill_(0, mask.squeeze(), 0.0)
#         self.true_dist = true_dist
#         return self.criterion(x, Variable(true_dist, requires_grad=False))

In [81]:
class LossCompute:
    
    def __init__(self,model,opt):
        
        self.opt=opt
        self.model=model
    
    def __call__(self,x,y,norm):
        
        x=self.model.generator(x)
        loss=customLossFunction(x,y)
        
    
        
        loss.backward()
        
        _=nn.utils.clip_grad_norm_(model.parameters(),50.0)
        
#         plot_grad_flow(self.model.named_parameters())
        
        self.opt.step()
        self.opt.zero_grad()
        
        return loss.item()
        
        

In [91]:
batches=data_generation(pairs,5,10)

In [92]:
print("Initialising and creating models....")
t1=time.time()
# criterion=LabelSmoothing()
model=make_model(voc.num_words,voc.num_words)
model_opt=torch.optim.Adam(model.parameters(),lr=0.0001,betas=(0.9,0.988),eps=1e-9)
print("="*100)
print("Creating Models took: "+str(time.time()-t1))



model.train()
for epoch in range(1000):
    
    current_batch=batches[epoch%10]
    loss_val=run_epoch(current_batch,model,LossCompute(model,model_opt))
    print("Epoch: "+str(epoch)+" Loss Value: "+str(loss_val))
    
    

Initialising and creating models....




Creating Models took: 0.7191314697265625
Epoch: 0 Loss Value: -5.000136375427246
Epoch: 1 Loss Value: -5.000407695770264
Epoch: 2 Loss Value: -5.001376152038574
Epoch: 3 Loss Value: -5.003411293029785
Epoch: 4 Loss Value: -5.000217437744141
Epoch: 5 Loss Value: -5.0172905921936035
Epoch: 6 Loss Value: -5.01046895980835
Epoch: 7 Loss Value: -5.015929222106934
Epoch: 8 Loss Value: -5.00731086730957
Epoch: 9 Loss Value: -5.012537002563477
Epoch: 10 Loss Value: -5.027617931365967
Epoch: 11 Loss Value: -5.045120716094971
Epoch: 12 Loss Value: -5.033352851867676
Epoch: 13 Loss Value: -5.031441688537598
Epoch: 14 Loss Value: -5.000166893005371
Epoch: 15 Loss Value: -5.067634105682373
Epoch: 16 Loss Value: -5.02806282043457
Epoch: 17 Loss Value: -5.039186477661133
Epoch: 18 Loss Value: -5.015294075012207
Epoch: 19 Loss Value: -5.027140140533447
Epoch: 20 Loss Value: -5.055109024047852
Epoch: 21 Loss Value: -5.085672378540039
Epoch: 22 Loss Value: -5.061043739318848
Epoch: 23 Loss Value: -5.055

Epoch: 195 Loss Value: -5.178333759307861
Epoch: 196 Loss Value: -5.207535266876221
Epoch: 197 Loss Value: -5.134806156158447
Epoch: 198 Loss Value: -5.199157238006592
Epoch: 199 Loss Value: -5.166072368621826
Epoch: 200 Loss Value: -5.160897254943848
Epoch: 201 Loss Value: -5.237556457519531
Epoch: 202 Loss Value: -5.127991676330566
Epoch: 203 Loss Value: -5.138577938079834
Epoch: 204 Loss Value: -5.236991882324219
Epoch: 205 Loss Value: -5.178390979766846
Epoch: 206 Loss Value: -5.207624912261963
Epoch: 207 Loss Value: -5.134852409362793
Epoch: 208 Loss Value: -5.19924783706665
Epoch: 209 Loss Value: -5.166126728057861
Epoch: 210 Loss Value: -5.160961151123047
Epoch: 211 Loss Value: -5.237647533416748
Epoch: 212 Loss Value: -5.128021240234375
Epoch: 213 Loss Value: -5.13861608505249
Epoch: 214 Loss Value: -5.237084865570068
Epoch: 215 Loss Value: -5.178433895111084
Epoch: 216 Loss Value: -5.207726955413818
Epoch: 217 Loss Value: -5.134887218475342
Epoch: 218 Loss Value: -5.1993284225

Epoch: 391 Loss Value: -5.427879810333252
Epoch: 392 Loss Value: -5.255983829498291
Epoch: 393 Loss Value: -5.2495646476745605
Epoch: 394 Loss Value: -5.237850666046143
Epoch: 395 Loss Value: -5.39218282699585
Epoch: 396 Loss Value: -5.29127836227417
Epoch: 397 Loss Value: -5.242871284484863
Epoch: 398 Loss Value: -5.239727973937988
Epoch: 399 Loss Value: -5.233037948608398
Epoch: 400 Loss Value: -5.289891242980957
Epoch: 401 Loss Value: -5.4279351234436035
Epoch: 402 Loss Value: -5.256014347076416
Epoch: 403 Loss Value: -5.249634742736816
Epoch: 404 Loss Value: -5.237874507904053
Epoch: 405 Loss Value: -5.392242908477783
Epoch: 406 Loss Value: -5.291306018829346
Epoch: 407 Loss Value: -5.242901802062988
Epoch: 408 Loss Value: -5.239750862121582
Epoch: 409 Loss Value: -5.233057975769043
Epoch: 410 Loss Value: -5.289928913116455
Epoch: 411 Loss Value: -5.427987098693848
Epoch: 412 Loss Value: -5.256049156188965
Epoch: 413 Loss Value: -5.249666690826416
Epoch: 414 Loss Value: -5.23788976

Epoch: 586 Loss Value: -5.291572093963623
Epoch: 587 Loss Value: -5.24315881729126
Epoch: 588 Loss Value: -5.239933967590332
Epoch: 589 Loss Value: -5.233262538909912
Epoch: 590 Loss Value: -5.290223598480225
Epoch: 591 Loss Value: -5.428426265716553
Epoch: 592 Loss Value: -5.256320476531982
Epoch: 593 Loss Value: -5.249917030334473
Epoch: 594 Loss Value: -5.238025665283203
Epoch: 595 Loss Value: -5.392719268798828
Epoch: 596 Loss Value: -5.291580677032471
Epoch: 597 Loss Value: -5.2431640625
Epoch: 598 Loss Value: -5.239938735961914
Epoch: 599 Loss Value: -5.233269214630127
Epoch: 600 Loss Value: -5.290231227874756
Epoch: 601 Loss Value: -5.42843770980835
Epoch: 602 Loss Value: -5.256327152252197
Epoch: 603 Loss Value: -5.249922275543213
Epoch: 604 Loss Value: -5.238028526306152
Epoch: 605 Loss Value: -5.392726898193359
Epoch: 606 Loss Value: -5.291585445404053
Epoch: 607 Loss Value: -5.243170261383057
Epoch: 608 Loss Value: -5.23994255065918
Epoch: 609 Loss Value: -5.233273029327393


Epoch: 782 Loss Value: -5.256384372711182
Epoch: 783 Loss Value: -5.24997615814209
Epoch: 784 Loss Value: -5.238073825836182
Epoch: 785 Loss Value: -5.392818450927734
Epoch: 786 Loss Value: -5.291641712188721
Epoch: 787 Loss Value: -5.243220806121826
Epoch: 788 Loss Value: -5.2399821281433105
Epoch: 789 Loss Value: -5.233314514160156
Epoch: 790 Loss Value: -5.290297031402588
Epoch: 791 Loss Value: -5.428532600402832
Epoch: 792 Loss Value: -5.2563862800598145
Epoch: 793 Loss Value: -5.249978065490723
Epoch: 794 Loss Value: -5.238080024719238
Epoch: 795 Loss Value: -5.392819881439209
Epoch: 796 Loss Value: -5.2916436195373535
Epoch: 797 Loss Value: -5.243221282958984
Epoch: 798 Loss Value: -5.239983081817627
Epoch: 799 Loss Value: -5.2333149909973145
Epoch: 800 Loss Value: -5.290297031402588
Epoch: 801 Loss Value: -5.428534507751465
Epoch: 802 Loss Value: -5.256387710571289
Epoch: 803 Loss Value: -5.249978065490723
Epoch: 804 Loss Value: -5.2380805015563965
Epoch: 805 Loss Value: -5.3928

Epoch: 977 Loss Value: -5.243237018585205
Epoch: 978 Loss Value: -5.239994525909424
Epoch: 979 Loss Value: -5.233327865600586
Epoch: 980 Loss Value: -5.290315628051758
Epoch: 981 Loss Value: -5.428560256958008
Epoch: 982 Loss Value: -5.25640344619751
Epoch: 983 Loss Value: -5.249993801116943
Epoch: 984 Loss Value: -5.238090991973877
Epoch: 985 Loss Value: -5.39284610748291
Epoch: 986 Loss Value: -5.249997615814209
Epoch: 987 Loss Value: -5.243237495422363
Epoch: 988 Loss Value: -5.239995002746582
Epoch: 989 Loss Value: -5.233328342437744
Epoch: 990 Loss Value: -5.290315628051758
Epoch: 991 Loss Value: -5.428560733795166
Epoch: 992 Loss Value: -5.256403923034668
Epoch: 993 Loss Value: -5.249994277954102
Epoch: 994 Loss Value: -5.238091468811035
Epoch: 995 Loss Value: -5.392846584320068
Epoch: 996 Loss Value: -5.291660785675049
Epoch: 997 Loss Value: -5.2432379722595215
Epoch: 998 Loss Value: -5.239995956420898
Epoch: 999 Loss Value: -5.233328819274902


In [100]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    
    for i in range(max_len-1):
        out = model.decode(memory, src_mask, 
                           Variable(ys), 
                           Variable(subsequent_mask(ys.size(1))
                                    .type_as(src.data)))
#         print(out)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, 
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys
    

In [94]:
model.eval()

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()

In [86]:
x,inp,y,z,output=batch2TrainData(voc,[pairs[25]])

In [87]:
print(inp)
inp_mask=(inp!=0).squeeze(-2)
inp_mask=inp_mask.view(-1,1,12)
print(inp_mask)

tensor([[  1,  42,  61, 116, 117, 118,  40, 119,   4,   2,   0,   0]])
tensor([[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          False, False]]])


In [88]:
z=greedy_decode(model,inp,inp_mask,12,1)

tensor([[[-0.6587, -1.1393,  1.1520,  0.7710,  1.2487, -0.9607, -1.4645,
          -0.5490, -0.8521,  1.4028,  0.9560,  1.0376, -1.5187, -1.3949,
          -0.9483,  1.2080, -1.0340, -1.2880,  0.4887,  1.3494, -1.2419,
           1.3007,  0.7861, -0.8768,  0.4941,  1.4953,  1.1497,  1.3361,
          -1.2833,  1.3443, -0.4097,  1.4348, -0.9108,  0.8396, -1.4078,
           1.2781,  1.0088,  0.5646, -0.5608,  1.0984,  1.2733, -0.6329,
          -1.4326, -1.3093, -0.6450, -0.7692,  0.7827, -0.3980, -0.6728,
          -0.9395,  1.0451, -0.5838, -1.2449, -0.2876, -0.5841,  1.6100,
           1.4215, -1.3143,  0.9311, -1.3905, -1.0593,  0.9174, -1.5327,
           1.3168, -0.5711,  0.7740,  0.5616, -0.4508,  0.6838,  0.4481,
           0.6446, -1.4336,  1.2074,  1.2578,  0.8648, -1.0734,  1.4286,
          -1.1363, -1.1198, -1.0466,  0.9114,  0.7474, -1.4010,  0.7102,
          -0.7590,  0.5141,  1.4681,  0.6093, -1.1598, -0.8060, -1.3878,
           0.6433, -0.6490, -0.5237, -1.3767,  0.91

In [89]:
print(z)

tensor([[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]])


In [62]:
h=torch.tensor([1,2,3])
print(h[:-1])

tensor([1, 2])


In [98]:
print(batch.src[0].size())

torch.Size([12])


In [105]:
for batch in batches:
    for i in range(5):
        output=greedy_decode(model,batch.src[i].view(-1,12),batch.src_mask[i].view(1,-1,12),12,1)
        print(str(output)+" "+str(batch.trg[i]))
    

tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([  1,   5, 115, 101,   6,   2,   0,   0,   0,   0,   0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([  1, 857, 111,   4,  25, 118,  40, 359,   4,   2,   0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([   1,  147,   37,   70, 1610,    6,    2,    0,    0,    0,    0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([   1, 1153,   50,  179,    7, 1244,    4,    2,    0,    0,    0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([   1, 2598, 1502,    4,    2,    0,    0,    0,    0,    0,    0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([ 1, 34,  4,  2,  0,  0,  0,  0,  0,  0,  0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([   1,  101, 2152,   21,    4,    2,    0,    0,    0,    0,    0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([  1,  59,  83, 359,   4,   2,   0,   0,   0,   0,   0])
tensor([[1, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2]]) tensor([ 1, 64,  6,  2,  0,  0, 