In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
def describe(s,x,t):
    if t:
        print(s+':==============================\n')
        print(x,x.shape)
        print('==============================\n')

In [3]:
class customizedModule(nn.Module):
    def __init(self):
        super(customizedModule,self).__init()
    def customizedLinear(self,in_dim,out_dim,activation=None,dropout=False):
        c1 = nn.Sequential(nn.Linear(in_dim,out_dim))
        nn.init.xavier_uniform_(c1[0].weight)
        nn.init.constant_(c1[0].bias,0)
        
        if activation is not None:
            c1.add_module(str(len(c1)),activation)
        if dropout:
            c1.add_module(str(len(c1)),nn.Dropout(p=self.args.dropout))  
        return c1

# CrossAttention

In [4]:
class CrossAttention(customizedModule):
    def __init__(self,dx,dq,mode):
        super(CrossAttention,self).__init__()
        self.w1 = self.customizedLinear(dx,dx)
        self.w2 = self.customizedLinear(dq,dx)   
        self.w1[0].bias.requires_grad = False
        self.w2[0].bias.requires_grad = False
        
        # bias for add attention
        self.wt = self.customizedLinear(dx,1)
        self.wt[0].bias.requires_grad = False
        self.bsa = nn.Parameter(torch.zeros(dx))  
        # 'mul' or 'add'
        self.mode = mode  
        self.debug = False
    def forward(self,x,q):
        if self.mode is 'mul':     
            # W(1)x W(2)c
            wx = self.w1(x)
            wq = self.w2(q)
            wq = wq.unsqueeze(-2)    
            describe('wx',wx,self.debug)
            describe('wq',wq,self.debug)         
            # <x,q>
            p = wx*wq
            describe('wx * wq',p,self.debug)               
            # p = [a0,a1,a2...]
            p = torch.sum(p,dim=-1,keepdim=True)
            describe('p after sum dim = -1',p,self.debug)        
            # softmax along row       
            p = F.softmax(p,dim=-2)
            describe('p sm(row)',p,self.debug)        
            #p = torch.reshape(p,(p.size(0),-1))
            return p
        
        elif self.mode is 'add':   
            describe('x is',x,self.debug)
            describe('q is',q,self.debug)
            wx = self.w1(x)
            wq = self.w2(q) 
            wq = wq.unsqueeze(-2)
            describe('wx',wx,self.debug)
            describe('wq',wq,self.debug)
            describe('wx+wq',wx+wq,self.debug)
            describe('bsa',self.bsa,self.debug)
            describe('wx+wq+bsa',wx+wq+self.bsa,self.debug)
            p = self.wt(wx+wq+self.bsa)
            describe('wt',p,self.debug)  
            p = F.softmax(p,dim = -2)
            describe('sm',p,self.debug)
            return p
        else:
            raise NotImplementedError('CrossAttention error:<mul or add>')

# position wise feedforward network

In [5]:
class PositionwiseFeedForward(customizedModule):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = self.customizedLinear(d_in, d_hid) # position-wise
        self.w_2 = self.customizedLinear(d_hid, d_in) # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        x += residual
        x = self.layer_norm(x)
        return x

# CSA

In [14]:
class CSA(customizedModule):
    def __init__(self,args,dx,dq):
        super(CSA,self).__init__()
        self.args = args
        self.dx = dx
        self.dq = dq  
        if self.args.csa_mode is 'mul':
            self.crossAttention = CrossAttention(dx,dq,'mul')
        elif self.args.csa_mode is 'add':
            self.crossAttention = CrossAttention(dx,dq,'add')
        else:
            raise NotImplementedError('CSA->CrossAttention error')
        
        self.Wsa1 = self.customizedLinear(dx,dx)
        self.Wsa2 = self.customizedLinear(dx,dx)
        self.Wsa1[0].bias.requires_grad = False
        self.Wsa2[0].bias.requires_grad = False
        self.wsat = self.customizedLinear(dx,1)
        self.bsa1 = nn.Parameter(torch.zeros(dx))  
        self.bsa2 = nn.Parameter(torch.zeros(dx)) 
        
        self.debug = True
        self.PFN = PositionwiseFeedForward(dx,dx)
    def forward(self,x,c):
        # x(batch,seq_len,word_dim) c(batch,word_dim)
        seq_len = x.size(-2)
        p = self.crossAttention(x,c)
        describe('p',p,self.debug)
        h = x*p
        describe('h',h,self.debug)
        # p = (seq_len*seq_len): the attention of xi to xj
        hi = self.Wsa1(h)
        hj = self.Wsa2(h)
        hi = hi.unsqueeze(-2)
        hj = hj.unsqueeze(-3)
        
        #fcsa(xi,xj|c)
        fcsa = hi+hj+self.bsa1
        describe('fcsa',fcsa,self.debug)
        fcsa = self.wsat(fcsa)
        describe('w(fcsa)',fcsa,self.debug)
        fcsa = torch.sigmoid(fcsa)
        describe('sigmoid fcsa',fcsa,self.debug)
        fcsa = fcsa.squeeze()
        describe('squeeze(fcsa)',fcsa,self.debug)     
        
        # mask 對角
        M = Variable(torch.eye(seq_len)).to(self.args.gpu).detach()
        M[M==1]= float('-inf')
        fcsa = fcsa+M
        describe('fcsa+M',fcsa,self.debug)
          
            
        fcsa = F.softmax(fcsa,dim=-1)  
        describe('fcsa after sm',fcsa,self.debug)
        
        
       
        fcsa = fcsa.unsqueeze(-1)
        describe('after pmatrix add one dim',fcsa,self.debug)
        # fcsa (batch,sqlen,sqlen,fcsa(xi,xj))
        # x (batch,1,sqlen,word_dim)
        ui = fcsa*x.unsqueeze(1) 
        describe('unsqeeze x',x.unsqueeze(1),self.debug)
        describe('ui=pMatrix*x',ui,self.debug)
        ui = torch.sum(ui,1)
        describe('ui after sum dim -1',ui,self.debug)   
        ui = self.PFN(ui)
        describe('ui after PFN',ui,self.debug)   
        return  ui

# this is test

In [15]:
from torchtext import data
from torchtext.vocab import GloVe
import torch
import spacy
from torchtext.data import Iterator, BucketIterator

In [16]:
class getHotpotData():
    def __init__(self,args,trainPath,devPath,):
        self.nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])   
        self.trainpath= trainPath
        self.devpath= devPath
        
        self.ANSWER  = data.Field(tokenize = self.tokenizer)
        self.QUESTION = data.Field(tokenize = self.tokenizer)
        self.CONTEXT = data.Field(tokenize = self.tokenizer)
        fields = {'context':('Context', self.CONTEXT),'answer':('Answer', self.ANSWER),'question':('Question', self.QUESTION)}
        
        self.train = data.TabularDataset(path = self.trainpath,format='csv',fields=fields)
        self.dev = data.TabularDataset(path = self.devpath,format='csv',fields=fields)
        
        self.CONTEXT.build_vocab(self.train, vectors=GloVe(name='6B', dim=300))  
        self.QUESTION.build_vocab(self.train, vectors=GloVe(name='6B', dim=300)) 
        self.ANSWER.build_vocab(self.train)
        
        self.train_iter,self.dev_iter = data.BucketIterator.splits((self.train,self.dev),sort_key=lambda x: len(x.Question),sort_within_batch=True,shuffle=True,batch_size=args.batch_size,device=args.gpu)
       
        print('load hotpot data done')
    def tokenizer(self,text):
        return [str(token) for token in self.nlp(text)]
    
    def calculate_block_size(self, B):
        data_lengths = []
        for e in self.train.examples:
            data_lengths.append(len(e.premise))
            data_lengths.append(len(e.hypothesis))

        mean = np.mean(data_lengths)
        std = np.std(data_lengths)
        self.block_size = int((2 * (std * ((2 * np.log(B)) ** (1/2)) + mean)) ** (1/3))
    



In [17]:
class NN4SNLI(customizedModule):
    def __init__(self, args, data):
        super(NN4SNLI, self).__init__()

        self.args = args
        # set hyperparameters
        # r: length of inner blocks
        #self.args.r = self.args.block_size
       # self.args.c = self.args.mSA_scalar

        self.word_emb = nn.Embedding(len(data.CONTEXT.vocab.vectors), len(data.CONTEXT.vocab.vectors[0]))
        # initialize word embedding with GloVe
        self.word_emb.weight.data.copy_(data.CONTEXT.vocab.vectors)
        # fine-tune the word embedding
        self.word_emb.weight.requires_grad = True


        self.csa = CSA(args,args.word_dim, args.word_dim)

    def forward(self, batch):
        q = self.word_emb(batch.Question)
        a = self.word_emb(batch.Answer)
        a = torch.sum(q,dim = -2)
        x = self.csa(q,a)
        print(x.shape)
        return x

In [18]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', default=8, type=int)
parser.add_argument('--gpu', default=torch.device('cpu' if torch.cuda.is_available() else 'cpu'), type=int)
parser.add_argument('--csa-mode',default='add',type = str)
parser.add_argument('--word-dim',default=300,type = int)
args = parser.parse_args(args=[])
trainpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalltrain100.csv'
devpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalldev100.csv'
mydata = getHotpotData(args,trainpath,devpath)
model = NN4SNLI(args,mydata)

print('start')

load hotpot data done
start


In [19]:
iterator = mydata.train_iter

In [20]:
for i, batch in enumerate(iterator):
    print('i= '+ str(i))
    model.train()
    batch.Question = batch.Question.transpose(0,1)
    batch.Answer = batch.Answer.transpose(0,1)
    batch.Context = batch.Context.transpose(0,1)
    if i > 1:
        break
    x = model(batch)
    describe('x',x,True)

i= 0

tensor([[[0.2077],
         [0.0457],
         [0.0701],
         [0.0701],
         [0.0673],
         [0.1123],
         [0.0782],
         [0.0433],
         [0.0701],
         [0.0436],
         [0.0369],
         [0.0701],
         [0.0844]],

        [[0.1799],
         [0.0608],
         [0.0973],
         [0.0513],
         [0.0973],
         [0.0608],
         [0.0396],
         [0.0350],
         [0.1101],
         [0.0980],
         [0.0361],
         [0.0732],
         [0.0608]],

        [[0.2029],
         [0.0446],
         [0.0685],
         [0.0503],
         [0.1097],
         [0.0473],
         [0.1098],
         [0.0426],
         [0.0685],
         [0.0360],
         [0.0685],
         [0.0825],
         [0.0685]],

        [[0.1389],
         [0.0783],
         [0.0636],
         [0.0762],
         [0.0780],
         [0.0482],
         [0.0762],
         [0.0585],
         [0.0786],
         [0.0730],
         [0.0583],
         [0.0939],
         [0.0780]],

            0.0000e+00,  0.0000e+00]]]], grad_fn=<AddBackward0>) torch.Size([8, 13, 13, 300])


tensor([[[[ 2.5116e-01],
          [ 8.9230e-02],
          [ 1.0820e-01],
          ...,
          [ 1.0936e-01],
          [ 1.0820e-01],
          [ 9.4060e-02]],

         [[ 1.5528e-01],
          [-6.6508e-03],
          [ 1.2317e-02],
          ...,
          [ 1.3483e-02],
          [ 1.2317e-02],
          [-1.8205e-03]],

         [[ 1.4296e-01],
          [-1.8968e-02],
          [ 0.0000e+00],
          ...,
          [ 1.1661e-03],
          [ 0.0000e+00],
          [-1.4137e-02]],

         ...,

         [[ 1.4180e-01],
          [-2.0124e-02],
          [-1.1560e-03],
          ...,
          [ 1.0164e-05],
          [-1.1560e-03],
          [-1.5293e-02]],

         [[ 1.4296e-01],
          [-1.8968e-02],
          [ 0.0000e+00],
          ...,
          [ 1.1661e-03],
          [ 0.0000e+00],
          [-1.4137e-02]],

         [[ 1.3683e-01],
          [-2.5098e-02],
    

          [0.5000]]]], grad_fn=<SigmoidBackward>) torch.Size([8, 13, 13, 1])


tensor([[[0.5625, 0.5223, 0.5270,  ..., 0.5273, 0.5270, 0.5235],
         [0.5387, 0.4983, 0.5031,  ..., 0.5034, 0.5031, 0.4995],
         [0.5357, 0.4953, 0.5000,  ..., 0.5003, 0.5000, 0.4965],
         ...,
         [0.5354, 0.4950, 0.4997,  ..., 0.5000, 0.4997, 0.4962],
         [0.5357, 0.4953, 0.5000,  ..., 0.5003, 0.5000, 0.4965],
         [0.5342, 0.4937, 0.4985,  ..., 0.4988, 0.4985, 0.4949]],

        [[0.5542, 0.5234, 0.5302,  ..., 0.5169, 0.5204, 0.5234],
         [0.5309, 0.5000, 0.5068,  ..., 0.4934, 0.4969, 0.5000],
         [0.5438, 0.5130, 0.5198,  ..., 0.5064, 0.5099, 0.5130],
         ...,
         [0.5292, 0.4983, 0.5051,  ..., 0.4917, 0.4952, 0.4983],
         [0.5296, 0.4987, 0.5055,  ..., 0.4921, 0.4956, 0.4987],
         [0.5309, 0.5000, 0.5068,  ..., 0.4934, 0.4969, 0.5000]],

        [[0.5610, 0.5218, 0.5264,  ..., 0.5264, 0.5230, 0.5264],
         [0.5379, 0.4984, 0.5030,  ..., 0.50

          [0.0000]]]], grad_fn=<UnsqueezeBackward0>) torch.Size([8, 13, 13, 1])


tensor([[[[-0.1083, -0.2620, -0.4405,  ...,  0.0137, -0.4914, -0.0845],
          [-0.1083,  0.4610,  0.2451,  ..., -0.2406, -0.2503, -0.1641],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-0.2623,  0.1263, -0.0842,  ..., -0.6686, -0.2263,  0.1499],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539]]],


        [[[-0.1083, -0.2620, -0.4405,  ...,  0.0137, -0.4914, -0.0845],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0385, -0.0398,  0.0827,  ..., -0.3343,  0.0118,  0.0597],
          ...,
          [-0.8883, -0.2485, -0.0355,  ...,  0.2312, -0.1904,  0.4954],
          [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],


        [[[-0.10

       grad_fn=<MulBackward0>) torch.Size([8, 13, 13, 300])


tensor([[[-0.1116, -0.2700, -0.4540,  ...,  0.0141, -0.5064, -0.0871],
         [-0.1072,  0.4565,  0.2426,  ..., -0.2382, -0.2478, -0.1625],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.2610,  0.1257, -0.0838,  ..., -0.6653, -0.2252,  0.1492],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0462,  0.2113, -0.0074,  ...,  0.0090, -0.2081,  0.0534]],

        [[-0.1114, -0.2695, -0.4531,  ...,  0.0141, -0.5054, -0.0869],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0386, -0.0400,  0.0831,  ..., -0.3357,  0.0119,  0.0600],
         ...,
         [-0.8804, -0.2462, -0.0352,  ...,  0.2291, -0.1887,  0.4910],
         [ 0.0463,  0.2120, -0.0074,  ...,  0.0090, -0.2087,  0.0536],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.1117, -0.2701, -0.4540,  ...,  0.0141, -0

           0.0000e+00,  0.0000e+00]]], grad_fn=<NativeLayerNormBackward>) torch.Size([8, 13, 300])

i= 1

tensor([[[0.0435],
         [0.0152],
         [0.0237],
         [0.0310],
         [0.0164],
         [0.0254],
         [0.0513],
         [0.0335],
         [0.0183],
         [0.0245],
         [0.0245],
         [0.0151],
         [0.0199],
         [0.0335],
         [0.0245],
         [0.0375],
         [0.0261],
         [0.0448],
         [0.0245],
         [0.0245],
         [0.0254],
         [0.0214],
         [0.0239],
         [0.0272],
         [0.0245],
         [0.0381],
         [0.0266],
         [0.0245],
         [0.0239],
         [0.0216],
         [0.0468],
         [0.0245],
         [0.0378],
         [0.0294],
         [0.0468]],

        [[0.0378],
         [0.0212],
         [0.0151],
         [0.0212],
         [0.0443],
         [0.0231],
         [0.0212],
         [0.0186],
         [0.0406],
         [0.0212],
         [0.0336],
         [0.0406],

       grad_fn=<MulBackward0>) torch.Size([8, 35, 300])


tensor([[[[-2.3959e-02,  1.8510e-02, -1.4096e-03,  ..., -1.0801e-02,
            1.4476e-02,  7.0494e-03],
          [-1.7446e-02,  2.8837e-02, -7.2387e-03,  ..., -2.5570e-02,
            1.7922e-02, -5.9244e-03],
          [-2.1811e-02,  4.3759e-03, -8.9538e-04,  ..., -1.6457e-02,
            3.3977e-02, -1.1033e-02],
          ...,
          [-2.6950e-02,  2.0324e-02, -5.3670e-03,  ..., -1.9040e-02,
            1.1004e-02, -1.8508e-02],
          [-1.3982e-02,  1.2943e-02,  7.3994e-03,  ..., -1.2403e-02,
            1.5195e-02, -1.0396e-02],
          [-1.1904e-02,  1.8071e-02,  2.0089e-02,  ..., -1.9538e-02,
            1.3440e-02, -1.6250e-02]],

         [[-2.2555e-03,  1.4051e-02, -1.2922e-02,  ..., -4.3862e-03,
            3.8413e-03,  1.1703e-02],
          [ 4.2573e-03,  2.4378e-02, -1.8751e-02,  ..., -1.9155e-02,
            7.2870e-03, -1.2705e-03],
          [-1.0738e-04, -8.3035e-05, -1.2408e-02,  ..., -1.0042e-02,


            0.0000e+00,  0.0000e+00]]]], grad_fn=<AddBackward0>) torch.Size([8, 35, 35, 300])


tensor([[[[ 0.0243],
          [ 0.0021],
          [ 0.0319],
          ...,
          [ 0.0102],
          [-0.0043],
          [-0.0182]],

         [[ 0.0271],
          [ 0.0049],
          [ 0.0347],
          ...,
          [ 0.0130],
          [-0.0014],
          [-0.0154]],

         [[ 0.0308],
          [ 0.0087],
          [ 0.0384],
          ...,
          [ 0.0167],
          [ 0.0023],
          [-0.0116]],

         ...,

         [[ 0.0335],
          [ 0.0114],
          [ 0.0411],
          ...,
          [ 0.0194],
          [ 0.0050],
          [-0.0089]],

         [[ 0.0214],
          [-0.0007],
          [ 0.0291],
          ...,
          [ 0.0073],
          [-0.0071],
          [-0.0210]],

         [[ 0.0453],
          [ 0.0231],
          [ 0.0529],
          ...,
          [ 0.0312],
          [ 0.0168],
          [ 0.0028]]],


        [[[ 0.0210],
        

          [0.5000]]]], grad_fn=<SigmoidBackward>) torch.Size([8, 35, 35, 1])


tensor([[[0.5061, 0.5005, 0.5080,  ..., 0.5025, 0.4989, 0.4954],
         [0.5068, 0.5012, 0.5087,  ..., 0.5032, 0.4996, 0.4962],
         [0.5077, 0.5022, 0.5096,  ..., 0.5042, 0.5006, 0.4971],
         ...,
         [0.5084, 0.5028, 0.5103,  ..., 0.5049, 0.5013, 0.4978],
         [0.5054, 0.4998, 0.5073,  ..., 0.5018, 0.4982, 0.4947],
         [0.5113, 0.5058, 0.5132,  ..., 0.5078, 0.5042, 0.5007]],

        [[0.5053, 0.5001, 0.5011,  ..., 0.5049, 0.5022, 0.4991],
         [0.5051, 0.5000, 0.5010,  ..., 0.5047, 0.5020, 0.4989],
         [0.5067, 0.5016, 0.5026,  ..., 0.5063, 0.5036, 0.5005],
         ...,
         [0.5047, 0.4996, 0.5006,  ..., 0.5043, 0.5016, 0.4985],
         [0.5070, 0.5019, 0.5028,  ..., 0.5066, 0.5039, 0.5008],
         [0.5047, 0.4995, 0.5005,  ..., 0.5043, 0.5016, 0.4985]],

        [[0.5000, 0.5022, 0.5025,  ..., 0.4985, 0.4987, 0.5000],
         [0.5012, 0.5033, 0.5036,  ..., 0.49

          [0.0000]]]], grad_fn=<UnsqueezeBackward0>) torch.Size([8, 35, 35, 1])


tensor([[[[ 0.0735,  0.2574,  0.2359,  ..., -0.4011, -0.5682, -0.4686],
          [ 0.0406, -0.0284,  0.2501,  ..., -0.7256, -0.1930,  0.2275],
          [ 0.0963, -0.2143,  0.1403,  ..., -0.3957, -0.1794,  0.5939],
          ...,
          [-0.5952,  0.3236, -0.0453,  ..., -0.2919, -0.0519,  0.0039],
          [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
          [-0.0769, -0.0212,  0.2127,  ...,  0.1835, -0.2918, -0.0465]]],


        [[[ 0.0735,  0.2574,  0.2359,  ..., -0.4011, -0.5682, -0.4686],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [-0.3676,  0.3950, -0.2703,  ..., -0.3725, -0.3899,  0.0315],
          ...,
          [-0.2213,  0.4714, -0.1666,  ..., -0.4044, -0.6878,  0.0392],
          [-0.1088,  0.0182, -0.0241,  ..., -0.8055, -0.3016, -0.1170],
          [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539]]],


        [[[ 0.00

       grad_fn=<MulBackward0>) torch.Size([8, 35, 35, 300])


tensor([[[ 0.0738,  0.2585,  0.2369,  ..., -0.4028, -0.5707, -0.4706],
         [ 0.0405, -0.0283,  0.2498,  ..., -0.7246, -0.1928,  0.2272],
         [ 0.0969, -0.2156,  0.1412,  ..., -0.3982, -0.1805,  0.5976],
         ...,
         [-0.5957,  0.3238, -0.0453,  ..., -0.2922, -0.0519,  0.0039],
         [ 0.0464,  0.2126, -0.0074,  ...,  0.0090, -0.2093,  0.0538],
         [-0.0765, -0.0211,  0.2114,  ...,  0.1823, -0.2900, -0.0462]],

        [[ 0.0739,  0.2586,  0.2370,  ..., -0.4030, -0.5710, -0.4709],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.3678,  0.3953, -0.2705,  ..., -0.3728, -0.3902,  0.0315],
         ...,
         [-0.2223,  0.4735, -0.1673,  ..., -0.4062, -0.6908,  0.0394],
         [-0.1090,  0.0183, -0.0242,  ..., -0.8069, -0.3021, -0.1172],
         [ 0.0465,  0.2129, -0.0074,  ...,  0.0090, -0.2096,  0.0538]],

        [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0

           0.0000e+00,  0.0000e+00]]], grad_fn=<NativeLayerNormBackward>) torch.Size([8, 35, 300])

i= 2


# Encoder
+ self-attention
+ S2T

In [13]:
#class selfAttention(customizedModule):
#    super(selfAttention,self):
        
#    def forward(x):