In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
def describe(s,x,t):
    if t:
        print(s+':==============================\n')
        print(x,x.shape)
        print('==============================\n')

In [3]:
class customizedModule(nn.Module):
    def __init(self):
        super(customizedModule,self).__init()
    def customizedLinear(self,in_dim,out_dim,activation=None,dropout=False):
        c1 = nn.Sequential(nn.Linear(in_dim,out_dim))
        nn.init.xavier_uniform_(c1[0].weight)
        nn.init.constant_(c1[0].bias,0)
        
        if activation is not None:
            c1.add_module(str(len(c1)),activation)
        if dropout:
            c1.add_module(str(len(c1)),nn.Dropout(p=self.args.dropout))  
        return c1

# CrossAttention

In [4]:
class CrossAttention(customizedModule):
    def __init__(self,dx,dq,mode):
        super(CrossAttention,self).__init__()
        self.w1 = self.customizedLinear(dx,dx)
        self.w2 = self.customizedLinear(dq,dx)   
        self.w1[0].bias.requires_grad = False
        self.w2[0].bias.requires_grad = False
        
        # bias for add attention
        self.wt = self.customizedLinear(dx,1)
        self.wt[0].bias.requires_grad = False
        self.bsa = nn.Parameter(torch.zeros(dx))  
        # 'mul' or 'add'
        self.mode = mode  
        self.debug = False
    def forward(self,x,q):
        if self.mode is 'mul':     
            # W(1)x W(2)c
            wx = self.w1(x)
            wq = self.w2(q)
            wq = wq.unsqueeze(-2)    
            describe('wx',wx,self.debug)
            describe('wq',wq,self.debug)         
            # <x,q>
            p = wx*wq
            describe('wx * wq',p,self.debug)               
            # p = [a0,a1,a2...]
            p = torch.sum(p,dim=-1,keepdim=True)
            describe('p after sum dim = -1',p,self.debug)        
            # softmax along row       
            p = F.softmax(p,dim=-2)
            describe('p sm(row)',p,self.debug)        
            #p = torch.reshape(p,(p.size(0),-1))
            return p
        
        elif self.mode is 'add':   
            describe('x is',x,self.debug)
            describe('q is',q,self.debug)
            wx = self.w1(x)
            wq = self.w2(q) 
            wq = wq.unsqueeze(-2)
            describe('wx',wx,self.debug)
            describe('wq',wq,self.debug)
            describe('wx+wq',wx+wq,self.debug)
            describe('bsa',self.bsa,self.debug)
            describe('wx+wq+bsa',wx+wq+self.bsa,self.debug)
            p = self.wt(wx+wq+self.bsa)
            describe('wt',p,self.debug)  
            p = F.softmax(p,dim = -2)
            describe('sm',p,self.debug)
            return p
        else:
            raise NotImplementedError('CrossAttention error:<mul or add>')

# position wise feedforward network

In [5]:
class PositionwiseFeedForward(customizedModule):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = self.customizedLinear(d_in, d_hid) # position-wise
        self.w_2 = self.customizedLinear(d_hid, d_in) # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        x += residual
        x = self.layer_norm(x)
        return x

# CSA

In [6]:
class CSA(customizedModule):
    def __init__(self,args,dx,dq):
        super(CSA,self).__init__()
        self.args = args
        self.dx = dx
        self.dq = dq  
        if self.args.csa_mode is 'mul':
            self.crossAttention = CrossAttention(dx,dq,'mul')
        elif self.args.csa_mode is 'add':
            self.crossAttention = CrossAttention(dx,dq,'add')
        else:
            raise NotImplementedError('CSA->CrossAttention error')
        
        self.Wsa1 = self.customizedLinear(dx,dx)
        self.Wsa2 = self.customizedLinear(dx,dx)
        self.Wsa1[0].bias.requires_grad = False
        self.Wsa2[0].bias.requires_grad = False
        self.wsat = self.customizedLinear(dx,1)
        self.bsa1 = nn.Parameter(torch.zeros(dx))  
        self.bsa2 = nn.Parameter(torch.zeros(dx)) 
        
        self.debug = False
        self.PFN = PositionwiseFeedForward(dx,dx)
    def forward(self,x,c):
        # x(batch,seq_len,word_dim) c(batch,word_dim)
        seq_len = x.size(-2)
        p = self.crossAttention(x,c)
        describe('p',p,self.debug)
        h = x*p
        describe('h',h,self.debug)
        # p = (seq_len*seq_len): the attention of xi to xj
        hi = self.Wsa1(h)
        hj = self.Wsa2(h)
        hi = hi.unsqueeze(-2)
        hj = hj.unsqueeze(-3)
        
        #fcsa(xi,xj|c)
        fcsa = hi+hj+self.bsa1
        describe('fcsa',fcsa,self.debug)
        fcsa = self.wsat(fcsa)
        describe('w(fcsa)',fcsa,self.debug)
        fcsa = torch.sigmoid(fcsa)
        describe('sigmoid fcsa',fcsa,self.debug)
        fcsa = fcsa.squeeze()
        describe('squeeze(fcsa)',fcsa,self.debug)     
        
        # mask 對角
        M = Variable(torch.eye(seq_len)).to(self.args.gpu).detach()
        M[M==1]= float('-inf')
        fcsa = fcsa+M
        describe('fcsa+M',fcsa,self.debug)
          
            
        fcsa = F.softmax(fcsa,dim=-1)  
        describe('fcsa after sm',fcsa,self.debug)
        
        
       
        fcsa = fcsa.unsqueeze(-1)
        describe('after pmatrix add one dim',fcsa,self.debug)
        # fcsa (batch,sqlen,sqlen,fcsa(xi,xj))
        # x (batch,1,sqlen,word_dim)
        ui = fcsa*x.unsqueeze(1) 
        describe('unsqeeze x',x.unsqueeze(1),self.debug)
        describe('ui=pMatrix*x',ui,self.debug)
        ui = torch.sum(ui,1)
        describe('ui after sum dim -1',ui,self.debug)   
        ui = self.PFN(ui)
        describe('ui after PFN',ui,self.debug)   
        return  ui

# this is test

In [7]:
from torchtext import data
from torchtext.vocab import GloVe
import torch
import spacy
from torchtext.data import Iterator, BucketIterator

In [8]:
class getHotpotData():
    def __init__(self,args,trainPath,devPath,):
        self.nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])   
        self.trainpath= trainPath
        self.devpath= devPath
        
        self.ANSWER  = data.Field(tokenize = self.tokenizer)
        self.QUESTION = data.Field(tokenize = self.tokenizer)
        self.CONTEXT = data.Field(tokenize = self.tokenizer)
        fields = {'context':('Context', self.CONTEXT),'answer':('Answer', self.ANSWER),'question':('Question', self.QUESTION)}
        
        self.train = data.TabularDataset(path = self.trainpath,format='csv',fields=fields)
        self.dev = data.TabularDataset(path = self.devpath,format='csv',fields=fields)
        
        self.CONTEXT.build_vocab(self.train, vectors=GloVe(name='6B', dim=300))  
        self.QUESTION.build_vocab(self.train, vectors=GloVe(name='6B', dim=300)) 
        self.ANSWER.build_vocab(self.train, vectors=GloVe(name='6B', dim=300))
        
        self.train_iter = data.BucketIterator(dataset=self.train, batch_size=args.batch_size, shuffle=True, sort_within_batch=False, repeat=False)
        self.dev_iter = data.BucketIterator(dataset=self.dev, batch_size=args.batch_size, shuffle=True, sort_within_batch=False, repeat=False)
       
        print('load hotpot data done')
    def tokenizer(self,text):
        return [str(token) for token in self.nlp(text)]
    
    def calculate_block_size(self, B):
        data_lengths = []
        for e in self.train.examples:
            data_lengths.append(len(e.premise))
            data_lengths.append(len(e.hypothesis))

        mean = np.mean(data_lengths)
        std = np.std(data_lengths)
        self.block_size = int((2 * (std * ((2 * np.log(B)) ** (1/2)) + mean)) ** (1/3))
    



In [16]:
class NN4SNLI(customizedModule):
    def __init__(self, args, data):
        super(NN4SNLI, self).__init__()

        self.args = args
        # set hyperparameters
        # r: length of inner blocks
        #self.args.r = self.args.block_size
       # self.args.c = self.args.mSA_scalar

        self.word_emb = nn.Embedding(len(data.CONTEXT.vocab.vectors), len(data.CONTEXT.vocab.vectors[0]))
        # initialize word embedding with GloVe
        self.word_emb.weight.data.copy_(data.CONTEXT.vocab.vectors)
        # fine-tune the word embedding
        self.word_emb.weight.requires_grad = True
        # <unk> vectors is randomly initialized
        nn.init.uniform(self.word_emb.weight.data[0], -0.05, 0.05)

        self.csa = CSA(args,args.word_dim, args.word_dim)

    def forward(self, batch):
        c = self.word_emb(batch.Context)
        q = self.word_emb(batch.Question)
        describe('c',c,True)
        describe('q',q,True)
        x = self.csa(c,q)
        return x

In [17]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', default=32, type=int)
parser.add_argument('--gpu', default=torch.device('cpu' if torch.cuda.is_available() else 'cpu'), type=int)
parser.add_argument('--csa-mode',default='add',type = str)
parser.add_argument('--word-dim',default=300,type = int)
args = parser.parse_args(args=[])
trainpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalltrain.csv'
devpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalldev.csv'
mydata = getHotpotData(args,devpath,devpath)
model = NN4SNLI(args,mydata)
print('start')

load hotpot data done




start


TypeError: 'generator' object is not subscriptable

In [18]:
iterator = mydata.train_iter
for i, batch in enumerate(iterator):
    present_epoch = int(iterator.epoch)
    if present_epoch > 10:
        break
    x = model(batch)
    print(x)


tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0181, -0.0628, -0.1345,  ..., -0.3822,  0.1050,  0.0631],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0747, -0.0400,  0.4423,  ..., -0.4493,  0.1235, -0.4446],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0013,  0.3651, -0.0774,  ..., -0.1836, -0.7652,  0.3921]],

        [[-0.5258,  0.3754, -0.4145,  ..., -0.0208, -0.0290,  0.0780],
         [ 0.0000,  0.0000,  0.0000,  ...,  

RuntimeError: The size of tensor a (2258) must match the size of tensor b (32) at non-singleton dimension 1

# Encoder
+ self-attention
+ S2T

In [None]:
#class selfAttention(customizedModule):
#    super(selfAttention,self):
        
#    def forward(x):