# Test code


In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [4]:
class customizedModule(nn.Module):
    """Base ``nn.Module`` offering a factory for initialised linear layers."""

    def __init__(self):
        # The constructor used to be spelled ``__init`` (and called
        # ``super().__init()``), so it was never the real initialiser and
        # would have raised if invoked. Spell it correctly so subclasses'
        # ``super().__init__()`` calls pass through here.
        super(customizedModule, self).__init__()

    def customizedLinear(self, in_dim, out_dim, activation=None, dropout=False):
        """Build an ``nn.Sequential`` around a freshly initialised linear layer.

        The linear weight is Xavier-uniform initialised and its bias is set
        to zero.

        Args:
            in_dim: input feature size of the linear layer.
            out_dim: output feature size of the linear layer.
            activation: optional module appended after the linear layer.
            dropout: falsy for no dropout; ``True`` (or any truthy non-float)
                appends ``nn.Dropout(p=self.args.dropout)``; a non-zero float
                is used directly as the dropout probability.

        Returns:
            nn.Sequential: ``[Linear, activation?, Dropout?]`` in that order.

        Raises:
            AttributeError: if ``dropout`` is a truthy flag but the module has
                no ``args`` attribute to read the probability from.
        """
        layers = nn.Sequential(nn.Linear(in_dim, out_dim))
        nn.init.xavier_uniform_(layers[0].weight)
        nn.init.constant_(layers[0].bias, 0)

        if activation is not None:
            layers.add_module(str(len(layers)), activation)
        if dropout:
            if isinstance(dropout, float):
                # Explicit probability: no dependency on self.args.
                prob = dropout
            else:
                args = getattr(self, 'args', None)
                if args is None:
                    raise AttributeError(
                        'customizedLinear(dropout=True) needs self.args.dropout; '
                        'set self.args or pass the probability as a float')
                prob = args.dropout
            layers.add_module(str(len(layers)), nn.Dropout(p=prob))
        return layers

In [3]:
class CrossAttention(customizedModule):
    """Attention weights of the rows of ``x`` with respect to a query ``q``.

    Two scoring modes are supported:
      * ``'mul'``: softmax over ``sum(W1 x * W2 q)``
      * ``'add'``: softmax over ``sigmoid(wt(W1 x + W2 q + bsa))``

    NOTE(review): a second class with the same name is defined later in this
    notebook and shadows this one once its cell has run.
    """

    def __init__(self,dx,dq,mode):
        """
        Args:
            dx: feature size of ``x``; also the projection size.
            dq: feature size of ``q``.
            mode: ``'mul'`` or ``'add'`` (checked in ``forward``).
        """
        super(CrossAttention,self).__init__()
        self.w1 = self.customizedLinear(dx,dx)
        self.w2 = self.customizedLinear(dq,dx)
        # Exclude the projection biases from training.
        self.w1[0].bias.requires_grad = False
        self.w2[0].bias.requires_grad = False

        # Scoring layer and bias used by the additive mode.
        self.wt = self.customizedLinear(dx,1,activation= nn.Sigmoid())
        self.wt[0].bias.requires_grad = False
        self.bsa = nn.Parameter(torch.zeros(dx))
        # 'mul' or 'add'
        self.mode = mode
        # When True, forward() prints intermediate tensors in 'mul' mode.
        self.debug = True

    def forward(self,x,q):
        """Return attention weights reshaped to ``(x.size(0), -1)``.

        Assumes x is (seq_len, dx) and q is (dq,) in 'mul' mode, as the
        comments in CSA.forward suggest; TODO confirm against callers.

        Raises:
            NotImplementedError: if ``self.mode`` is neither 'mul' nor 'add'.
        """
        # Compare strings by value; `is` only worked through interning.
        if self.mode == 'mul':
            if self.debug:
                print('x and q is\n')
                print(x)
                print(q)
            # W(1)x and W(2)q
            wx = self.w1(x)
            wq = self.w2(q)
            if self.debug:
                print('wx and wq is\n')
                print(wx)
                print(wq)

            # <x,q>: element-wise product, then summed over the feature axis.
            p = wx*wq
            # The original summed and soft-maxed an undefined name `s`
            # (NameError); the running score tensor `p` is what was meant.
            p = torch.sum(p,dim=1)
            # softmax along rows
            p = F.softmax(p,dim=0)
            # p = [[p1],[p2],[p3]] (xlen,1)
            p = torch.reshape(p,(p.size(0),-1))
            if self.debug:
                print(p)
            return p
        elif self.mode == 'add':
            wx = self.w1(x)
            wq = self.w2(q)
            p = self.wt(wx+wq+self.bsa)
            p = F.softmax(p,dim = 0)
            p = torch.reshape(p,(p.size(0),-1))
            return p
        else:
            raise NotImplementedError('CrossAttention error:<mul or add>')

In [4]:
class CSA(customizedModule):
    """Pairwise additive attention over the rows of ``x``, diagonal masked out.

    NOTE(review): ``forward`` currently ignores ``q`` and
    ``self.crossAttention``; only ``self.addCrossAttention`` is used.
    """

    def __init__(self,dx,dq,mode):
        """
        Args:
            dx: feature size of ``x``.
            dq: feature size of ``q``.
            mode: ``'mul'`` or ``'add'``; selects the mode of
                ``self.crossAttention``.

        Raises:
            NotImplementedError: if ``mode`` is neither 'mul' nor 'add'.
        """
        super(CSA,self).__init__()
        self.dx = dx
        self.dq = dq
        # Compare by value (not identity). The old message talked about
        # "fw or bw mask", which is unrelated to this check.
        if mode not in ('mul', 'add'):
            raise NotImplementedError("CSA error: mode must be 'mul' or 'add'")
        self.crossAttention = CrossAttention(dx,dq,mode)
        self.addCrossAttention = CrossAttention(dx,dx,'add')
        # When True, forward() prints the score matrix before and after softmax.
        self.debug = True

    def forward(self,x,q):
        """Return the row-wise softmax of the masked pairwise score matrix.

        Args:
            x: per the original comment, (seq_len, word_dim).
            q: per the original comment, (word_dim); currently unused.

        NOTE(review): assumes ``self.addCrossAttention`` returns a
        (seq_len, seq_len) matrix; verify which ``CrossAttention`` definition
        is live in the kernel. With seq_len == 1 the only entry is masked to
        -inf, so the softmax yields NaN.
        """
        # x(seq_len,word_dim) q(word_dim)
        #x = x*self.crossAttention(x,q)
        seq_len = x.size(-2)
        hi = x.unsqueeze(0)
        hj = x.unsqueeze(1)
        # p = (seq_len*seq_len): the attention of xi to xj
        pMatrix = self.addCrossAttention(hi,hj)
        # Additive mask: -inf on the diagonal, 0 elsewhere. Built on the same
        # device/dtype as the scores (the old Variable-based mask was always a
        # default-dtype CPU tensor and needs no autograd history anyway).
        M = torch.eye(seq_len, dtype=pMatrix.dtype, device=pMatrix.device)
        M[M==1]= float('-inf')
        pMatrix = pMatrix+M
        if self.debug:
            print('before pmatrix soft:\n')
            print(pMatrix)
            print('after pmatrix soft')
        pMatrix = F.softmax(pMatrix,dim=-1)
        if self.debug:
            print(pMatrix)
        return  pMatrix

In [5]:

# Smoke test for CSA on three 3-dimensional row vectors.
x = torch.tensor([[0.11,0.22,0.33],[1.1,1.2,1.3],[2.1,2.2,2.3]])
# torch.Tensor(2) allocated *uninitialised* memory; use a defined value so the
# cell is reproducible. Only q.size(-1) is read by the CSA constructor here.
q = torch.zeros(2)
model = CSA(x.size(-1),q.size(-1),'mul')
res = model(x,q)
print('res is\n')
print(res)

before pmatrix soft:

tensor([[  -inf, 0.3680, 0.3473],
        [0.3165,   -inf, 0.3292],
        [0.2816, 0.3084,   -inf]], grad_fn=<AddBackward0>)
after pmatrix soft
tensor([[0.0000, 0.5052, 0.4948],
        [0.4968, 0.0000, 0.5032],
        [0.4933, 0.5067, 0.0000]], grad_fn=<SoftmaxBackward>)
res is

tensor([[0.0000, 0.5052, 0.4948],
        [0.4968, 0.0000, 0.5032],
        [0.4933, 0.5067, 0.0000]], grad_fn=<SoftmaxBackward>)


In [6]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to its input."""

    def __init__(self, d_hid, n_position=200):
        """
        Args:
            d_hid: size of the last (feature) dimension of the table.
            n_position: number of positions pre-computed in the table.
        """
        super(PositionalEncoding, self).__init__()

        # Not a parameter: registered as a buffer so it follows the module
        # across devices and into state_dict without being trained.
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table, shape (1, n_position, d_hid).

        Entry (pos, j) is sin(angle) for even j and cos(angle) for odd j, with
        angle = pos / 10000 ** (2 * (j // 2) / d_hid).

        Built with torch only (the numpy version relied on `np`, which this
        notebook never imports). The arithmetic is done in float64 and cast to
        float32 at the end, as the numpy implementation did.
        '''
        position = torch.arange(n_position, dtype=torch.float64).unsqueeze(1)
        hid_idx = torch.arange(d_hid)
        # hid_idx - hid_idx % 2 == 2 * (hid_idx // 2): each (sin, cos) column
        # pair shares one frequency.
        paired_idx = (hid_idx - hid_idx % 2).to(torch.float64)
        base = torch.tensor(10000.0, dtype=torch.float64)
        denominator = torch.pow(base, paired_idx / d_hid)

        sinusoid_table = position / denominator
        sinusoid_table[:, 0::2] = torch.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = torch.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return sinusoid_table.to(torch.float32).unsqueeze(0)

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the position table.

        The slice is cloned and detached, so no gradient flows to the table.
        """
        return x + self.pos_table[:, :x.size(1)].clone().detach()

# cross

In [11]:
def describe(s,x):
    """Debug helper: print label ``s``, then ``x`` with its shape, then a rule."""
    header = s + '-----\n'
    footer = '------------------\n'
    print(header)
    print(x, x.shape)
    print(footer)
    

In [218]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [219]:
class CrossAttention(customizedModule):
    """Attention weights of ``x`` against a query ``q`` (verbose debug variant).

    Two scoring modes are supported:
      * ``'mul'``: softmax over ``sum(W1 x * W2 q)``
      * ``'add'``: softmax over ``wt(W1 x + W2 q + bsa)``

    Every intermediate tensor is printed through ``describe``.

    NOTE(review): this redefines a ``CrossAttention`` class declared earlier in
    the notebook; whichever cell ran last is the live definition.
    """

    def __init__(self,dx,dq,mode):
        """
        Args:
            dx: feature size of ``x``; also the projection size.
            dq: feature size of ``q``.
            mode: ``'mul'`` or ``'add'`` (checked in ``forward``).
        """
        super(CrossAttention,self).__init__()
        self.w1 = self.customizedLinear(dx,dx)
        self.w2 = self.customizedLinear(dq,dx)
        # Exclude the projection biases from training.
        self.w1[0].bias.requires_grad = False
        self.w2[0].bias.requires_grad = False

        # Scoring layer and bias used by the additive mode.
        self.wt = self.customizedLinear(dx,1)
        self.wt[0].bias.requires_grad = False
        self.bsa = nn.Parameter(torch.zeros(dx))
        # 'mul' or 'add'
        self.mode = mode

    def forward(self,x,q):
        """Return attention weights, soft-maxed over dim -2.

        The projected query gets an extra axis at position -2 so that it
        broadcasts against the rows of the projected ``x``.

        Raises:
            NotImplementedError: if ``self.mode`` is neither 'mul' nor 'add'.
        """
        # Compare strings by value; `is` only worked through interning.
        if self.mode == 'mul':
            # W(1)x W(2)q
            wx = self.w1(x)
            wq = self.w2(q)
            wq = wq.unsqueeze(-2)
            describe('wx',wx)
            describe('wq',wq)
            # <x,q>
            p = wx*wq
            describe('wx * wq',p)
            # p = [a0,a1,a2...]
            p = torch.sum(p,dim=-1,keepdim=True)
            describe('p after sum dim = -1',p)
            # softmax along rows
            p = F.softmax(p,dim=-2)
            describe('p sm(row)',p)
            #p = torch.reshape(p,(p.size(0),-1))
            return p

        elif self.mode == 'add':
            describe('x is',x)
            describe('q is',q)
            wx = self.w1(x)
            wq = self.w2(q)
            #if wx.size()
            wq = wq.unsqueeze(-2)
            describe('wx',wx)
            describe('wq',wq)
            describe('wx+wq',wx+wq)
            describe('bsa',self.bsa)
            describe('wx+wq+bsa',wx+wq+self.bsa)
            p = self.wt(wx+wq+self.bsa)
            describe('wt',p)
            p = F.softmax(p,dim = -2)
            describe('sm',p)
            return p
        else:
            raise NotImplementedError('CrossAttention error:<mul or add>')

In [221]:
# Demo of additive CrossAttention: x is a random 5x3 matrix, y a random
# length-5 vector, so the model is built with dx=3 and dq=5.
# NOTE(review): no random seed is set, so the printed values change per run.
x = torch.randn(5,3)
y = torch.randn(5)
# Alternative kept for reference: attention of x against itself.
#model = CrossAttention(x.size(-1),x.size(-1),'add')
#model(x.unsqueeze(-2),x.unsqueeze(-3))
model = CrossAttention(x.size(-1),y.size(-1),'add')
# Last expression of the cell: the returned weights are displayed as output.
model(x,y)

x is-----

tensor([[-0.6042, -0.1982, -1.8644],
        [-0.9813, -1.5395, -0.6462],
        [ 0.5115, -0.5305, -0.3826],
        [ 0.5821,  2.1120,  0.2511],
        [ 0.7245, -0.6817, -0.2272]]) torch.Size([5, 3])
------------------

q is-----

tensor([ 0.2698, -0.8742, -1.4023, -0.9082, -0.2152]) torch.Size([5])
------------------

wx-----

tensor([[-3.9654e-01, -2.6372e-01,  9.8340e-01],
        [-6.6904e-01,  1.2651e-03,  2.9508e-01],
        [-1.1406e+00, -1.9914e-01,  3.9328e-01],
        [ 1.3939e+00,  2.1127e-02, -2.0509e-01],
        [-1.4159e+00, -2.1599e-01,  3.6417e-01]], grad_fn=<AddmmBackward>) torch.Size([5, 3])
------------------

wq-----

tensor([[-1.4661, -0.9107,  0.0682]], grad_fn=<UnsqueezeBackward0>) torch.Size([1, 3])
------------------

wx+wq-----

tensor([[-1.8626, -1.1744,  1.0516],
        [-2.1351, -0.9094,  0.3633],
        [-2.6067, -1.1098,  0.4615],
        [-0.0722, -0.8895, -0.1369],
        [-2.8820, -1.1266,  0.4324]], grad_fn=<AddBackward0>) torch.

tensor([[0.1228],
        [0.2309],
        [0.2659],
        [0.0554],
        [0.3249]], grad_fn=<SoftmaxBackward>)

# Test Data

In [120]:
from torchtext import data
from torchtext.vocab import GloVe
import torch
import spacy
from torchtext.data import Iterator, BucketIterator
def tokenize(x):
    """Whitespace tokenizer used by the torchtext fields."""
    return x.split()


class getHotpotData():
    """Loads the HotpotQA train/dev CSV files into torchtext datasets and iterators.

    Attributes set by ``__init__``: ``ANSWER`` / ``QUESTION`` fields, ``train`` /
    ``dev`` datasets, and ``train_iter`` / ``dev_iter`` bucket iterators. Batches
    expose the columns as ``.Answer`` and ``.Question``.
    """

    def __init__(self,args,trainPath,devPath,):
        """
        Args:
            args: namespace providing ``batch_size`` and ``gpu`` (passed as the
                iterators' ``device``).
            trainPath: path of the training CSV file.
            devPath: path of the dev CSV file.
        """
        # spaCy pipeline backing self.tokenizer (the fields below use the
        # plain whitespace `tokenize` instead).
        self.nlp = spacy.load('en_core_web_sm')
        self.trainpath= trainPath
        self.devpath= devPath

        self.ANSWER  = data.Field(sequential=True,tokenize = tokenize,lower=True)
        self.QUESTION = data.Field(sequential=True,tokenize = tokenize,lower=True)
       # self.CONTEXT = data.Field(sequential=True,tokenize = self.tokenizer,lower=True)

      #  fields = {'context':('Context', self.CONTEXT),'answer':('Answer', self.ANSWER),'question':('Question', self.QUESTION)}
        # CSV column name -> (attribute name on each example, field)
        fields = {'answer':('Answer', self.ANSWER),'question':('Question', self.QUESTION)}
        self.train = data.TabularDataset(path = self.trainpath,format='csv',fields=fields)
        self.dev = data.TabularDataset(path = self.devpath,format='csv',fields=fields)

       # self.CONTEXT.build_vocab(self.train,self.dev,vectors=GloVe(name='6B', dim=300))
        self.QUESTION.build_vocab(self.train,self.dev,vectors=GloVe(name='6B', dim=300))
        self.ANSWER.build_vocab(self.train,self.dev)

        # An explicit sort_key is supplied because no key was given before and
        # the datasets are built without one, leaving the sorted (dev) iterator
        # nothing to order examples by.
        # NOTE(review): confirm the behaviour against the installed torchtext.
        self.train_iter,self.dev_iter = data.BucketIterator.splits(
            (self.train,self.dev),
            batch_size=args.batch_size,
            device=args.gpu,
            sort_key=lambda ex: len(ex.Question))

       # self.train_iter = data.BucketIterator(dataset=self.train, batch_size=args.batch_size, shuffle=True,device=args.gpu)
       # self.dev_iter = data.BucketIterator(dataset=self.dev, batch_size=args.batch_size, shuffle=True,device=args.gpu)

        print('load hotpot data done')

    def tokenizer(self,text):
        """Tokenize ``text`` with the spaCy pipeline and return token strings."""
        return [str(token) for token in self.nlp(text)]

    def calculate_block_size(self, B):
        """Set ``self.block_size`` from the training-set sequence lengths.

        block_size = int((2 * (std * sqrt(2 * ln B) + mean)) ** (1/3)), where
        mean/std are taken over the token counts of every question and answer
        in ``self.train``.

        Args:
            B: value whose natural log enters the formula.

        Raises:
            ValueError: if ``self.train`` has no examples.
        """
        # The examples carry `Question` / `Answer` attributes (see `fields` in
        # __init__); the former `premise` / `hypothesis` names do not exist on
        # this dataset and raised AttributeError.
        data_lengths = []
        for e in self.train.examples:
            data_lengths.append(len(e.Question))
            data_lengths.append(len(e.Answer))

        if not data_lengths:
            raise ValueError('calculate_block_size: training set has no examples')

        mean = np.mean(data_lengths)
        std = np.std(data_lengths)
        self.block_size = int((2 * (std * ((2 * np.log(B)) ** (1/2)) + mean)) ** (1/3))


In [121]:
import argparse
# Build the run configuration. parse_args(args=[]) ignores sys.argv, so every
# option takes its default inside the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', default=32, type=int)
# NOTE(review): the default is a torch.device while type=int would turn a
# command-line value into an int, so the two paths give different types.
# Confirm what the iterator's `device` argument expects.
parser.add_argument('--gpu', default=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'), type=int)
parser.add_argument('--csa-mode',default='add',type = str)
parser.add_argument('--word-dim',default=300,type = int)
args = parser.parse_args(args=[])
    
    
# NOTE(review): absolute, machine-specific paths; consider a configurable
# data directory so the notebook runs elsewhere.
trainpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalltrain100.csv'
devpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/smalldev100.csv'
# Loads both CSV files and builds vocabularies and iterators.
mydata = getHotpotData(args,trainpath,devpath)

load hotpot data done


In [122]:
#mydata.train[0].__dict__.keys()
#mydata.train[0].__dict__.values()

In [123]:
# Peek at one training batch; the cell output shows its fields and tensor sizes.
next(iter( mydata.train_iter))


[torchtext.data.batch.Batch of size 32]
	[.Answer]:[torch.cuda.LongTensor of size 115x32 (GPU 0)]
	[.Question]:[torch.cuda.LongTensor of size 40x32 (GPU 0)]

In [124]:
# Walk the training iterator and print the shape of each batch's Question
# tensor next to the batch index.
iterator = mydata.train_iter

for i,batch in enumerate(iterator):
    print(i,batch.Question.shape)


0 torch.Size([25, 4])
1 torch.Size([43, 32])
2 torch.Size([42, 32])
3 torch.Size([61, 32])


In [12]:
from torchtext import data
from torchtext.vocab import GloVe
import torch
import spacy
from torchtext.data import Iterator, BucketIterator
# Stand-alone version of the data pipeline on the two-example CSV files.
nlp = spacy.load('en_core_web_sm')   
# Plain whitespace tokenizer.
tokenize = lambda x : x.split()
def tokenizer(text):
    """Tokenize ``text`` with the spaCy pipeline ``nlp`` and return token strings."""
    return [str(token) for token in nlp(text)]

# NOTE(review): CONTEXT is defined but not attached to any column below
# (the 'context' column is mapped to None).
CONTEXT = data.Field(sequential=True,tokenize = tokenize,lower=True)
QUESTION = data.Field(sequential=True,tokenize = tokenizer,lower=True)
ANSWER = data.Field(sequential=True,tokenize = tokenizer,lower=True)

# Field list in CSV column order; the header row is skipped.
# NOTE(review): absolute, machine-specific path.
train, dev = data.TabularDataset.splits(
    path='C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small', train='smalltrain2.csv',
    validation='smalldev2.csv', format='csv',skip_header=True,
    fields=[('context',None),('question', QUESTION),('answer', ANSWER), ])

# Vocabularies are built over both the train and dev splits.
QUESTION.build_vocab(train,dev)
ANSWER.build_vocab(train,dev)



In [37]:
# Print the token counts of the question and answer of the first two
# training examples.
for i in range(0,2):
    print('data:',i)
    print('\tquestion length:',len(train.examples[i].question))
    print('\tanswer length:',len(train.examples[i].answer))

data: 0
	question length: 13
	answer length: 36
data: 1
	question length: 18
	answer length: 36


In [38]:
# Split into batches (batch size 2 for both train and dev).
# The sort key must read an attribute the examples actually have: they carry
# `question` / `answer` (see the fields passed to TabularDataset.splits), not
# `text`, so the old key raised AttributeError as soon as it was evaluated.
train_iter, devl_iter = data.BucketIterator.splits(
    (train, dev), batch_sizes=(2,2),
    sort_key=lambda x: len(x.question))
print(next(iter(train_iter)))


[torchtext.data.batch.Batch of size 2]
	[.question]:[torch.LongTensor of size 18x2]
	[.answer]:[torch.LongTensor of size 36x2]


In [48]:
# Vocabularies (index -> token lookup via .itos)
question_dict = QUESTION.vocab
answer_dict = ANSWER.vocab
for batch in train_iter:   
    # Raw batch tensor as produced by the iterator.
    print(batch.question)
    # After transposing, each row holds one example.
    Q = batch.question.transpose(0,1)
    print(Q,'\n')
    A = batch.answer.transpose(0,1)
    # Decode the first question of the batch back to text.
    for qj in Q[0]:
        print(question_dict.itos[qj]+' ',end='')
    print('\n')
    # Decode the first answer of the batch back to text.
    for aj in A[0]:
        print(answer_dict.itos[aj]+' ',end='')  

tensor([[ 3, 44],
        [32,  8],
        [21, 10],
        [29, 40],
        [35,  6],
        [ 9, 14],
        [ 4, 12],
        [28,  8],
        [17, 34],
        [42,  6],
        [25, 23],
        [ 4, 47],
        [26,  2],
        [33,  1],
        [ 7,  1],
        [11,  1],
        [16,  1],
        [ 2,  1]])
tensor([[ 3, 32, 21, 29, 35,  9,  4, 28, 17, 42, 25,  4, 26, 33,  7, 11, 16,  2],
        [44,  8, 10, 40,  6, 14, 12,  8, 34,  6, 23, 47,  2,  1,  1,  1,  1,  1]]) 

the oberoi family is part of a hotel company that has a head office in what city ? 

the oberoi family is an indian family that is famous for its involvement in hotels , namely through the oberoi group . the oberoi group is a hotel company with its head office in delhi . 

# test

importing Jupyter notebook from models.ipynb
