In [1]:
import import_ipynb
from mydata import getHotpotData
from mydata import dataProcesser
import argparse
import torch
import torch.nn as nn
from QueryEncoder import qEncoder
from PassageEncoder import psEncoder
from CSA import csa
from torch.autograd import Variable
from torchtext import data
from torchtext.vocab import GloVe
import numpy as np
def _device_arg(value):
    """Convert a ``--gpu`` command-line string into a ``torch.device``.

    A bare non-negative integer such as ``"0"`` is interpreted as a CUDA
    ordinal; any other value (``"cpu"``, ``"cuda"``, ``"cuda:1"``) is passed
    to ``torch.device`` unchanged.
    """
    if value.isdigit():
        return torch.device('cuda', int(value))
    return torch.device(value)


# Hyper-parameters and runtime options shared by the cells below.
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', default=2, type=int)
# Prefer the GPU when one is visible, otherwise fall back to the CPU.  The
# original expression selected 'cpu' in both branches and declared type=int,
# which made it impossible to pass a device name on the command line.
parser.add_argument('--gpu', default=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), type=_device_arg)
parser.add_argument('--csa-mode', default='mul', type=str)
parser.add_argument('--word-dim', default=300, type=int)
parser.add_argument('--block-size', default=-1, type=int)
parser.add_argument('--mSA-scalar', default=5.0, type=float)

#parser.add_argument('--tokenizer', default='',type = str)
# An explicit empty argv keeps argparse from reading the notebook kernel's own
# command line, so every option takes its default value here.
args = parser.parse_args(args=[])


importing Jupyter notebook from mydata.ipynb
importing Jupyter notebook from QueryEncoder.ipynb
importing Jupyter notebook from PassageEncoder.ipynb
importing Jupyter notebook from CSA.ipynb


In [2]:
class customizedModule(nn.Module):
    """Base ``nn.Module`` offering a helper for building initialised linear layers."""

    def __init__(self):
        # The constructor used to be spelled ``__init`` (and called
        # ``super().__init()``), so it was never run as a constructor and
        # would have raised AttributeError if invoked.
        super(customizedModule, self).__init__()

    def customizedLinear(self, in_dim, out_dim, activation=None, dropout=False):
        """Build an ``nn.Sequential`` holding a linear layer and optional extras.

        The linear layer's weight is Xavier-uniform initialised and its bias
        is set to zero.

        Args:
            in_dim: input feature size of the linear layer.
            out_dim: output feature size of the linear layer.
            activation: optional module appended after the linear layer.
            dropout: when true, an ``nn.Dropout`` is appended last.  Its rate
                is read from ``self.args.dropout``; this base class does not
                set ``self.args``, so the subclass must provide it.

        Returns:
            The assembled ``nn.Sequential``.
        """
        c1 = nn.Sequential(nn.Linear(in_dim, out_dim))
        nn.init.xavier_uniform_(c1[0].weight)
        nn.init.constant_(c1[0].bias, 0)

        # Sub-modules are named by their position in the container.
        if activation is not None:
            c1.add_module(str(len(c1)), activation)
        if dropout:
            c1.add_module(str(len(c1)), nn.Dropout(p=self.args.dropout))
        return c1

In [3]:
class CSATransformer(customizedModule):
    """Scores a passage against a question, one example at a time.

    Each passage is split into sentence blocks at the ``<sep>`` token, the
    blocks and the question are embedded and encoded separately, combined by
    the ``csa`` module and mapped through a linear + sigmoid decoder.
    """

    def __init__(self, args, data):
        """Create the embedding table and the sub-modules.

        Args:
            args: option namespace; ``args.word_dim`` is read here and the
                whole object is passed on to ``psEncoder`` and ``csa``.
            data: dataset wrapper exposing ``PASSAGE.vocab.vectors``
                (pre-trained embedding matrix) and ``PASSAGE.vocab.stoi``.
        """
        super(CSATransformer, self).__init__()

        self.args = args
        self.data = data

        vectors = data.PASSAGE.vocab.vectors
        self.word_emb = nn.Embedding(vectors.size(0), vectors.size(1))

        # initialize word embedding with the pre-trained vectors
        self.word_emb.weight.data.copy_(vectors)

        # fine-tune the word embedding
        self.word_emb.weight.requires_grad = True

        # index for <sep>
        self.sep_index = data.PASSAGE.vocab.stoi['<sep>']

        # model list
        self.p_encoder = psEncoder(args)
        # NOTE(review): presumably (word_dim, n_head, n_hid, dropout, nlayers);
        # the literal 300 is not tied to args.word_dim -- confirm against qEncoder.
        self.q_encoder = qEncoder(300, 4, 300, 0.1, 2)
        self.csa = csa(args, args.word_dim, args.word_dim)
        self.decoder = nn.Sequential(nn.Linear(args.word_dim, 1), nn.Sigmoid())

    def batch_init(self, batch):
        """Swap the first two axes of the Question, Answer and Passage fields.

        The batch is modified in place and returned, so calling this twice on
        the same batch object transposes the fields back again.
        """
        batch.Question = batch.Question.transpose(0, 1)
        batch.Answer = batch.Answer.transpose(0, 1)
        batch.Passage = batch.Passage.transpose(0, 1)
        return batch

    def to_block(self, passage, mlen):
        """Split a 1-D passage into one zero-padded row per sentence.

        A sentence is the run of tokens before each ``<sep>``; the separators
        themselves and any tokens after the last one are dropped.

        Args:
            passage: 1-D tensor of token indices.
            mlen: width every row is padded to.

        Returns:
            Tensor of shape ``(number of <sep> tokens, mlen)``.

        Raises:
            RuntimeError: if a sentence is longer than ``mlen``, or (from
                ``torch.stack``) if the passage contains no ``<sep>``.
        """
        sep_positions = np.where(passage.to('cpu').numpy() == self.sep_index)[0]
        rows = []
        # Starting at -1 treats the beginning of the passage like a separator,
        # so the first sentence keeps its last token (it used to be cut one
        # token short) and no special case is needed for the first iteration.
        pre_index = -1
        for s in sep_positions:
            s = int(s)
            start = pre_index + 1
            slen = s - start
            pad_len = int(mlen) - slen
            if pad_len < 0:
                raise RuntimeError(
                    'sentence of length {} starting at {} does not fit in mlen={}'
                    .format(slen, start, mlen))
            # Pad on the passage's own device so torch.cat cannot hit a
            # device mismatch.
            # NOTE(review): pads with index 0 -- confirm that 0 is the intended
            # padding index of the vocabulary.
            pad = torch.zeros(pad_len, dtype=torch.long, device=passage.device)
            rows.append(torch.cat([passage.narrow(0, start, slen), pad]))
            pre_index = s
        return torch.stack(rows, dim=0)

    def maxPassageSL(self, passage):
        """Return the padding width used for the sentence blocks of ``passage``.

        The value is the largest gap between consecutive ``<sep>`` positions
        (the first gap is measured from position 0).  For every sentence but
        the first the gap includes the separator, so the result is an upper
        bound on the sentence length.  Returns 0 when there is no ``<sep>``.
        """
        sep_positions = np.where(passage.to('cpu').numpy() == self.sep_index)[0]
        mlen = 0
        pre_index = 0
        for s in sep_positions:
            mlen = max(mlen, int(s) - pre_index)
            pre_index = int(s)
        return mlen

    def forward(self, batch):
        """Score every example of ``batch`` independently.

        Returns:
            A list with one decoder output tensor per example.
        """
        batch = self.batch_init(batch)
        pred = []
        for i in range(0, batch.batch_size):
            p = batch.Passage[i]  # p (passage_len)
            p = self.to_block(p, self.maxPassageSL(p)).unsqueeze(0)  # p (1, n_blocks, block_len)
            p = self.word_emb(p)  # p (1, n_blocks, block_len, emb_dim)
            q = batch.Question[i]  # q (question_len)
            q = self.word_emb(q).unsqueeze(0)  # q (1, question_len, emb_dim)
            print('P:{} Q {}'.format(p.shape, q.shape))
            q = self.q_encoder(q)
            p = self.p_encoder(p)
            c = self.csa(p, q)
            # squeeze() drops every size-1 dimension of the decoder output
            res = self.decoder(c).squeeze()
            print(res)
            pred.append(res)
        return pred

In [4]:
# CSV splits used for quick experiments (presumably 100-example subsets, going
# by the file names).  These files are written by the data-conversion cell
# further down, so that cell has to have been run at least once beforehand.
devpath = './small_dev_sep_100.csv'
trainpath = './small_train_sep_100.csv'
# Full-size alternatives (machine-specific absolute paths):
#   trainpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/train.csv'
#   devpath = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/dev.csv'

# Load the dataset wrapper.
m = getHotpotData(args, trainpath, devpath)
print('done')

# Build the model and move it to the configured device.
model = CSATransformer(args, m)
model = model.to(args.gpu)

# Training batches consumed by the cells below.
iterator = m.train_iter

done


In [38]:
def fixlabel(batch, dit, num_labels=100):
    """Convert the label field of a batch into multi-hot target vectors.

    ``batch.Label`` is transposed so that each row holds the label tokens of
    one example.  Every token index is mapped back to its string through
    ``dit.itos``; that string is parsed as an integer position and the
    position is set to 1.  Reading of a row stops at the first ``'<pad>'``.

    Args:
        batch: object whose ``Label`` attribute is a 2-D tensor of vocabulary
            indices.
        dit: vocabulary object exposing ``itos`` (index -> token string).
        num_labels: length of every produced vector (previously the
            hard-coded value 100).

    Returns:
        A list with one float tensor of length ``num_labels`` per example.

    Raises:
        ValueError: if a non-pad token string is not an integer.
        IndexError: if a parsed position does not fit in ``num_labels``.
    """
    result = []
    for index_tensor in batch.Label.transpose(0, 1):
        p = [0] * num_labels
        for idx in index_tensor.to('cpu').tolist():
            act_idx = dit.itos[idx]
            if act_idx == '<pad>':
                # everything after the first pad token is padding
                break
            p[int(act_idx)] = 1
        result.append(torch.Tensor(p))
    return result

# Smoke test: walk over the training batches and print the multi-hot label
# vectors that fixlabel builds for each of them.
for i, batch in enumerate(iterator):
    print(i, '====================')
    pred = fixlabel(batch, m.LABEL.vocab)  # despite the name, these are target labels
    print(pred)

<class 'torchtext.data.iterator.BucketIterator'>
[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]
[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

# Data conversion (HotpotQA JSON → CSV)

In [9]:
# Convert the train split.  The source is a machine-specific absolute path;
# the output is written next to the notebook.
# Other destinations that were used before:
#   'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/small_train_sep_100.csv'
#   'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/train.csv'
src = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/train.json'
des = './small_train_sep_100.csv'
dataProcesser(src, des, 100)

# Convert the dev split in the same way.
# Other destinations that were used before:
#   'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/small/small_dev_sep_100.csv'
#   'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/dev.csv'
print('DEV')
src = 'C:/Users/User/Documents/3.NLP/Dataset/HotpotQA/dev.json'
des = './small_dev_sep_100.csv'
dataProcesser(src, des, 100)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


write down
DEV


0% [##############################] 100% | ETA: 00:00:00

write down



Total time elapsed: 00:00:00


<mydata.dataProcesser at 0x28206d8acf8>

In [None]:
# Scratch check: the legacy torch.Tensor constructor accepts a list of numbers.
s = [1, 2, 3]
x = torch.Tensor(s)
x

In [None]:
# Scratch check: torch.Tensor cannot be built from strings directly (the
# constructor raises), so the numeric strings are parsed to floats first.
s = ['1', '2', '3']
x = torch.Tensor([float(v) for v in s])
x

In [None]:
# Scratch check: splice a '1' into a run of ten '0' characters at position idx.
# Note that this inserts rather than replaces, so the result has 11 characters.
idx = 5
s = '0' * 10
head, tail = s[:idx], s[idx:]
s = head + '1' + tail
s