In [1]:
import json
import torch
from torch.utils.data import Dataset

# Relative path to the relevance-label training file; the cells below read it
# line by line and parse a line as JSON.
path = "../dataset/topiocqa_train_rel_label.json"

In [2]:
# Pull the whole file into memory as a list of lines (line terminators kept).
with open(path, mode='r') as f:
    data = list(f)

In [3]:
# Parse the first line of the file so the record schema can be inspected.
example = data[0]
e_dict = json.loads(example)

In [4]:
# Display the parsed record (keys: id, query, rel_query, rel_label).
e_dict

{'id': '1-2-1',
 'query': 'was the battle fought in australia?',
 'rel_query': "what was australia's contribution to the battle of normandy?",
 'rel_label': 1}

In [5]:
from transformers import BertModel, BertTokenizer
import torch
# Local directory (relative to this notebook) that both the tokenizer and the
# model are loaded from.
bert_path = "../../bertmodel"
tokenizer = BertTokenizer.from_pretrained(bert_path)
# The load message printed below lists the checkpoint's `cls.*` weights that
# BertModel does not use.
model = BertModel.from_pretrained(bert_path)

Some weights of the model checkpoint at ../../bertmodel were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Tokenize the example query into PyTorch tensors (a batch of one, as the
# displayed `inputs` shows) and run it through the model.
inputs = tokenizer(e_dict['query'], return_tensors="pt")
outputs = model(**inputs)

In [17]:
# Inspect the tokenizer output: input_ids, token_type_ids and attention_mask.
inputs

{'input_ids': tensor([[ 101, 2001, 1996, 2645, 4061, 1999, 2660, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# Token ids only; the output below has shape (1, 9).
inputs['input_ids']

tensor([[ 101, 2001, 1996, 2645, 4061, 1999, 2660, 1029,  102]])

In [22]:
def padding_seq_to_same_length(input_ids, max_pad_length, pad_token = 0):
    """Force a list of token ids to exactly ``max_pad_length`` entries.

    Sequences that are too short are right-padded with ``pad_token``;
    sequences that are long enough are cut down to their first
    ``max_pad_length`` ids. The caller's list is never modified in place.

    Args:
        input_ids: list of token ids.
        max_pad_length: required output length.
        pad_token: id appended when padding (default 0).

    Returns:
        ``(input_ids, attention_mask)`` — two lists of length
        ``max_pad_length``; the mask is 1 on real tokens and 0 on padding.
    """
    seq_len = len(input_ids)
    shortfall = max_pad_length - seq_len

    if shortfall > 0:
        # Too short: append padding and mark it as not attended.
        input_ids = input_ids + [pad_token] * shortfall
        attention_mask = [1] * seq_len + [0] * shortfall
    else:
        # Exact fit or too long: keep the leading ids, all of them attended.
        input_ids = input_ids[:max_pad_length]
        attention_mask = [1] * max_pad_length

    assert len(input_ids) == max_pad_length
    assert len(attention_mask) == max_pad_length

    return input_ids, attention_mask

In [7]:
import torch
import tqdm
from torch.utils.data import Dataset
import json
from transformers import BertTokenizer


def padding(input_dict: dict, max_pad_len: int, pad_token=0):
    """Pad (or truncate) one tokenized sequence to exactly ``max_pad_len`` tokens.

    Args:
        input_dict: mapping with tensor entries ``'input_ids'`` and
            ``'attention_mask'``; both are flattened to 1-D before padding.
            Any other keys (e.g. ``'token_type_ids'``) are ignored.
        max_pad_len: required output length.
        pad_token: id appended to ``input_ids`` when padding (default 0).

    Returns:
        A dict with 1-D ``torch.long`` tensors ``'input_ids'`` and
        ``'attention_mask'``, each of length ``max_pad_len``. Padded
        positions always get attention-mask value 0, whatever ``pad_token`` is.
    """
    input_ids = input_dict['input_ids'].reshape(-1).long()
    attention_mask = input_dict['attention_mask'].reshape(-1).long()
    padding_len = max_pad_len - input_ids.numel()

    if padding_len <= 0:
        # Already long enough: keep only the leading tokens so every item
        # has the same length and can be stacked into a batch.
        input_ids = input_ids[:max_pad_len]
        attention_mask = attention_mask[:max_pad_len]
    else:
        # new_full / new_zeros inherit dtype and device from the source tensor.
        pad_ids = input_ids.new_full((padding_len,), pad_token)
        # The mask must be extended with zeros, not with pad_token, otherwise
        # a non-zero pad id would mark the padding as attended.
        pad_mask = attention_mask.new_zeros(padding_len)
        input_ids = torch.cat((input_ids, pad_ids), 0)
        attention_mask = torch.cat((attention_mask, pad_mask), 0)

    return {'input_ids': input_ids, 'attention_mask': attention_mask}


class CRDataset(Dataset):
    """Pairs of (query, relevant query) encoded as fixed-length BERT inputs.

    Only records whose ``rel_label`` equals 1 are kept. Queries are padded to
    the longest query kept and relevant queries to the longest relevant query
    kept, so every item on each side has the same length.
    """

    def __init__(self, fpath: str, bpath: str, max_lines=5):
        """
        fpath: dataset file path (one JSON object per line)
        bpath: path stored BertModel and BertTokenizer
        max_lines: read at most this many leading lines of the file; the
            default of 5 keeps the previous hard-coded behaviour, pass None
            to load the whole file
        """
        tokenizer = BertTokenizer.from_pretrained(bpath)
        tokened = []
        max_token_len_q, max_token_len_r = 0, 0

        # Stream the file instead of materialising every line just to keep a few.
        with open(fpath, 'r') as f:
            for line_no, line in enumerate(f):
                if max_lines is not None and line_no >= max_lines:
                    break
                if not line.strip():
                    # A blank (e.g. trailing) line is not a record.
                    continue

                # Record layout:
                # {'id': , 'query': str, 'rel_query': str, 'rel_label': int}
                l_dict = json.loads(line)
                if l_dict['rel_label'] != 1:
                    continue

                q = tokenizer(l_dict['query'], return_tensors='pt')
                r = tokenizer(l_dict['rel_query'], return_tensors='pt')
                tokened.append({
                    'query': q,
                    'rel_query': r
                })
                # input_ids comes back with a leading batch axis; axis 1 is the token count.
                max_token_len_q = max(max_token_len_q, q['input_ids'].size(1))
                max_token_len_r = max(max_token_len_r, r['input_ids'].size(1))

        # Padding has to wait until the maximum lengths over all kept records are known.
        self.q_data = [padding(each['query'], max_token_len_q) for each in tokened]
        self.r_data = [padding(each['rel_query'], max_token_len_r) for each in tokened]

    def __len__(self):
        """Number of kept (query, relevant query) pairs."""
        return len(self.q_data)

    def __getitem__(self, idx):
        """Return the padded ``(query, rel_query)`` pair at position ``idx``."""
        return self.q_data[idx], self.r_data[idx]

In [8]:
# Same paths as in the earlier cells, repeated so this cell does not depend on them.
path = "../dataset/topiocqa_train_rel_label.json"
bert_path = "../../bertmodel"
# Tokenizes and pads the records at construction time.
a = CRDataset(path, bert_path)

In [9]:
from torch.utils.data import DataLoader
# NOTE(review): CRDataset as constructed above reads at most the first 5 lines
# of the file, so batch_size=64 yields a single, smaller batch.
dataloader = DataLoader(a, batch_size=64, shuffle=True)
# Smoke test: encode only the first batch, then stop.
for i in dataloader:
    # Each batch carries the query inputs first and the rel_query inputs second.
    q, r = i[0], i[1]
    # `model` comes from the earlier BertModel cell. NOTE(review): gradients are
    # tracked here (the output below shows grad_fn); consider torch.no_grad()
    # if this is for inspection only.
    o1, o2 = model(**q), model(**r)
    break

In [12]:
# Run the query batch again just to display the raw model output.
model(**q)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0637,  0.2949, -0.1112,  ..., -0.1946,  0.5498,  0.2801],
         [-0.0561, -0.3334, -0.4881,  ..., -0.1952,  0.7908, -0.4225],
         [ 0.5326, -0.1612,  0.1917,  ...,  0.1072,  0.1583,  0.3446],
         ...,
         [-0.0290, -0.2623, -1.0503,  ...,  0.2627,  0.4326, -0.6171],
         [ 0.6830,  0.0588, -0.3071,  ..., -0.0451, -0.6085, -0.1968],
         [ 0.2224,  0.3453,  0.1382,  ...,  0.3099,  0.4229, -0.1855]],

        [[-0.3621, -0.1404, -0.3945,  ..., -0.2740,  0.3580,  0.4217],
         [-0.0554, -1.2396, -0.3728,  ...,  0.1045,  0.2866,  0.3871],
         [-0.8824, -1.0056, -0.4195,  ...,  0.0063, -0.0768, -0.4187],
         ...,
         [-0.3328,  0.3367, -0.3125,  ..., -0.2629, -0.4598, -0.1589],
         [-0.1430, -0.7991, -0.5618,  ..., -0.0060,  0.5460, -0.4408],
         [ 0.6047,  0.0930, -0.4777,  ..., -0.0476, -0.5917, -0.3552]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

In [53]:
# NOTE(review): execution count 53 is out of sequence; this re-displays the
# single-query `inputs` from the earlier tokenizer cell.
inputs

{'input_ids': tensor([[ 101, 2001, 1996, 2645, 4061, 1999, 2660, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [58]:
# NOTE(review): the recorded shape below is (64, 1, 29), with a singleton
# middle axis. `padding` above flattens each item to 1-D before batching, so
# this output presumably predates the current cells — re-run to confirm.
r['input_ids'].shape

torch.Size([64, 1, 29])