In [73]:
import os
import pandas as pd
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset
root_dir = '/home/intern/nas'

In [None]:
def load_atomic_answer(root_dir):
    file_train = os.path.join(root_dir, 'data', 'atomic', 'train_adv-answer.jsonl')
    file_dev = os.path.join(root_dir, 'data', 'atomic', 'dev_adv-answer.jsonl')
    json_train = pd.read_json(path_or_buf=file_train, lines=True)
    json_dev = pd.read_json(path_or_buf=file_dev, lines=True)
    train_data = []
    train_labels = json_train['correct'].tolist()

    for context, candidates, labels in zip(json_train['context'].tolist(), json_train['candidates'].tolist(), json_train['correct'].tolist()):
        train_data.append((context, candidates[int(labels)]))
    return train_data

load_atomic_answer(root_dir)

In [None]:
def load_siqa_answersheet(root_dir):
    file_path = os.path.join(root_dir, 'data', 'siqa', "train-predictions.jsonl")
    json_file = pd.read_json(path_or_buf=file_path, lines=True)
    context = [elem for elem in zip(json_file['context'].tolist(), json_file['question'].tolist())]
    answer_candidate = [cand for cand in zip(json_file['answerA'].tolist(), json_file['answerB'].tolist(), json_file['answerC'].tolist())]
    corrects = json_file['correct'].tolist()
    correct_to_label = {'A':0, 'B':1, 'C':2}
    labels = [correct_to_label[correct] for correct in corrects]
    train_data=[]
    assert len(context) == len(answer_candidate) == len(labels)
    for k in range(len(context)):
        answer = answer_candidate[k][labels[k]]
        train_data.append((context[k], answer))

    
    return train_data
load_siqa_answersheet(root_dir)

In [None]:
def load_csqa_answersheet(root_dir):

    file_path = os.path.join(root_dir, 'data', 'csqa', 'train.jsonl')
    json_file = pd.read_json(path_or_buf=file_path, lines=True)
    data = []
    answerkeys = json_file['answerKey'].tolist()
    print(answerkeys)
    answerkey_to_label = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4}
    labels = [answerkey_to_label[answerkey] for answerkey in answerkeys]
    questions = json_file['question'].tolist()
    assert len(questions) == len(labels)
    idx=0
    for sample in json_file['question']:
        question = sample['stem']
        get_idx = labels[idx]
        answer = sample['choices'][get_idx]['text']
        data.append((question, answer))
        idx+=1

    return data

In [52]:
def load_cmqa_answersheet(root_dir):
    file_path = os.path.join(root_dir, 'data', 'cmqa', "train.jsonl")
    json_file = pd.read_json(path_or_buf=file_path, lines=True)
    context = [elem for elem in zip(json_file['context'].tolist(), json_file['question'].tolist())]
    answer_candidate = [cand for cand in zip(json_file['answer0'].tolist(), json_file['answer1'].tolist(), json_file['answer2'].tolist(), json_file['answer3'].tolist())]
    labels = json_file['label'].tolist()
    assert len(context) == len(answer_candidate) == len(labels)
    train_data=[]
    for k in range(len(context)):
        answer = answer_candidate[k][labels[k]]
        train_data.append((context[k], answer))        
    return train_data

In [2]:
def load_piqa_answersheet(root_dir):
    file_path = os.path.join(root_dir, 'data', 'piqa', 'train.jsonl')
    json_file = pd.read_json(path_or_buf=file_path, lines=True)
    context = json_file['goal'].tolist()
    answer_candidate = [cand for cand in zip(json_file['sol1'].tolist(), json_file['sol2'].tolist())]
    labels = json_file['label'].tolist()
    assert len(context) == len(answer_candidate) == len(labels)
    train_data=[]
    for k in range(len(context)):
        answer = answer_candidate[k][labels[k]]
        train_data.append((context[k], answer))        
    return train_data

In [3]:
class BiEncoderDataset(Dataset):
    def __init__(self, tokenizer, input):
        self.roberta_tokenizer = tokenizer
        self.input = input
        self.context_tokenized=[]
        self.answer_tokenized=[]
        for idx in self.input:
            context = idx[0]
            answer = idx[1]
            encoded_context = self.roberta_tokenizer(context, padding=True, truncation=True, return_tensors='pt')
            encoded_answer = self.roberta_tokenizer(answer, padding=True, truncation=True, return_tensors='pt')
            self.context_tokenized.append(encoded_context)
            self.answer_tokenized.append(encoded_answer)

    def __getitem__(self, idx):
        return (self.context_tokenized[idx], self.answer_tokenized[idx])

    def __len__(self):
        return len(self.input)

In [49]:
def biencoder_batch(batch):
    
    batch_size = len(batch)
    contexts, answers = zip(*batch)

    input_ids_context=[]
    attention_mask_context=[]
    max_len_for_context=0
    for context in contexts:
        input_ids_context.append(context['input_ids'][0])
        attention_mask_context.append(context['attention_mask'][0])
        if context['input_ids'][0].shape[0] > max_len_for_context:
            max_len_for_context = context['input_ids'][0].shape[0]

    input_ids_answer=[]
    attention_mask_answer=[]
    max_len_for_answer=0
    for answer in answers:
        input_ids_answer.append(answer['input_ids'][0])
        attention_mask_answer.append(answer['attention_mask'][0])
        if answer['input_ids'][0].shape[0] > max_len_for_answer:
            max_len_for_answer = answer['input_ids'][0].shape[0]

    padded_input_ids_context=[]
    padded_input_ids_answer=[]
    padded_attention_mask_context=[]
    padded_attention_mask_answer=[]
    for input_ids, attention_mask in zip(input_ids_context, attention_mask_context):
        padding_len = max_len_for_context - input_ids.shape[0]
        if padding_len > 0:
            padded_input_ids = torch.cat([input_ids, torch.LongTensor([0] * padding_len)])
            padded_attention_mask = torch.cat([attention_mask, torch.LongTensor([0] * padding_len)])
            padded_input_ids_context.append(padded_input_ids)
            padded_attention_mask_context.append(padded_attention_mask)            
        else:
            padded_input_ids_context.append(input_ids)
            padded_attention_mask_context.append(attention_mask)

    for input_ids, attention_mask in zip(input_ids_answer, attention_mask_answer):
        padding_len = max_len_for_answer - input_ids.shape[0]
        if padding_len > 0:
            padded_input_ids = torch.cat([input_ids, torch.LongTensor([0] * padding_len)])
            padded_attention_mask = torch.cat([attention_mask, torch.LongTensor([0] * padding_len)])
            padded_input_ids_answer.append(padded_input_ids)
            padded_attention_mask_answer.append(padded_attention_mask)            
        else:
            padded_input_ids_answer.append(input_ids)
            padded_attention_mask_answer.append(attention_mask)
            
    

    batch = (torch.stack(padded_input_ids_context), torch.stack(padded_input_ids_answer), torch.stack(padded_attention_mask_context), torch.stack(padded_attention_mask_answer))
    return batch

In [33]:
import os
import random
import argparse
import numpy as np

import torch
from torch import nn
from torch.utils.data import DataLoader

from transformers import RobertaTokenizer, RobertaModel, AdamW

from sklearn.model_selection import train_test_split
from model import BiEncoder
from utils import Trainer

random.seed(42)
torch.manual_seed(42)
np.random.seed(42)
root_dir = '/home/intern/nas'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
train_data = load_piqa_answersheet(root_dir)
train_data, valid_data = train_test_split(train_data, train_size=9000, random_state=42)
train_dataset = BiEncoderDataset(tokenizer, train_data)
valid_dataset = BiEncoderDataset(tokenizer, valid_data)
print('Train Set:', len(train_dataset), 'Valid Set:', len(valid_dataset))

Train Set: 9000 Valid Set: 7113


In [43]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=lambda batch: biencoder_batch(batch))
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=True, collate_fn=lambda batch: biencoder_batch(batch))

In [50]:
for batch in train_loader:
    break

In [74]:
class BiEncoder(nn.Module):
    def __init__(self, context_bert_model, response_bert_model):
        super(BiEncoder, self).__init__()
        self.context_bert = context_bert_model
        self.response_bert = response_bert_model
    def forward(self, context_input_ids, context_input_masks,
                            responses_input_ids, responses_input_masks, labels=None):

        context = self.context_bert(input_ids = context_input_ids, attention_mask = context_input_masks)

        # print(output[1])

        response = self.response_bert(input_ids = responses_input_ids, attention_mask = responses_input_masks)

        context_vector = context[1]
        response_vector = response[1]
        dot_product = torch.matmul(context_vector, response_vector.t())  # [bs, bs]
        mask = torch.eye(context_input_ids.size(0)).to(context_input_ids.device)
        loss = F.log_softmax(dot_product, dim=-1) * mask
        loss = (-loss.sum(dim=1)).mean()
        return loss


In [75]:
roberta_model_1 = RobertaModel.from_pretrained('roberta-large')
roberta_model_2 = RobertaModel.from_pretrained('roberta-large')
model = BiEncoder(roberta_model_1, roberta_model_2)

for batch in train_loader:
    input_ids_context, input_ids_answer, attention_mask_context, attention_mask_answer =batch
    output = model(context_input_ids=input_ids_context, context_input_masks = attention_mask_context,
                    responses_input_ids=input_ids_answer, responses_input_masks=attention_mask_answer,
                    labels=None)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaM

torch.Size([8, 21])
torch.Size([8, 73])
tensor([[ 0.1550,  0.7119,  0.4564,  ...,  0.0052,  0.4077, -0.3467],
        [ 0.1492,  0.7084,  0.4518,  ...,  0.0020,  0.4129, -0.3730],
        [ 0.1643,  0.7136,  0.4519,  ..., -0.0219,  0.4205, -0.3409],
        ...,
        [ 0.1597,  0.7124,  0.4698,  ...,  0.0025,  0.3666, -0.3821],
        [ 0.1541,  0.7165,  0.4280,  ..., -0.0080,  0.4142, -0.3530],
        [ 0.1356,  0.6615,  0.5050,  ..., -0.0038,  0.3687, -0.3477]],
       grad_fn=<TanhBackward>)
torch.Size([8, 1024])
torch.Size([8, 1024])
torch.Size([8, 20])
torch.Size([8, 30])
tensor([[ 0.1292,  0.6961,  0.4832,  ..., -0.0157,  0.4267, -0.3728],
        [ 0.1586,  0.7525,  0.3766,  ...,  0.1652,  0.2875, -0.1965],
        [ 0.1355,  0.6739,  0.4920,  ...,  0.0075,  0.3658, -0.3775],
        ...,
        [ 0.1311,  0.6977,  0.4685,  ..., -0.0158,  0.4219, -0.3668],
        [ 0.1211,  0.7507,  0.3568,  ...,  0.1454,  0.3151, -0.1933],
        [ 0.1690,  0.7148,  0.4312,  ...,  0.019

KeyboardInterrupt: 

In [46]:
a=[[1,2,3],[4,5,6]]
b = torch.stack(a)
print(b)

TypeError: expected Tensor as element 0 in argument 0, but got list