In [21]:
import os
import random
import argparse
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from transformers import RobertaTokenizer, AdamW, RobertaModel

from sklearn.metrics import classification_report, accuracy_score

from dataset import load_piqa, load_siqa, load_csqa, load_cmqa, load_piqa
from dataset import SocialiqaDataset, CommonsenseqaDataset, CosmosqaDataset, PhysicaliqaDataset
from model import Multiple_Choice_Model
from utils import get_best_model, test


# Seed every random number generator used in this notebook with the same
# value (Python's `random`, torch, then NumPy) so reruns are repeatable.
SEED = 42
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    _seed_fn(SEED)


def parser_args(argv=None):
    """Define and parse the command-line options for a fine-tuning / evaluation run.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            original behaviour. Passing an explicit list allows the function
            to be called from a notebook cell or a test, where ``sys.argv``
            does not hold this script's options.

    Returns:
        argparse.Namespace with the attributes ``cur_dir``, ``root_dir``,
        ``lm``, ``pre_task``, ``cur_task``, ``training_size``,
        ``target_task`` and ``batch_size``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or a
            value is not among the allowed choices.
    """
    parser = argparse.ArgumentParser()

    # Locations: the defaults are machine-specific absolute paths.
    parser.add_argument('--cur_dir', type=str, default='/home/chaehyeong/CKL')
    parser.add_argument('--root_dir', type=str, default='/home/chaehyeong/nas')

    # Model / task selection.
    parser.add_argument('--lm', type=str, required=True, choices=['roberta-large', 'roberta-cskg'], help='Pre-trained LM or KG fine-tuned LM')
    parser.add_argument('--pre_task', type=str, default=None)
    parser.add_argument('--cur_task', type=str, required=True, choices=['siqa', 'csqa', 'cmqa', 'piqa'])
    parser.add_argument('--training_size', type=float, required=True, help='Training data size for fine-tuning LM')
    parser.add_argument('--target_task', type=str, required=True, choices=['siqa', 'csqa', 'cmqa', 'piqa'], help='Which QA dataset to use for evaluating LM')

    parser.add_argument('--batch_size', type=int, required=True)

    # argv=None falls through to sys.argv[1:], exactly as before.
    return parser.parse_args(argv)

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer for the dataset and dataloader code (roberta-large checkpoint).
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

In [24]:
def load_siqa_copy(root_dir, mode):
    """Load one split of the SIQA predictions file as parallel lists.

    Reads ``<root_dir>/data/siqa/<mode>-predictions.jsonl`` (JSON Lines) with
    pandas and pairs each example's text fields with its integer label.

    Args:
        root_dir: Directory that contains the ``data/siqa`` folder.
        mode: Which split to read; must be ``'train'`` or ``'dev'``.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a list of 5-tuples
        ``(context, question, answerA, answerB, answerC)``; ``labels`` is a
        list of ints where ``'A'``, ``'B'``, ``'C'`` in the ``correct`` column
        map to ``0``, ``1``, ``2``.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'dev'``.
        KeyError: If an expected column is missing or a ``correct`` value is
            not one of ``'A'``, ``'B'``, ``'C'``.
    """
    split_files = {
        'train': "train-predictions.jsonl",
        'dev': "dev-predictions.jsonl",
    }
    # Fail with a clear message instead of hitting an unbound `file_path`.
    if mode not in split_files:
        raise ValueError(
            f"mode must be one of {sorted(split_files)}, got {mode!r}"
        )

    file_path = os.path.join(root_dir, 'data', 'siqa', split_files[mode])
    json_file = pd.read_json(path_or_buf=file_path, lines=True)

    # One tuple per example, fields in the order the dataset class expects.
    text_columns = ('context', 'question', 'answerA', 'answerB', 'answerC')
    data = list(zip(*(json_file[column].tolist() for column in text_columns)))

    correct_to_label = {'A': 0, 'B': 1, 'C': 2}
    labels = [correct_to_label[correct] for correct in json_file['correct'].tolist()]

    return data, labels

In [25]:
# Labels for the three answer options (not used further in this cell).
target_names = [
    'Answer A',
    'Answer B',
    'Answer C',
]

# Load the dev split and print its first example followed by its label.
test_texts, test_labels = load_siqa_copy(root_dir='/home/intern/nas', mode='dev')
print(test_texts[0], test_labels[0], sep='\n')

("Tracy didn't go home that evening and resisted Riley's attacks.", 'What does Tracy need to do before this?', 'make a new plan', 'Go home and see Riley', 'Find somewhere to go')
2


In [None]:
class SocialiqaDataset_copy(Dataset):
    """Map-style dataset that pre-tokenizes three-choice QA examples.

    Every example is encoded once in ``__init__`` as three
    (context + sep_token + question, answer) pairs, one per candidate answer,
    and the encodings are kept in memory.

    Args:
        tokenizer: Callable tokenizer exposing a ``sep_token`` attribute; it
            is called with two parallel lists of strings plus
            ``padding=True`` and ``return_tensors='pt'``.
        x: Sequence of tuples ``(context, question, answer1, answer2, answer3)``.
        y: Sequence of indices of the correct answer, aligned with ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
    """

    def __init__(self, tokenizer, x, y):
        # Fail fast: a mismatch would otherwise surface only as an IndexError
        # in __getitem__ (or silently ignore surplus labels).
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y
        self.x_tokenized = [self._encode_example(point) for point in self.x]

    def _encode_example(self, point):
        """Tokenize one example as (context+question, answer) pairs, one per answer."""
        input_answers = [point[2], point[3], point[4]]
        # The same context/question text is paired with every candidate answer.
        context_question = point[0] + self.roberta_tokenizer.sep_token + point[1]
        input_context_question = [context_question] * len(input_answers)
        # padding=True pads only within this example's own answer choices.
        return self.roberta_tokenizer(
            input_context_question,
            input_answers,
            padding=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return ``(encoded_example, label)`` for position ``idx``."""
        return (self.x_tokenized[idx], self.y[idx])

    def __len__(self):
        """Return the number of examples."""
        return len(self.x)

In [None]:
for x in 