In [32]:
import torch
import json
import os
from collections import Counter

In [12]:
def preprocess_multirc(in_path, out_path, split):
    """Flatten one MultiRC split into four line-aligned text files.

    Reads ``<in_path>/<split>.jsonl`` and emits one row per
    (passage, question, answer) triple. Row ``i`` of every output file
    describes the same triple:

    * ``<out_path>/input0/<name>`` - passage text
    * ``<out_path>/input1/<name>`` - question text
    * ``<out_path>/input2/<name>`` - answer text
    * ``<out_path>/label/<name>``  - ``str()`` of the answer's label

    ``<name>`` is ``split``, except that ``'val'`` is written as ``'valid'``.

    Args:
        in_path: Directory containing ``<split>.jsonl``.
        out_path: Directory that receives the four sub-directories; they are
            created if missing.
        split: Split name, e.g. ``'train'`` or ``'val'``.

    Raises:
        KeyError: If a record lacks one of the fields read below (for example
            an answer without ``'label'``).
    """
    records = []
    with open(os.path.join(in_path, split + '.jsonl'), 'r', encoding='utf-8') as f:
        for raw in f:
            # A blank (e.g. trailing) line is not a JSON document; skip it
            # instead of letting json.loads raise.
            if raw.strip():
                records.append(json.loads(raw))

    # Insertion order of this dict is also the order the files are written in.
    columns = {'input0': [], 'input1': [], 'input2': [], 'label': []}
    for record in records:
        passage = record["passage"]["text"]
        for question_dict in record["passage"]["questions"]:
            question = question_dict["question"]
            for answer_dict in question_dict["answers"]:
                # The passage and question are repeated for every answer so
                # that all four files stay aligned line by line.
                columns['input0'].append(passage)
                columns['input1'].append(question)
                columns['input2'].append(answer_dict["text"])
                columns['label'].append(str(answer_dict['label']))

    out_name = 'valid' if split == 'val' else split
    for subdir, values in columns.items():
        target_dir = os.path.join(out_path, subdir)
        # Create the directory here so the function does not depend on the
        # caller having prepared the whole output tree beforehand.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, out_name), 'w', encoding='utf-8') as f:
            for value in values:
                f.write(value + '\n')
        
# Convert the MultiRC train and validation splits into the `processed/` layout.
root_path = '../../transformer/datasets/superglue/MultiRC/'
output_path = os.path.join(root_path, 'processed')
# makedirs(exist_ok=True) is applied per sub-directory so the cell can be
# re-run safely and also recovers when `processed/` exists but one of its
# sub-directories is missing (the previous guard only checked `processed/`).
for subdir in ('input0', 'input1', 'input2', 'label'):
    os.makedirs(os.path.join(output_path, subdir), exist_ok=True)
preprocess_multirc(root_path, output_path, 'train')
preprocess_multirc(root_path, output_path, 'val')

In [48]:
def preprocess_cb(in_path, out_path, split):
    """Split one CB jsonl file into line-aligned premise/hypothesis/label files.

    Reads ``<in_path>/<split>.jsonl`` and writes, one record per line:

    * ``<out_path>/input0/<name>`` - the ``premise`` field
    * ``<out_path>/input1/<name>`` - the ``hypothesis`` field
    * ``<out_path>/label/<name>``  - ``str()`` of the ``label`` field

    ``<name>`` is ``split``, except that ``'val'`` is written as ``'valid'``.
    For the ``'train'`` split, ``<out_path>/label/dict.txt`` is also written
    with one ``"<label> <count>"`` line per distinct label, in order of first
    appearance.

    Args:
        in_path: Directory containing ``<split>.jsonl``.
        out_path: Directory that receives the sub-directories; they are
            created if missing.
        split: Split name, e.g. ``'train'`` or ``'val'``.

    Raises:
        KeyError: If a record lacks ``premise``, ``hypothesis`` or ``label``.
    """
    records = []
    with open(os.path.join(in_path, split + '.jsonl'), 'r', encoding='utf-8') as f:
        for raw in f:
            # Skip blank lines rather than failing in json.loads.
            if raw.strip():
                records.append(json.loads(raw))

    premises = [record["premise"] for record in records]
    hypotheses = [record["hypothesis"] for record in records]
    labels = [record["label"] for record in records]

    out_name = 'valid' if split == 'val' else split
    outputs = (('input0', premises), ('input1', hypotheses), ('label', labels))
    for subdir, values in outputs:
        target_dir = os.path.join(out_path, subdir)
        # Do not rely on the caller having created the output tree.
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, out_name), 'w', encoding='utf-8') as f:
            for value in values:
                f.write(str(value) + '\n')

    if out_name == 'train':
        with open(os.path.join(out_path, 'label', 'dict.txt'), 'w', encoding='utf-8') as f:
            for label, count in Counter(labels).items():
                # str(label) keeps this working for non-string labels, matching
                # how BoolQ_to_tsv writes its dictionary file.
                f.write(str(label) + " " + str(count) + '\n')
# Convert the CB train and validation splits into the `processed/` layout.
root_path = '../../transformer/datasets/superglue/CB/'
output_path = os.path.join(root_path, 'processed')
# Create each sub-directory idempotently; the previous guard skipped all of
# them whenever `processed/` already existed, even if a sub-directory was missing.
for subdir in ('input0', 'input1', 'label'):
    os.makedirs(os.path.join(output_path, subdir), exist_ok=True)
preprocess_cb(root_path, output_path, 'train')
preprocess_cb(root_path, output_path, 'val')

In [47]:
def cb_to_tsv(in_path, split):
    """Convert one CB jsonl split to a TSV file plus a label file.

    Reads ``<in_path>/<split>.jsonl`` and writes, all inside ``in_path``:

    * ``<tsv>.tsv`` with a header row and one ``premise<TAB>hypothesis``
      row per record, followed by ``<TAB>label`` unless ``split`` is
      ``'test'``. ``<tsv>`` is ``split``, with ``'val'`` renamed to ``'dev'``.
    * ``label/<name>`` with one label per line (empty for ``'test'``, whose
      labels are never read). ``<name>`` is ``<tsv>``, with ``'dev'`` renamed
      to ``'valid'``.
    * ``label/dict.txt`` with ``"<label> <count>"`` lines, only for ``'train'``.

    Field values are written verbatim; tabs or newlines inside a premise or
    hypothesis are not escaped.

    Args:
        in_path: Directory holding the jsonl input; outputs go here too.
        split: ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        KeyError: If a record lacks a field that is read for this split.
    """
    records = []
    with open(os.path.join(in_path, split + '.jsonl'), 'r', encoding='utf-8') as f:
        for raw in f:
            # Skip blank lines rather than failing in json.loads.
            if raw.strip():
                records.append(json.loads(raw))

    has_labels = split != 'test'
    premises = [record["premise"] for record in records]
    hypotheses = [record["hypothesis"] for record in records]
    labels = [record["label"] for record in records] if has_labels else []

    tsv_name = 'dev' if split == 'val' else split
    with open(os.path.join(in_path, tsv_name + '.tsv'), 'w', encoding='utf-8') as f:
        if has_labels:
            f.write('premise\thypothesis\tlabel\n')
            for premise, hypothesis, label in zip(premises, hypotheses, labels):
                # str(label): labels that are not already strings used to
                # raise TypeError here; BoolQ_to_tsv converts the same way.
                f.write(premise + '\t' + hypothesis + '\t' + str(label) + '\n')
        else:
            f.write('premise\thypothesis\n')
            for premise, hypothesis in zip(premises, hypotheses):
                f.write(premise + '\t' + hypothesis + '\n')

    label_name = 'valid' if tsv_name == 'dev' else tsv_name
    label_path = os.path.join(in_path, 'label')
    os.makedirs(label_path, exist_ok=True)
    with open(os.path.join(label_path, label_name), 'w', encoding='utf-8') as f:
        for label in labels:
            f.write(str(label) + '\n')

    if label_name == 'train':
        with open(os.path.join(label_path, 'dict.txt'), 'w', encoding='utf-8') as f:
            for label, count in Counter(labels).items():
                f.write(str(label) + " " + str(count) + '\n')
                
# Export every CB split (train, val, test) to TSV, in that order.
root_path = '../../transformer/datasets/superglue/CB/'
for split_name in ('train', 'val', 'test'):
    cb_to_tsv(root_path, split_name)

In [37]:
def COPA_to_tsv(in_path, split):
    """Convert one COPA jsonl split to a TSV of candidate sentences.

    Every record yields two sentences, one per choice, built as
    ``premise[:-1] + ' ' + connective + ' ' + choice`` where the connective
    is ``'because'`` for ``question == 'cause'`` and ``'so'`` for
    ``question == 'effect'``. Both sentences carry the record's label.

    Written inside ``in_path``:

    * ``<tsv>.tsv`` with header ``sentence<TAB>label`` (just ``sentence`` for
      ``'test'``) and one row per sentence. ``<tsv>`` is ``split``, with
      ``'val'`` renamed to ``'dev'``.
    * ``label/<name>`` with one label per sentence (empty for ``'test'``).
      ``<name>`` is ``<tsv>``, with ``'dev'`` renamed to ``'valid'``.
    * ``label/dict.txt`` with ``"<label> <count>"`` lines, only for ``'train'``.

    NOTE(review): the previous body wrote rows from undefined names
    (``ps``/``hs``) under a premise/hypothesis header copied from the CB
    converter, so it could not run. The sentence/label layout used here is
    the assumed intent - confirm against the downstream consumer.

    Args:
        in_path: Directory holding the jsonl input; outputs go here too.
        split: ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        KeyError: If a record lacks a required field or has a ``question``
            value other than ``'cause'`` / ``'effect'``.
    """
    connectives = {'cause': 'because', 'effect': 'so'}
    # Labels are only read for labelled splits; reading them unconditionally
    # raised KeyError on records without a 'label' field.
    has_labels = split != 'test'

    sentences, labels = [], []
    with open(os.path.join(in_path, split + '.jsonl'), 'r', encoding='utf-8') as f:
        for raw in f:
            # Skip blank lines rather than failing in json.loads.
            if not raw.strip():
                continue
            example = json.loads(raw)
            # [:-1] drops the premise's last character - presumably its
            # trailing punctuation; TODO confirm.
            stem = example['premise'][:-1] + ' ' + connectives[example['question']] + ' '
            for choice in (example['choice1'], example['choice2']):
                sentences.append(stem + choice)
                if has_labels:
                    # Repeated per choice so labels stay aligned with sentences.
                    labels.append(example['label'])

    tsv_name = 'dev' if split == 'val' else split
    with open(os.path.join(in_path, tsv_name + '.tsv'), 'w', encoding='utf-8') as f:
        if has_labels:
            f.write('sentence\tlabel\n')
            for sentence, label in zip(sentences, labels):
                f.write(sentence + '\t' + str(label) + '\n')
        else:
            f.write('sentence\n')
            for sentence in sentences:
                f.write(sentence + '\n')

    label_name = 'valid' if tsv_name == 'dev' else tsv_name
    label_path = os.path.join(in_path, 'label')
    os.makedirs(label_path, exist_ok=True)
    with open(os.path.join(label_path, label_name), 'w', encoding='utf-8') as f:
        for label in labels:
            f.write(str(label) + '\n')

    if label_name == 'train':
        with open(os.path.join(label_path, 'dict.txt'), 'w', encoding='utf-8') as f:
            for label, count in Counter(labels).items():
                # str(label): concatenating a non-string label raised TypeError.
                f.write(str(label) + " " + str(count) + '\n')
                
# Re-export the CB splits (train, val, test) to TSV.
# NOTE(review): this cell defines COPA_to_tsv but then runs cb_to_tsv on the
# CB directory - looks like a copy-paste leftover; confirm whether the COPA
# dataset and COPA_to_tsv were intended here.
root_path = '../../transformer/datasets/superglue/CB/'
for split_name in ('train', 'val', 'test'):
    cb_to_tsv(root_path, split_name)

Counter({'a': 2, 'b': 1})

In [51]:
def BoolQ_to_tsv(in_path, split):
    """Convert one BoolQ jsonl split to a TSV file plus a label file.

    Reads ``<in_path>/<split>.jsonl`` and writes, all inside ``in_path``:

    * ``<tsv>.tsv`` - header plus one ``question<TAB>passage`` row per record,
      with a trailing ``<TAB>label`` column unless ``split`` is ``'test'``.
      ``<tsv>`` is ``split``, with ``'val'`` renamed to ``'dev'``.
    * ``label/<name>`` - ``str()`` of each label, one per line (empty for
      ``'test'``). ``<name>`` is ``<tsv>``, with ``'dev'`` renamed to ``'valid'``.
    * ``label/dict.txt`` - ``"<label> <count>"`` lines, only for ``'train'``.

    Args:
        in_path: Directory holding the jsonl input; outputs go here too.
        split: ``'train'``, ``'val'`` or ``'test'``.
    """
    source_file = os.path.join(in_path, split + '.jsonl')
    with open(source_file, 'r', encoding='utf-8') as f:
        records = [json.loads(raw) for raw in f.readlines()]

    questions = [record["question"] for record in records]
    passages = [record["passage"] for record in records]
    # Labels are never read for the test split.
    labels = [record["label"] for record in records] if split != 'test' else []

    tsv_stem = 'dev' if split == 'val' else split
    with open(os.path.join(in_path, tsv_stem + '.tsv'), 'w', encoding='utf-8') as f:
        if tsv_stem == 'test':
            f.write('question\tpassage\n')
            f.writelines(
                question + '\t' + passage + '\n'
                for question, passage in zip(questions, passages)
            )
        else:
            f.write('question\tpassage\tlabel\n')
            f.writelines(
                question + '\t' + passage + '\t' + str(label) + '\n'
                for question, passage, label in zip(questions, passages, labels)
            )

    label_stem = 'valid' if tsv_stem == 'dev' else tsv_stem
    label_dir = os.path.join(in_path, 'label')
    if not os.path.exists(label_dir):
        os.mkdir(label_dir)
    with open(os.path.join(label_dir, label_stem), 'w', encoding='utf-8') as f:
        f.writelines(str(label) + '\n' for label in labels)

    if label_stem == 'train':
        with open(os.path.join(label_dir, 'dict.txt'), 'w', encoding='utf-8') as f:
            # Counter keeps labels in order of first appearance.
            for value, count in Counter(labels).items():
                f.write(str(value) + " " + str(count) + '\n')
                
# Export every BoolQ split (train, val, test) to TSV, in that order.
root_path = '../../transformer/datasets/superglue/BoolQ/'
for split_name in ('train', 'val', 'test'):
    BoolQ_to_tsv(root_path, split_name)

In [31]:
# NOTE(review): BoolQ_to_tsv is also defined in an earlier cell of this
# notebook; when cells run top to bottom this later definition is the one in
# effect. Consider keeping a single definition.
def BoolQ_to_tsv(in_path, split):
    """Convert one BoolQ jsonl split to a TSV file plus a label file.

    Reads ``<in_path>/<split>.jsonl`` and writes, all inside ``in_path``:

    * ``<tsv>.tsv`` - header plus one ``question<TAB>passage`` row per record,
      with a trailing ``<TAB>label`` column unless ``split`` is ``'test'``.
      ``<tsv>`` is ``split``, with ``'val'`` renamed to ``'dev'``.
    * ``label/<name>`` - ``str()`` of each label, one per line (empty for
      ``'test'``, whose labels are never read). ``<name>`` is ``<tsv>``, with
      ``'dev'`` renamed to ``'valid'``.
    * ``label/dict.txt`` - ``"<label> <count>"`` lines, only for ``'train'``.

    Field values are written verbatim; tabs or newlines inside a question or
    passage are not escaped.

    Args:
        in_path: Directory holding the jsonl input; outputs go here too.
        split: ``'train'``, ``'val'`` or ``'test'``.

    Raises:
        KeyError: If a record lacks a field that is read for this split.
    """
    records = []
    with open(os.path.join(in_path, split + '.jsonl'), 'r', encoding='utf-8') as f:
        for raw in f:
            # A blank (e.g. trailing) line is not a JSON document; skip it
            # instead of letting json.loads raise.
            if raw.strip():
                records.append(json.loads(raw))

    has_labels = split != 'test'
    questions = [record["question"] for record in records]
    passages = [record["passage"] for record in records]
    labels = [record["label"] for record in records] if has_labels else []

    tsv_name = 'dev' if split == 'val' else split
    with open(os.path.join(in_path, tsv_name + '.tsv'), 'w', encoding='utf-8') as f:
        if has_labels:
            f.write('question\tpassage\tlabel\n')
            for question, passage, label in zip(questions, passages, labels):
                f.write(question + '\t' + passage + '\t' + str(label) + '\n')
        else:
            f.write('question\tpassage\n')
            for question, passage in zip(questions, passages):
                f.write(question + '\t' + passage + '\n')

    label_name = 'valid' if tsv_name == 'dev' else tsv_name
    label_path = os.path.join(in_path, 'label')
    # exist_ok avoids the separate exists() check and is safe to re-run.
    os.makedirs(label_path, exist_ok=True)
    with open(os.path.join(label_path, label_name), 'w', encoding='utf-8') as f:
        for label in labels:
            f.write(str(label) + '\n')

    if label_name == 'train':
        with open(os.path.join(label_path, 'dict.txt'), 'w', encoding='utf-8') as f:
            for label, count in Counter(labels).items():
                f.write(str(label) + " " + str(count) + '\n')
                
# Run the BoolQ conversion for each split: train, val, then test.
root_path = '../../transformer/datasets/superglue/BoolQ/'
for split_name in ('train', 'val', 'test'):
    BoolQ_to_tsv(root_path, split_name)

array([12, 14, 14])