In [2]:
import pandas as pd 
import json

In [3]:
def unpack_json(data, column):
    """Flatten a SQuAD-style frame into parallel lists of answered questions.

    Each cell of positional column ``column`` is read as a mapping with a
    'paragraphs' list; every paragraph carries a 'context' string and a 'qas'
    list of question records.

    Args:
        data: DataFrame whose rows are walked in order via ``iloc``.
        column: Positional (integer) index of the column holding the records.

    Returns:
        A 4-tuple ``(contexts, questions, answers_text, answers_start)`` of
        equally long lists, one entry per question that has at least one
        answer. Only the first listed answer is used; the paragraph context is
        repeated for every answered question it contains.
    """
    contexts, questions, answers_text, answers_start = [], [], [], []

    row_count = data.shape[0]
    articles = (data.iloc[row, column] for row in range(row_count))

    for article in articles:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                # Questions with an empty answer list are skipped entirely.
                if not qa['answers']:
                    continue
                questions.append(qa['question'])
                first_answer = qa['answers'][0]
                answers_start.append(first_answer['answer_start'])
                answers_text.append(first_answer['text'])
                contexts.append(paragraph['context'])

    return contexts, questions, answers_text, answers_start

In [12]:
def build_dataset(contexts, questions, dest_file, drop):
    """Write (context, question) pairs to a GPT-2 style training text file.

    Every pair becomes one line of the form
    ``<|startoftext|> [CONTEXT]: ... [QUESTION]: ... <|endoftext|>``.

    Args:
        contexts: Sequence of context strings; newline characters are removed
            so each sample stays on a single line.
        questions: Sequence of questions, parallel to ``contexts``.
        dest_file: Path of the text file to create (overwritten if present).
        drop: When equal to 1, rows with a repeated context are dropped and
            only the first question per distinct context is kept (used for
            the test files). Any other value keeps every pair.

    Returns:
        None. The result is written to ``dest_file``.
    """
    clean_contexts = [item.replace("\n", "") for item in contexts]

    gpt_data = pd.DataFrame({'context': clean_contexts, 'questions': questions})

    # ie if this is a testing data file then drop duplicate texts
    if drop == 1:
        gpt_data = gpt_data.drop_duplicates(subset='context', keep='first')

    template = '<|startoftext|> [CONTEXT]: {} [QUESTION]: {} <|endoftext|>\n'
    samples = [
        template.format(context, question)
        for context, question in zip(gpt_data['context'], gpt_data['questions'])
    ]

    # The context manager guarantees the file is flushed and closed; the
    # explicit encoding keeps non-ASCII text from depending on the OS locale.
    with open(dest_file, 'w', encoding='utf-8') as f:
        f.writelines(samples)
    

## SQuAD Pre-processing

In [13]:
# Load the raw SQuAD v2.0 files. `with` closes each handle as soon as the JSON
# is parsed, and the explicit encoding avoids depending on the OS locale.
with open('data/SQUAD/train-v2.0.json', encoding='utf-8') as train_file:
    training = json.load(train_file)
train_df = pd.DataFrame(training)

with open('data/SQUAD/dev-v2.0.json', encoding='utf-8') as test_file:
    testing = json.load(test_file)
test_df = pd.DataFrame(testing)

In [14]:
# Flatten the SQuAD training frame. `column` is positional: each cell of
# column 1 is read as a record with a 'paragraphs' key.
contexts, questions, answers_text, answers_start = unpack_json(
    data=train_df,
    column=1,
)

In [15]:
# Write the SQuAD training corpus; drop=0 keeps every (context, question) pair.
build_dataset(
    contexts=contexts,
    questions=questions,
    dest_file="data/squad_train.txt",
    drop=0,
)

In [16]:
# Flatten the SQuAD dev frame. This rebinds the same four names used for the
# training split, so the cell that writes the test file must run after this one.
contexts, questions, answers_text, answers_start = unpack_json(
    data=test_df,
    column=1,
)

In [17]:
# Write the SQuAD test corpus; drop=1 keeps only the first question for each
# distinct context.
build_dataset(
    contexts=contexts,
    questions=questions,
    dest_file="data/squad_test.txt",
    drop=1,
)

## QuAC Pre-processing 

In [18]:
# Load the edited QuAC v0.2 files. `with` closes each handle as soon as the
# JSON is parsed, and the explicit encoding avoids depending on the OS locale.
# NOTE: this rebinds training/train_df/testing/test_df from the SQuAD section.
with open('data/QUAC/train_v0.2_edited.json', encoding='utf-8') as train_file:
    training = json.load(train_file)
train_df = pd.DataFrame(training)

with open('data/QUAC/val_v0.2_edited.json', encoding='utf-8') as test_file:
    testing = json.load(test_file)
test_df = pd.DataFrame(testing)

In [19]:
# Flatten the QuAC training frame. `column` is positional: each cell of
# column 1 is read as a record with a 'paragraphs' key.
contexts, questions, answers_text, answers_start = unpack_json(
    data=train_df,
    column=1,
)

In [20]:
# Write the QuAC training corpus; drop=0 keeps every (context, question) pair.
build_dataset(
    contexts=contexts,
    questions=questions,
    dest_file="data/quac_train.txt",
    drop=0,
)

In [21]:
# Flatten the QuAC validation frame. This rebinds the same four names used for
# the training split, so the cell that writes the test file must run after it.
contexts, questions, answers_text, answers_start = unpack_json(
    data=test_df,
    column=1,
)

In [22]:
# Write the QuAC test corpus; drop=1 keeps only the first question for each
# distinct context.
build_dataset(
    contexts=contexts,
    questions=questions,
    dest_file="data/quac_test.txt",
    drop=1,
)