In [1]:
import os
import json 
import pandas as pd
import csv
from pathlib import Path
import ast
import random


# Used to identify answer text in original dataset. 
answer_index = {
    "A" : 0,
    "B" : 1,
    "C" : 2,
    "D" : 3,
}


def cmu_conversion_format(parent_dir_path:str) -> None:
    """
Function that takes as input the path to either the Train, Dev or Test Folder of the RACE dataset. It then
performs a search down its subdirectories and finds all text files of the RACE dataset. 

Referring to the format used in Yifan Gao's work, we transform the dataset into a large JSON file containing a list 
of dictionaries. Each of the dictionaries has the following keys: article; question; answer_text; distractor. And each key 
holds one element, i.e. for each dictionary you'll have the true answer and a corresponding distractor. 

For the purpose of structure:
    # INPUT (CMU Dataset) keys: answers,options,questions,article,id
    # OUTPUT (Our New Dataset) keys: article, question, distractor, answer_text
    """
    def sample_random_file(parent_dir_path:str) -> dict:
        """
    Helper function that picks a file at random from the RACE dataset and returns its contents.    
        """
        random_file = random.sample([path for path in Path(parent_dir_path).rglob('*.txt')], 1)
        with open(random_file[0], 'r') as f:
            ctnt = f.read()
            content_dictionary = ast.literal_eval(ctnt)
        return content_dictionary

    def remove_elem_from_list_by_id(input_list:list, idx:int)-> list:
        """
    Helper function that removes an element from a list and returns that updated list. 
    eg. [A, B, C, D] -> Remove element at index 1 -> [A, C, D]
        """
        return input_list[:idx] + input_list[idx+1:]

    f_name = "RACE_"+os.path.basename(parent_dir_path) + "_new.json"
    distractors_list=[]
    with open(f_name, 'w') as f_out:
        f_out.write('[')
        for sample in Path(parent_dir_path).rglob("*.txt"):
            with open(sample, 'r') as f:
                sample_content = f.read()
                sample_dict = ast.literal_eval(sample_content)

                for i in range(len(sample_dict["questions"])):
                    answer_idx = answer_index[sample_dict["answers"][i]]
                    distractors = remove_elem_from_list_by_id(sample_dict["options"][i], answer_idx)
                    for j in range(len(distractors)):
                        one_distractor_dict = {}
                        one_distractor_dict["article"] = sample_dict["article"].replace(" ", ",")
                        one_distractor_dict["question"] = sample_dict["questions"][i].replace(" ", ",")
                        one_distractor_dict["answer_text"] = sample_dict["options"][i][answer_idx].replace(" ", ",")
                        one_distractor_dict["distractor"] = distractors[j].replace(" ", ",")
                        distractors_list.append(one_distractor_dict)
                        f_out.write(json.dumps(one_distractor_dict))
                        f_out.write(',\n')
        f_out.write(']')
        f_out.close()
    df = pd.DataFrame(distractors_list)
    return df






In [2]:
df=cmu_conversion_format("/cluster/home/fgonzalez/datagen/RACE/dev")

In [3]:
df

Unnamed: 0,article,question,answer_text,distractor
0,"Lohri,is,an,agricultural,festival,,filled,with...","All,of,following,are,true,about,Dulha,Bhatti,E...","he,was,the,actor,who,played,Robin,Hood","he,often,helped,poor,people"
1,"Lohri,is,an,agricultural,festival,,filled,with...","All,of,following,are,true,about,Dulha,Bhatti,E...","he,was,the,actor,who,played,Robin,Hood","he,was,a,different,kind,of,thief"
2,"Lohri,is,an,agricultural,festival,,filled,with...","All,of,following,are,true,about,Dulha,Bhatti,E...","he,was,the,actor,who,played,Robin,Hood","he,is,respected,by,people,in,Punjab"
3,"Lohri,is,an,agricultural,festival,,filled,with...","We,know,from,the,passage,that,Lohri,,,_,,,.","has,the,same,name,as,the,munchies,that,the,chi...","has,nothing,to,do,with,agriculture"
4,"Lohri,is,an,agricultural,festival,,filled,with...","We,know,from,the,passage,that,Lohri,,,_,,,.","has,the,same,name,as,the,munchies,that,the,chi...","is,celebrated,both,indoors,and,outdoors"
...,...,...,...,...
14656,"October,1st,,2011\nDear,Ann,\nI,hope,that,you,...","Ann,and,her,children,are,going,to,Linda's,home...","by,train","by,car"
14657,"October,1st,,2011\nDear,Ann,\nI,hope,that,you,...","Ann,and,her,children,are,going,to,Linda's,home...","by,train","on,foot"
14658,"October,1st,,2011\nDear,Ann,\nI,hope,that,you,...","From,the,letter,we,know,that,Tom,loves,,_,,and...","sports;,animals","music;,animals"
14659,"October,1st,,2011\nDear,Ann,\nI,hope,that,you,...","From,the,letter,we,know,that,Tom,loves,,_,,and...","sports;,animals","sports;,dancing"


In [22]:
def main():
    cmu_conversion_format("~/datagen/RACE/dev")
    cmu_conversion_format("~/datagen/RACE/test")
    cmu_conversion_format("~/datagen/RACE/train")

In [23]:
if __name__ == "__main__":
    main()

~/datagen/RACE/dev
~/datagen/RACE/test
~/datagen/RACE/train


In [31]:
parent_dir_path="/cluster/home/fgonzalez/datagen/RACE/dev"

In [33]:

print(parent_dir_path)
for sample in Path(parent_dir_path).rglob("*.txt"):
    print(sample)

/cluster/home/fgonzalez/datagen/RACE/dev
/cluster/home/fgonzalez/datagen/RACE/dev/high/20516.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/8800.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/8982.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/11103.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/11062.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/21782.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/14441.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/10160.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/8789.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/8238.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/11279.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/14197.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/15182.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/9813.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/5870.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/7056.txt
/cluster/home/fgonzalez/datagen/RACE/dev/high/19332.txt
/cluster/home/

In [26]:
!pwd

/cluster/home/fgonzalez/datagen/distraction_generation
