# Data Processing

In [1]:
import json         # ms marco dataset in json format
import itertools    # for dictionary slicing
import re           # regular expression
import os           # for file reading and writing

## Development Data

### Dev Data with size of 5000

The $./Data/original\_dataset/dev\_v2.1.json$ file is the original development dataset from the MS MARCO website.
However, it is too large for development on a CPU. For faster computation, use the following cells to build a smaller dev dataset.

In [2]:
# Number of leading entries taken from each field of the full dev set when
# building the reduced file; it is also part of the reduced file's name.
data_size = 5000

In [3]:
# Load the complete MS MARCO dev set into memory.
# The encoding is pinned to UTF-8 (the encoding JSON text is exchanged in)
# instead of relying on open()'s platform/locale-dependent default, so the
# file decodes the same way on every machine.
with open('./Data/original_dataset/dev_v2.1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)    # parsed JSON; later cells index it by field name

In [4]:
def _head(field):
    """Return the first ``data_size`` entries of ``data[field]`` as a new dict."""
    entries = data[field].items()
    return dict(itertools.islice(entries, data_size))


# One reduced mapping per field of the dev set, in the same order as before.
query_id = _head("query_id")
query = _head("query")                          # the questions
query_type = _head("query_type")                # e.g. numeric, description
passages = _head("passages")                    # 10 for each query
answers = _head("answers")
wellFormedAnswers = _head("wellFormedAnswers")  # for intermediate tasks

In [5]:
# Bundle the reduced mappings under the field names they were read from.
json_data = dict(
    query_id=query_id,
    query=query,
    query_type=query_type,
    passages=passages,
    answers=answers,
    wellFormedAnswers=wellFormedAnswers,  # kept for intermediate tasks
)

In [6]:
# Write the reduced dev set to its own file; the file name carries the size.
small_dev_path = './Data/devData_' + str(data_size) + '.json'
with open(small_dev_path, 'w') as dst:
    json.dump(json_data, dst, indent=4, sort_keys=True)

### Dev Data without "No Answer Present."
That is, the answerable examples from the original development dataset.

In [7]:
# Keep only the entries whose answer list is not the "no answer" placeholder.
have_answers = {
    key: value
    for key, value in data["answers"].items()
    if value != ["No Answer Present."]
}


def _answerable_only(field):
    """Return the entries of ``data[field]`` whose key also appears in ``have_answers``."""
    return {key: value for key, value in data[field].items() if key in have_answers}


# Restrict the remaining fields to the same set of keys.
have_query_id = _answerable_only("query_id")
have_query = _answerable_only("query")
have_query_type = _answerable_only("query_type")
have_passages = _answerable_only("passages")

In [8]:
# Bundle the answerable-only mappings under the field names they were read from.
have_answers_data = dict(
    query_id=have_query_id,
    query=have_query,
    query_type=have_query_type,
    passages=have_passages,
    answers=have_answers,
)

In [9]:
# Write the answerable subset of the dev set to its own JSON file.
answerable_path = './Data/devData_answerable.json'
with open(answerable_path, 'w') as dst:
    json.dump(have_answers_data, dst, indent=4, sort_keys=True)

## Prediction data
The following cells should be run after the prediction step described in $README.ipython$. First check that prediction.json is present in $./MS\_marco/$.

#### '$./prediction.json$' file structure

In [16]:
# From './prediction.json', extract the query ids and the predicted answers.
# The parsing below expects, on every line, a single-quoted query id followed
# by an answer that is either single-quoted or double-quoted.
# NOTE(review): the file format is inferred from this parsing code only --
# confirm against the script that writes prediction.json.
with open('./prediction.json', 'r') as pred_f:
    ids = []
    answer = []
    for line in pred_f:
        # a blank line (e.g. a trailing newline at the end of the file) holds
        # no prediction; skip it instead of failing on component[0] below
        if not line.strip():
            continue

        # every single-quoted token on the line (backslash escapes allowed inside)
        component = re.findall(r"'(?:[^'\\]|\\.)*'", line)
        qid = component[0][1:-1]      # drop the surrounding single quotes
        if len(component) > 1:
            # single-quoted answer: the match includes the quotes, strip them
            sent = component[1][1:-1]
        else:
            # double-quoted answer: the capture group already excludes the
            # quotes, so the text must not be trimmed a second time (doing so
            # cut the first and last character off the answer)
            remain = line[line.find(' '):]
            sent = re.findall(r'\"(.+?)\"', remain)[0]

        # reformat the answer with double quotes; single quotes would cause an
        # error when the output is parsed later
        sent = '"' + sent + '"'
        ids.append(qid)
        answer.append(sent)

In [14]:
# Build './Data/reference.json': one JSON object per line with the reference
# answer of every query that has a prediction.
with open('./Data/devData_' + str(data_size) + '.json', 'r') as f:
    ref = json.load(f)
    ref_id = ref["answers"].keys()
    ref_ans = ref["answers"].values()

# only take the reference answers of the queries that actually have been predicted
rid = []
ranswers = []
for pred_id in ids:    # named pred_id so the builtin id() is not shadowed
    rid.append(pred_id)
    ranswers.append(ref["answers"][str(pred_id)])

temp = []
for x, y in zip(rid, ranswers):
    # Only the first reference answer is kept.  json.dumps escapes quotes,
    # backslashes and control characters, so the line stays valid JSON even
    # when the answer text contains them; ensure_ascii=False keeps non-ASCII
    # characters readable instead of turning them into \uXXXX escapes.
    y = json.dumps([str(y[0])], ensure_ascii=False)
    temp.append('{"query_id": %d, "answers": %s}' % (int(x), y))

# Write the reference file.
# The file is opened in 'w' mode, so re-running this cell replaces the previous
# content instead of appending duplicates -- no need to delete the file first.
with open('./Data/reference.json', 'w', encoding='utf-8') as the_file:
    for record in temp:
        the_file.write(record)
        the_file.write('\n')

In [15]:
# Build './Data/candidate.json': one JSON object per line with the predicted
# answer of every query, in the same layout as the reference file.
temp = []
for x, y in zip(ids, answer):
    # each answer is already wrapped in double quotes; wrap it as a one-element list
    # NOTE(review): the answer text is spliced in verbatim, so a double quote or
    # backslash inside it would produce an invalid JSON line -- confirm that the
    # prediction format rules this out.
    y = '[' + y + ']'
    temp.append('{"query_id": %d, "answers": %s}' % (int(x), y))

# Write the candidate file.
# The file is opened in 'w' mode, so re-running this cell replaces the previous
# content instead of appending duplicates -- no need to delete the file first.
with open('./Data/candidate.json', 'w') as the_file:
    for record in temp:
        the_file.write(record)
        the_file.write('\n')