# Convert Data into SQuAD 2.0 JSON Format

Written for the SQuAD 2.0 dataset as distributed on Kaggle, which is downloaded as CSV files.

In [None]:
# Notebook shell command: install the kagglehub package that is imported
# further down to download the dataset.
!pip install kagglehub

In [7]:
import os
import re
import json
import csv
import ast
import numpy as np

In [None]:
# Download snippet adapted from the dataset's Kaggle page
import kagglehub

# Fetches the latest version of the dataset; the files arrive as CSV.
# `path` is the local folder the files were downloaded to.
path = kagglehub.dataset_download(
    "thedevastator/squad2-0-a-challenge-for-question-answering-syst"
)

print("Path to dataset files:", path)

In [None]:
# Show what the download folder contains so the CSV file names can be checked
print("Files in dataset folder:")
dataset_entries = os.listdir(path)
print(dataset_entries)

# Functions

In [None]:
def parse_answers_column(answer_str):
    """Parse the stringified ``answers`` cell of one CSV row.

    The cell is expected to hold the printed form of a dict of NumPy arrays,
    for example::

        {'text': array(['x'], dtype=object), 'answer_start': array([3], dtype=int32)}

    Every ``array([...], dtype=...)`` wrapper is rewritten to a plain list
    literal so the string can be parsed with ``ast.literal_eval`` rather
    than ``eval``.

    Args:
        answer_str: Raw string taken from the ``answers`` column.

    Returns:
        A ``(answers, is_impossible)`` tuple:

        * answerable question: a list of ``{"text", "answer_start"}`` dicts
          (one per paired entry) and ``False``;
        * unanswerable question (empty ``text`` list):
          ``[{'text': [], 'answer_start': []}]`` and ``True``;
        * anything that cannot be parsed: ``False`` and ``True``, after the
          offending string and the error have been printed.
    """
    try:
        # Strip the array(...) wrappers. re.DOTALL is required because NumPy
        # wraps long arrays over several lines when printing them; without
        # it "." stops at the first newline, the wrapper is left in place
        # and literal_eval rejects the row.
        cleaned_str = re.sub(
            r"array\(\[(.*?)\](?:,\s*dtype=[^)]+)?\)",
            r"[\1]",
            answer_str,
            flags=re.DOTALL,
        )

        # Convert the string to a dictionary safely (literals only, no code)
        answer_dict = ast.literal_eval(cleaned_str)

        # An empty "text" list marks an unanswerable question.
        # literal_eval only ever produces builtin containers, so checking
        # for list is sufficient here.
        if (
            isinstance(answer_dict, dict)
            and isinstance(answer_dict.get("text"), list)
            and not answer_dict["text"]
        ):
            return [{'text': [], 'answer_start': []}], True

        # Pair each answer text with its start offset (SQuAD-style dicts)
        answers = [
            {"text": text, "answer_start": start}
            for text, start in zip(answer_dict["text"], answer_dict["answer_start"])
        ]
        return answers, False

    except Exception as e:
        # Deliberately broad: one malformed row must not abort the whole
        # conversion. Report it and let the caller skip the row.
        print(f"Error parsing answer string: {answer_str}")
        print(e)
        return False, True

In [None]:
def csv_to_squad_json(csv_filepath,json_filepath):
    """Convert a CSV export of the dataset into a nested JSON file.

    Each CSV row becomes one entry of the top-level ``"data"`` list, holding
    the row's title and a single paragraph (context) with a single question.
    Only the first parsed answer of a row is written, under the keys
    ``"answer"`` and ``"answer_start"``.

    Rows whose ``answers`` cell cannot be parsed (``parse_answers_column``
    returns a falsy first value) are skipped.

    Args:
        csv_filepath: Path of the CSV file to read. The columns ``id``,
            ``title``, ``context``, ``question`` and ``answers`` are used.
        json_filepath: Path of the JSON file to write. Missing parent
            directories are created.

    Returns:
        None. The JSON file is written as a side effect and a summary line
        is printed.
    """
    squad_data = {"data": []}

    # newline='' is what the csv module requires so that newlines embedded
    # in quoted fields are parsed correctly. The encoding is stated
    # explicitly instead of relying on the platform default
    # (assumes the CSV files are UTF-8 - TODO confirm).
    with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
        # Read the CSV as dictionaries keyed by the column headers
        for row in csv.DictReader(csvfile):
            parsed_answers, is_impossible = parse_answers_column(row["answers"])

            # False means the cell could not be parsed; an empty list means
            # no text/start pair was produced. Neither has a first answer,
            # so skip the row instead of failing on parsed_answers[0].
            if not parsed_answers:
                continue

            first_answer = parsed_answers[0]

            # Build the QA entry
            qas = [{
                "id": row["id"],
                "question": row["question"],
                "answer": first_answer['text'],
                "answer_start": first_answer['answer_start'],
                "is_impossible": is_impossible
            }]

            # Wrap it in the title / paragraphs / qas nesting
            squad_data["data"].append({
                "title": row["title"],
                "paragraphs": [{
                    "context": row["context"],
                    "qas": qas
                }]
            })

    # Create the output folder up front so the write cannot fail on a
    # missing directory after all rows have been processed.
    output_dir = os.path.dirname(json_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(json_filepath, 'w', encoding='utf-8') as jsonfile:
        # Indented for readability
        json.dump(squad_data, jsonfile, indent=4)

    print(f"Successfully wrote {len(squad_data['data'])} entries to {json_filepath}")

# Process

In [None]:
# Input CSV files inside the downloaded dataset folder
train_path = os.path.join(path, "train.csv")
validation_path = os.path.join(path, "validation.csv")

# Locations where the converted JSON files will be stored
json_filepath_train = 'Data/train.json'
json_filepath_validate = 'Data/validation.json'

### One-time step - afterwards the data can be read back from the JSON files ###
# csv_to_squad_json writes the JSON file to the given path as a side effect
for csv_source, json_target in (
    (train_path, json_filepath_train),
    (validation_path, json_filepath_validate),
):
    csv_to_squad_json(csv_source, json_target)