# Data Cleaning Script

## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import json

## Set the path to the data file within the project folder structure

In [2]:
# Relative path to the SQuAD v1.1 training json; it is resolved against the
# notebook's current working directory.
data_path = '../data/SQuAD_train-v1.1.json'

## Write a function to convert the json data into a pandas dataframe

In [3]:
def squad_json_to_dataframe(input_file_path, record_path = ['data','paragraphs','qas','answers']):
    """
    Flatten a SQuAD-style json file into a single pandas dataframe.

    Code adapted from the following location: https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']

    Returns a dataframe holding, for every answer record, the question id,
    the question, its context paragraph, the answer fields and `c_id`, an
    integer code shared by all rows that have the same context.

    NOTE: the final column-wise concat aligns rows on the question id, so this
    presumably expects exactly one answer per question (as in the v1.1
    training file) -- confirm before using it on files with several answers
    per question.
    """
    # Context manager guarantees the handle is closed; JSON is read as UTF-8
    # explicitly so the result does not depend on the platform default encoding.
    with open(input_file_path, encoding='utf-8') as json_file:
        squad_json = json.load(json_file)

    # parsing the different levels in the json file:
    # answers (deepest), questions (one level up), paragraphs (two levels up)
    answers_df = pd.json_normalize(squad_json, record_path)
    questions_df = pd.json_normalize(squad_json, record_path[:-1])
    paragraphs_df = pd.json_normalize(squad_json, record_path[:-2])

    # combining it into a single dataframe:
    # repeat each context once per question in its paragraph, and each
    # question id once per answer, so the levels line up row by row
    context_per_question = np.repeat(paragraphs_df['context'].values, paragraphs_df.qas.str.len())
    question_id_per_answer = np.repeat(questions_df['id'].values, questions_df['answers'].str.len())
    questions_df['context'] = context_per_question
    answers_df['q_idx'] = question_id_per_answer

    main = pd.concat(
        [
            questions_df[['id','question','context']].set_index('id'),
            answers_df.set_index('q_idx'),
        ],
        axis=1,
    ).reset_index()

    # identical context paragraphs receive the same integer code
    main['c_id'] = main['context'].factorize()[0]
    return main

In [4]:
# Parse the SQuAD json into a flat dataframe, using the default record_path.
df = squad_json_to_dataframe(data_path)

## Column Explanation:
- index: unique identifier for the question/context/answer pair
- question: question asked about the context
- context: context provided to allow for the question to be answered
- answer_start: numerical index in the context paragraph at which the answer begins
- text: answer to the question given the context
- c_id: a unique id for each context paragraph, where a context paragraph can be used multiple times for different questions

In [5]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


## Investigate the resulting dataframe to clean if necessary

In [6]:
# (number of rows, number of columns) of the parsed dataframe.
df.shape

(87599, 6)

The dataframe has 87,599 rows (one per question/context/answer pair) and 6 columns

In [7]:
# Column dtypes and non-null counts, used to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         87599 non-null  object
 1   question      87599 non-null  object
 2   context       87599 non-null  object
 3   answer_start  87599 non-null  int64 
 4   text          87599 non-null  object
 5   c_id          87599 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 4.0+ MB


There are no null values in the dataframe

In [8]:
# Drop fully duplicated rows. Rebinding to the returned frame (instead of
# inplace=True) keeps the step explicit; the resulting dataframe is the same.
df = df.drop_duplicates()

In [9]:
# Re-inspect the dataframe to compare the row count after dropping duplicates.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         87599 non-null  object
 1   question      87599 non-null  object
 2   context       87599 non-null  object
 3   answer_start  87599 non-null  int64 
 4   text          87599 non-null  object
 5   c_id          87599 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 4.0+ MB


There are no duplicate rows in the dataframe

## Save the resulting, cleaned pandas dataframe

In [10]:
# Persist the dataframe to an HDF5 file, stored under the key 'df'.
# NOTE: pandas' HDF5 support relies on the optional PyTables package.
df.to_hdf('../data/SQuAD_Cleaned_DF.h5', key = 'df')