In [7]:
import pandas as pd
import json

def load_squad_to_dataframe(file_path):
    """Flatten a SQuAD-format JSON file into one DataFrame row per question.

    The file is expected to have the nested layout
    ``data -> paragraphs -> qas``; each question becomes one row that repeats
    its article title and paragraph context.

    Parameters
    ----------
    file_path : str or path-like
        Location of the SQuAD JSON file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``id``, ``title``, ``context``, ``question``,
        ``answer_text``, ``is_impossible``. ``answer_text`` is the text of the
        first listed answer, or ``None`` when the question is marked impossible
        or has no answers. The columns are present even when the file contains
        no questions.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If a required key (``data``, ``title``, ``paragraphs``, ``context``,
        ``qas``, ``question``, ``id``) is missing.
    """
    columns = ['id', 'title', 'context', 'question', 'answer_text', 'is_impossible']

    with open(file_path, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    # This list will hold our flattened data
    rows_list = []

    # The JSON is nested, so we need to loop through it
    for topic in squad_data['data']:
        title = topic['title']
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                # Files without the `is_impossible` flag (e.g. SQuAD v1.1
                # layout) are treated as fully answerable.
                is_impossible = qa.get('is_impossible', False)

                # Only answerable questions carry an answer; guard against a
                # missing or empty `answers` list instead of raising.
                answers = qa.get('answers') or []
                answer_text = None
                if not is_impossible and answers:
                    answer_text = answers[0]['text']

                rows_list.append({
                    'id': qa['id'],
                    'title': title,
                    'context': context,
                    'question': qa['question'],
                    'answer_text': answer_text,
                    'is_impossible': is_impossible,
                })

    # Passing `columns` keeps the schema stable even when no rows were found.
    df = pd.DataFrame(rows_list, columns=columns)
    return df


# --- Load and Display Training Data ---
# Raw string: Windows backslashes are taken literally, so sequences such as
# `\L`, `\d`, `\S` are not parsed as (invalid) escapes and `\t` is not turned
# into a tab. The resulting path is unchanged.
train_file_path = r'D:\LLM-Hallucination\data\SQuAD\train-v2.0.json'

train_df = load_squad_to_dataframe(train_file_path)

print(f"--- SQuAD 2.0 Training Data ({train_file_path}) ---")

# Display the first 5 rows
print("\nFirst 5 rows (head):")
pd.set_option('display.max_columns', None)  # Show every column of the frame
pd.set_option('display.max_colwidth', 80) # Truncate long text
train_df.head()








--- SQuAD 2.0 Training Data (D:\LLM-Hallucination\data\SQuAD\train-v2.0.json) ---

First 5 rows (head):


Unnamed: 0,id,title,context,question,answer_text,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4,...",When did Beyonce start becoming popular?,in the late 1990s,False
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4,...",What areas did Beyonce compete in when she was growing up?,singing and dancing,False
2,56be85543aeaaa14008c9066,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4,...",When did Beyonce leave Destiny's Child and become a solo singer?,2003,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4,...",In what city and state did Beyonce grow up?,"Houston, Texas",False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4,...",In which decade did Beyonce become famous?,late 1990s,False


In [5]:
# Summarise the training frame: column names, non-null counts and dtypes.
# Nulls in `answer_text` are expected, because load_squad_to_dataframe stores
# None there for questions flagged `is_impossible`.
print("\nColumns and Data Types (info):")
train_df.info()


Columns and Data Types (info):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130319 non-null  object
 1   title          130319 non-null  object
 2   context        130319 non-null  object
 3   question       130319 non-null  object
 4   answer_text    86821 non-null   object
 5   is_impossible  130319 non-null  bool  
dtypes: bool(1), object(5)
memory usage: 5.1+ MB


In [8]:
# --- Load and Display Development (Dev) Data ---
# Raw string: Windows backslashes are taken literally, so `\L`, `\d` and `\S`
# are not parsed as (invalid) escape sequences. The resulting path is unchanged.
dev_file_path = r'D:\LLM-Hallucination\data\SQuAD\dev-v2.0.json'
dev_df = load_squad_to_dataframe(dev_file_path)

print(f"--- SQuAD 2.0 Development Data ({dev_file_path}) ---")

# Display the first 5 rows
print("\nFirst 5 rows (head):")
dev_df.head()





--- SQuAD 2.0 Development Data (D:\LLM-Hallucination\data\SQuAD\dev-v2.0.json) ---

First 5 rows (head):


Unnamed: 0,id,title,context,question,answer_text,is_impossible
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the ...,In what country is Normandy located?,France,False
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the ...,When were the Normans in Normandy?,10th and 11th centuries,False
2,56ddde6b9a695914005b962a,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the ...,From which countries did the Norse originate?,"Denmark, Iceland and Norway",False
3,56ddde6b9a695914005b962b,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the ...,Who was the Norse leader?,Rollo,False
4,56ddde6b9a695914005b962c,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the ...,What century did the Normans first gain their separate identity?,10th century,False


In [None]:
# Summarise the dev frame via DataFrame.info() (columns and their dtypes),
# mirroring the check done on the training frame.
print("\nColumns and Data Types (info):")
dev_df.info()