In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored in Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import json

def load_triviaqa_to_dataframe(file_path):
    """
    Load a TriviaQA-style JSON file into a pandas DataFrame, one row per question.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded JSON file whose top-level object contains a
        'Data' list of question objects.

    Returns
    -------
    pandas.DataFrame
        One row per entry of 'Data' with the columns:
        - 'id':          the item's 'QuestionId'
        - 'question':    the item's 'Question'
        - 'answer_text': item['Answer']['Value'], or None when unavailable
        - 'context':     'Context' of the first entry in 'EntityPages'; if that
                         yields nothing, 'SearchContext' of the first entry in
                         'SearchResults'; None when neither is present

    Raises
    ------
    FileNotFoundError
        If `file_path` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If 'Data' is missing, or an item lacks 'Question' or 'QuestionId'.

    NOTE(review): in this notebook's recorded output the first rows of the
    unfiltered web files all show context as None, so those files may not carry
    inline text under these keys -- confirm against the dataset schema.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        triviaqa_data = json.load(f)

    # (list key on the item, text key on the document), in priority order:
    # Wikipedia-style evidence first, then web-style evidence.
    evidence_sources = (
        ('EntityPages', 'Context'),
        ('SearchResults', 'SearchContext'),
    )

    # This list will hold our flattened data
    rows_list = []

    # Loop through each question object in the 'Data' list
    for item in triviaqa_data['Data']:
        question = item['Question']
        qa_id = item['QuestionId']

        # Get the answer text. 'Answer' may be absent (test split) or null, so
        # only look up 'Value' when it is actually a mapping.
        answer = item.get('Answer')
        answer_text = answer.get('Value') if isinstance(answer, dict) else None

        # Get the context from the first document of the first source that has
        # one. Each source is tried independently, so an entity page without a
        # 'Context' key no longer prevents the fallback to search results.
        context = None
        for source_key, text_key in evidence_sources:
            documents = item.get(source_key)
            if documents and text_key in documents[0]:
                context = documents[0][text_key]
            if context is not None:
                break

        rows_list.append({
            'id': qa_id,
            'question': question,
            'answer_text': answer_text,
            'context': context,  # Text of the first evidence doc, if any
        })

    # Create the DataFrame
    return pd.DataFrame(rows_list)

# --- Define Your File Path ---

# !! IMPORTANT: Point this at your own copy of the TriviaQA file in Google Drive !!
# (e.g., '/content/drive/MyDrive/wikipedia-train.json' or '/content/drive/MyDrive/web-train.json')
triviaqa_file_path = '/content/drive/MyDrive/LLM-T2/DATASETS/TriviaQA/unfiltered-web-dev.json'


# --- Load and Display TriviaQA Data ---
try:
    triviaqa_df = load_triviaqa_to_dataframe(triviaqa_file_path)

    # Printing options: show every column, and clip long cell text at 80 characters.
    pd.set_option('display.max_colwidth', 80)
    pd.set_option('display.max_columns', None)

    print(f"--- TriviaQA Data ({triviaqa_file_path}) ---")

    # Preview the first 5 rows
    print("\nFirst 5 rows (head):")
    print(triviaqa_df.head())

    # Summarize the columns and their dtypes
    print("\nColumns and Data Types (info):")
    triviaqa_df.info()

except FileNotFoundError:
    # The configured path does not exist on the mounted drive.
    print(
        f"Error: File not found at '{triviaqa_file_path}'",
        "Please update the 'triviaqa_file_path' variable to the correct location.",
        sep="\n",
    )
except Exception as load_error:
    # Anything else (e.g. unexpected JSON layout) is reported rather than raised.
    print(
        f"An error occurred: {load_error}",
        "This might be because the JSON structure is different from the expected format,",
        "or the file path is incorrect.",
        sep="\n",
    )

--- TriviaQA Data (/content/drive/MyDrive/LLM-T2/DATASETS/TriviaQA/unfiltered-web-dev.json) ---

First 5 rows (head):
      id  \
0   tc_2   
1  tc_13   
2  tc_33   
3  tc_40   
4  tc_49   

                                                                question  \
0                                  Who was the man behind The Chipmunks?   
1                                    What star sign is Jamie Lee Curtis?   
2  Which Lloyd Webber musical premiered in the US on 10th December 1993?   
3          Who was the next British Prime Minister after Arthur Balfour?   
4                         Who had a 70s No 1 hit with Kiss You All Over?   

          answer_text context  
0       David Seville    None  
1             Scorpio    None  
2    Sunset Boulevard    None  
3  Campbell-Bannerman    None  
4               Exile    None  

Columns and Data Types (info):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11313 entries, 0 to 11312
Data columns (total 4 columns):
 #   Column       No

In [3]:
import pandas as pd
import json

def load_triviaqa_to_dataframe(file_path):
    """
    Load a TriviaQA-style JSON file into a pandas DataFrame, one row per question.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded JSON file whose top-level object contains a
        'Data' list of question objects.

    Returns
    -------
    pandas.DataFrame
        One row per entry of 'Data' with the columns:
        - 'id':          the item's 'QuestionId'
        - 'question':    the item's 'Question'
        - 'answer_text': item['Answer']['Value'], or None when unavailable
        - 'context':     'Context' of the first entry in 'EntityPages'; if that
                         yields nothing, 'SearchContext' of the first entry in
                         'SearchResults'; None when neither is present

    Raises
    ------
    FileNotFoundError
        If `file_path` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If 'Data' is missing, or an item lacks 'Question' or 'QuestionId'.

    NOTE(review): in this notebook's recorded output the first rows of the
    unfiltered web files all show context as None, so those files may not carry
    inline text under these keys -- confirm against the dataset schema.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        triviaqa_data = json.load(f)

    # (list key on the item, text key on the document), in priority order:
    # Wikipedia-style evidence first, then web-style evidence.
    evidence_sources = (
        ('EntityPages', 'Context'),
        ('SearchResults', 'SearchContext'),
    )

    # This list will hold our flattened data
    rows_list = []

    # Loop through each question object in the 'Data' list
    for item in triviaqa_data['Data']:
        question = item['Question']
        qa_id = item['QuestionId']

        # Get the answer text. 'Answer' may be absent (test split) or null, so
        # only look up 'Value' when it is actually a mapping.
        answer = item.get('Answer')
        answer_text = answer.get('Value') if isinstance(answer, dict) else None

        # Get the context from the first document of the first source that has
        # one. Each source is tried independently, so an entity page without a
        # 'Context' key no longer prevents the fallback to search results.
        context = None
        for source_key, text_key in evidence_sources:
            documents = item.get(source_key)
            if documents and text_key in documents[0]:
                context = documents[0][text_key]
            if context is not None:
                break

        rows_list.append({
            'id': qa_id,
            'question': question,
            'answer_text': answer_text,
            'context': context,  # Text of the first evidence doc, if any
        })

    # Create the DataFrame
    return pd.DataFrame(rows_list)

# --- Define Your File Path ---

# !! IMPORTANT: Point this at your own copy of the TriviaQA file in Google Drive !!
# (e.g., '/content/drive/MyDrive/wikipedia-train.json' or '/content/drive/MyDrive/web-train.json')
triviaqa_file_path = '/content/drive/MyDrive/LLM-T2/DATASETS/TriviaQA/unfiltered-web-test-without-answers.json'


# --- Load and Display TriviaQA Data ---
try:
    triviaqa_df = load_triviaqa_to_dataframe(triviaqa_file_path)

    # Printing options: show every column, and clip long cell text at 80 characters.
    pd.set_option('display.max_colwidth', 80)
    pd.set_option('display.max_columns', None)

    print(f"--- TriviaQA Data ({triviaqa_file_path}) ---")

    # Preview the first 5 rows
    print("\nFirst 5 rows (head):")
    print(triviaqa_df.head())

    # Summarize the columns and their dtypes
    print("\nColumns and Data Types (info):")
    triviaqa_df.info()

except FileNotFoundError:
    # The configured path does not exist on the mounted drive.
    print(
        f"Error: File not found at '{triviaqa_file_path}'",
        "Please update the 'triviaqa_file_path' variable to the correct location.",
        sep="\n",
    )
except Exception as load_error:
    # Anything else (e.g. unexpected JSON layout) is reported rather than raised.
    print(
        f"An error occurred: {load_error}",
        "This might be because the JSON structure is different from the expected format,",
        "or the file path is incorrect.",
        sep="\n",
    )

--- TriviaQA Data (/content/drive/MyDrive/LLM-T2/DATASETS/TriviaQA/unfiltered-web-test-without-answers.json) ---

First 5 rows (head):
      id                                                            question  \
0   tc_7                   Asmara international airport is in which country?   
1  tc_37  At whose concert were 11 people trampled to death in Ohio in 1979?   
2  tc_51       Andy Warhol/'s 60s exhibition featured cans of which product?   
3  tc_59             In which decade of the 20th century was Al Pacino born?   
4  tc_62                San Giusto international airport is in which county?   

  answer_text context  
0        None    None  
1        None    None  
2        None    None  
3        None    None  
4        None    None  

Columns and Data Types (info):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10832 entries, 0 to 10831
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           