In [None]:
# Switch the kernel's working directory to the project root so that the relative
# paths used below (data/raw/..., src/...) resolve, then echo the directory.
# NOTE(review): the %cd target is relative, so re-running this cell after it has
# already succeeded will presumably fail unless a nested directory of the same
# name exists -- confirm, or guard it.
%cd graph-enhanced-retrieval-qa
!pwd

In [26]:
import json
import pandas as pd
from pprint import pprint 

# Location of the raw training split (JSON), relative to the current working directory.
DATA_PATH = 'data/raw/train.json'

In [33]:
# Read the whole training split into memory as parsed JSON.
# The encoding is pinned to UTF-8 because the passages contain non-ASCII
# characters; without it, open() falls back to the platform's locale encoding
# and may fail or mis-decode the text on systems whose default is not UTF-8.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)
print(f"Total number of samples in the dataset: {len(dataset)}")

Total number of samples in the dataset: 167454


In [36]:
# Pull out one fixed example (index 122347) so the raw record layout can be
# inspected: the question, the supporting facts, and the first context entries.
sample = dataset[122347]

# Header and question printed in a single call, one per line.
print("--- QUESTION ---", sample['question'], sep="\n")
print("\n--- SUPPORTING FACTS (Ground Truth) ---")
pprint(sample['supporting_facts'])
print("\n--- CONTEXT (First 3 passages) ---")
pprint(sample['context'][0:3])

--- QUESTION ---
Are both Kolyai, Kermanshah and Kharabeh-Ye Senji located in the same country?

--- SUPPORTING FACTS (Ground Truth) ---
[['Kolyai, Kermanshah', 0], ['Kharabeh-ye Senji', 0]]

--- CONTEXT (First 3 passages) ---
[['Kermanshah Airport',
  ['Shahid Ashrafi Esfahani Airport is an airport in Kermanshah, Iran.',
   'It serves the city of Kermanshah and the surrounding areas with daily '
   'domestic and seasonal international destinations.',
   'It is located in the eastern part of the city and shares its land with the '
   'Havanirooz 1st Combat Base.']],
 ['Kharabeh-ye Senji',
  ['Kharabeh- ye Senji( also Romanized as Kharābeh- ye Senjī; also known as '
   'Kharābeh) is a village in Nazluchay Rural District, Nazlu District, Urmia '
   'County, West Azerbaijan Province, Iran.',
   'At the 2006 census, its population was 681, in 95 families.']],
 ['Kharabeh-ye Sadat',
  ['Kharabeh- ye Sadat( also Romanized as Kharābeh- ye Sādāt) is a village in '
   'Saidiyeh Rural District, 

In [37]:
def process_context(context):
    """
    Turn a sample's raw context into parallel lists of titles and passages.

    Each context entry is a (title, sentences) pair. The sentences of an entry
    are joined with a single space into one passage. Entries whose sentence
    list is empty are dropped entirely, so both returned lists stay aligned.

    Args:
        context (list): The 'context' field from a single data sample, i.e. an
            iterable of (title, list-of-sentences) pairs.

    Returns:
        tuple: A tuple containing:
            - passage_titles (list): Titles of the entries that were kept.
            - passages (list): The merged passage text for each kept title,
              in the same order.
    """
    # Keep only entries that actually have sentences (skips empty sentence lists).
    kept_entries = [(title, sentences) for title, sentences in context if sentences]

    passage_titles = [title for title, _ in kept_entries]
    passages = [" ".join(sentences) for _, sentences in kept_entries]

    return passage_titles, passages

# Let's test our function on the sample
titles, texts = process_context(sample['context'])
print("--- PROCESSED CONTEXT ---")
print(f"Found {len(titles)} passages.")

# Show up to the first two passages. Slicing (instead of indexing [0] and [1])
# keeps this cell from raising IndexError on a sample that yields fewer than
# two non-empty passages, and removes the copy-pasted print pairs.
for position, (title, text) in enumerate(zip(titles[:2], texts[:2]), start=1):
    print(f"\nExample Title {position}:", title)
    print(f"Example Passage {position}:", text)

--- PROCESSED CONTEXT ---
Found 10 passages.

Example Title 1: Kermanshah Airport
Example Passage 1: Shahid Ashrafi Esfahani Airport is an airport in Kermanshah, Iran. It serves the city of Kermanshah and the surrounding areas with daily domestic and seasonal international destinations. It is located in the eastern part of the city and shares its land with the Havanirooz 1st Combat Base.

Example Title 2: Kharabeh-ye Senji
Example Passage 2: Kharabeh- ye Senji( also Romanized as Kharābeh- ye Senjī; also known as Kharābeh) is a village in Nazluchay Rural District, Nazlu District, Urmia County, West Azerbaijan Province, Iran. At the 2006 census, its population was 681, in 95 families.


In [39]:
def get_ground_truth_titles(supporting_facts):
    """
    Collect the distinct titles referenced by a sample's supporting facts.

    Args:
        supporting_facts (list): The 'supporting_facts' field from a single
            sample; each item is a (title, sentence index) pair.

    Returns:
        set: The unique titles, i.e. the passages a retriever should find.
    """
    unique_titles = set()
    # Only the title matters here; the sentence index is unpacked and ignored.
    for title, _sentence_index in supporting_facts:
        unique_titles.add(title)
    return unique_titles

# Titles the retriever is expected to find for the inspected sample.
ground_truth_titles = get_ground_truth_titles(sample['supporting_facts'])
print("--- GROUND TRUTH TITLES ---", ground_truth_titles, sep="\n")

--- GROUND TRUTH TITLES ---
{'Kharabeh-ye Senji', 'Kolyai, Kermanshah'}


In [40]:
# Sanity check: every ground-truth title must still exist after preprocessing,
# otherwise the retrieval target would be unreachable for this sample.
all_processed_titles = set(titles)

# Set comparison: True when each ground-truth title is among the processed ones.
is_subset = ground_truth_titles <= all_processed_titles

print(f"Are all ground truth titles present in the processed context? {is_subset}")

# Report exactly which titles were lost, if any.
if not is_subset:
    missing_titles = ground_truth_titles.difference(all_processed_titles)
    print(f"Missing titles: {missing_titles}")

Are all ground truth titles present in the processed context? True


In [42]:
from src.data_loader import load_dataset, process_sample

# Load the entire dataset once
# NOTE(review): this reads the dev split, while the earlier cells explored the
# train split through DATA_PATH -- confirm the switch is intentional.
full_dataset = load_dataset('data/raw/dev.json')

# Get a single raw sample
raw_sample = full_dataset[0]

# Process it with ONE function call
# (process_sample is imported from src.data_loader; presumably it bundles the
# context-merging and ground-truth-title steps prototyped above -- verify there.)
processed_data = process_sample(raw_sample)

# Now you can easily access the clean data
# The result is indexed by 'question', 'passages' and 'ground_truth_titles';
# 'passages' exposes .items(), of which only the first two pairs are shown.
print("QUESTION:", processed_data['question'])
print("\nPASSAGES (first 2):", list(processed_data['passages'].items())[:2])
print("\nGROUND TRUTH:", processed_data['ground_truth_titles'])

QUESTION: Who is the mother of the director of film Polish-Russian War (Film)?

PASSAGES (first 2): [('Xawery Żuławski', 'Xawery Żuławski (born 22 December 1971 in Warsaw) is a Polish film director. In 1995 he graduated National Film School in Łódź. He is the son of actress Małgorzata Braunek and director Andrzej Żuławski. His second feature "Wojna polsko-ruska" (2009), adapted from the controversial best-selling novel by Dorota Masłowska, won First Prize in the New Polish Films competition at the 9th Era New Horizons Film Festival in Wrocław. In 2013, he stated he intends to direct a Polish novel "Zły" by Leopold Tyrmand. Żuławski and his wife Maria Strzelecka had 2 children together: son Kaj Żuławski (born 2002) and daughter Jagna Żuławska (born 2009).'), ('Snow White and the Seven Dwarfs (1955 film)', 'Snow White and the Seven Dwarfs( USA:" Snow White") is a 1955 German film, directed by Erich Kobler, based on the story of Schneewittchen by the Brothers Grimm.')]

GROUND TRUTH: {'Po