# HotpotQA FullWiki Dataset Few-Shot Example Generation

This notebook imports the HotpotQA FullWiki dataset and creates few-shot examples for multi-hop question answering. We'll extract questions, relevant contexts, irrelevant contexts, and generate queries needed to retrieve missing data from the first 1000 samples.

## 1. Import Required Libraries

We'll import the necessary libraries for data processing, manipulation, and random sampling.

In [1]:
import json
import pandas as pd
import numpy as np
import random
from typing import List, Dict, Any
from datasets import load_dataset
import re
from collections import defaultdict

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load HotpotQA FullWiki Dataset

Load the HotpotQA FullWiki dataset using the Hugging Face datasets library.

In [2]:
# Load HotpotQA FullWiki dataset
print("Loading HotpotQA FullWiki dataset...")
dataset = load_dataset("hotpot_qa", "fullwiki", trust_remote_code=True)

# Get training data
train_data = dataset['train']
print(f"Dataset loaded! Total training samples: {len(train_data)}")

# Convert first 1000 samples to a more manageable format
first_1000 = train_data.select(range(min(1000, len(train_data))))
print(f"Selected first {len(first_1000)} samples for processing")

Loading HotpotQA FullWiki dataset...


Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

FSTimeoutError: 

## 3. Explore Dataset Structure

Let's examine the structure of the HotpotQA dataset to understand the data format.

In [None]:
# Examine the structure of a sample
sample = first_1000[0]
print("Sample structure:")
for key, value in sample.items():
    if isinstance(value, list) and len(value) > 0:
        print(f"{key}: {type(value)} with {len(value)} items")
        if isinstance(value[0], dict):
            print(f"  First item keys: {list(value[0].keys())}")
    else:
        print(f"{key}: {type(value)} - {value}")
    print()

## 4. Randomly Select Samples

Randomly select a subset of samples from the first 1000 entries for processing.

In [None]:
# Randomly select samples for few-shot example generation
num_examples = 10  # Number of few-shot examples to generate
selected_indices = random.sample(range(len(first_1000)), num_examples)

print(f"Selected {num_examples} random samples from indices: {selected_indices}")

# Extract selected samples
selected_samples = [first_1000[i] for i in selected_indices]
print(f"Successfully extracted {len(selected_samples)} samples")

## 5. Extract Relevant and Irrelevant Contexts

For each sample, we'll identify relevant contexts (supporting facts) and create irrelevant contexts from other documents.

In [None]:
def extract_relevant_contexts(sample):
    """Extract relevant contexts based on supporting facts."""
    supporting_facts = sample['supporting_facts']
    context = sample['context']
    
    relevant_contexts = []
    
    # supporting_facts is a dict with 'title' and 'sent_id' keys
    supporting_titles = supporting_facts['title']
    supporting_sent_ids = supporting_facts['sent_id']
    
    # context is a dict with 'title' and 'sentences' keys
    context_titles = context['title']
    context_sentences = context['sentences']
    
    for title, sent_id in zip(supporting_titles, supporting_sent_ids):
        # Find the index of this title in context
        try:
            title_index = context_titles.index(title)
            sentences = context_sentences[title_index]
            
            if sent_id < len(sentences):
                relevant_contexts.append({
                    'title': title,
                    'sentence': sentences[sent_id],
                    'sentence_id': sent_id
                })
        except (ValueError, IndexError):
            # Title not found in context or sentence_id out of range
            continue
    
    return relevant_contexts

def extract_irrelevant_contexts(sample, num_irrelevant=3):
    """Extract irrelevant contexts from documents not in supporting facts."""
    supporting_facts = sample['supporting_facts']
    context = sample['context']
    
    # Get supporting titles as a set for faster lookup
    supporting_titles = set(supporting_facts['title'])
    
    context_titles = context['title']
    context_sentences = context['sentences']
    
    irrelevant_contexts = []
    
    for i, (doc_title, sentences) in enumerate(zip(context_titles, context_sentences)):
        if doc_title not in supporting_titles and sentences:
            # Take first sentence from irrelevant documents
            irrelevant_contexts.append({
                'title': doc_title,
                'sentence': sentences[0],
                'sentence_id': 0
            })
    
    # Randomly sample if we have too many
    if len(irrelevant_contexts) > num_irrelevant:
        irrelevant_contexts = random.sample(irrelevant_contexts, num_irrelevant)
    
    return irrelevant_contexts

print("Context extraction functions defined!")

## 6. Generate Queries for Missing Data

Create queries that would help retrieve the missing information needed to answer each question.

In [None]:
import re

def generate_missing_data_query(sample, relevant_contexts):
    """Generate a query to retrieve missing data needed for the question."""
    question = sample['question']
    answer = sample['answer']
    question_type = sample.get('type', 'unknown')
    
    # Extract key entities from the question
    question_lower = question.lower()
    question_words = question_lower.split()
    
    # Extract entities from relevant contexts
    entities = extract_entities_from_contexts(relevant_contexts)
    
    # Generate queries based on question type and patterns
    if question_type == 'comparison':
        # For comparison questions, create queries about the entities being compared
        if len(entities) >= 2:
            return f"{entities[0]} vs {entities[1]} comparison"
        elif entities:
            return f"{entities[0]} information details"
    
    elif 'when' in question_words or 'what year' in question_lower or 'date' in question_lower:
        # Temporal questions
        if entities:
            return f"when was {entities[0]} founded started established"
        else:
            return f"date year {' '.join(question_words[-3:])}"
    
    elif 'where' in question_words or 'location' in question_lower:
        # Location questions
        if entities:
            return f"where is {entities[0]} located"
        else:
            return f"location of {answer}"
    
    elif 'who' in question_words:
        # Person questions
        if entities:
            return f"who is {entities[0]} biography"
        else:
            return f"information about {answer}"
    
    elif 'what' in question_words:
        # General what questions
        if 'magazine' in question_lower or 'publication' in question_lower:
            if entities:
                return f"{entities[0]} magazine publication details"
        elif entities:
            return f"what is {entities[0]} definition"
    
    elif 'which' in question_words:
        # Which questions - often comparisons
        if entities:
            return f"{' '.join(entities)} comparison details"
    
    # Fallback: create a query based on the answer and question context
    if entities:
        return f"{entities[0]} {answer} information"
    else:
        return f"information about {answer}"

def extract_entities_from_contexts(contexts):
    """Extract potential entities from context titles and sentences."""
    entities = []
    
    for ctx in contexts:
        title = ctx['title']
        # Clean up titles - remove common words and extract main entities
        title_clean = re.sub(r'\(.*?\)', '', title).strip()  # Remove parentheses
        title_clean = re.sub(r"'s\b", '', title_clean)  # Remove possessive 's
        
        # Split on common separators and take meaningful parts
        parts = re.split(r'[,\-–—]', title_clean)
        for part in parts:
            part = part.strip()
            if len(part) > 2 and not part.lower() in ['the', 'and', 'or', 'of', 'in', 'for', 'to', 'a', 'an']:
                entities.append(part)
        
    # Remove duplicates while preserving order
    seen = set()
    unique_entities = []
    for entity in entities:
        if entity.lower() not in seen:
            seen.add(entity.lower())
            unique_entities.append(entity)
    
    return unique_entities[:3]  # Return first 3 unique entities

print("Query generation functions defined!")

## 7. Generate Few-Shot Examples

Combine all components to create comprehensive few-shot examples for multi-hop question answering.

In [None]:
# Generate few-shot examples
few_shot_examples = []

for i, sample in enumerate(selected_samples):
    print(f"Processing sample {i+1}/{len(selected_samples)}...")
    
    # Extract components
    question = sample['question']
    answer = sample['answer']
    question_type = sample.get('type', 'unknown')
    level = sample.get('level', 'unknown')
    
    relevant_contexts = extract_relevant_contexts(sample)
    irrelevant_contexts = extract_irrelevant_contexts(sample)
    missing_data_query = generate_missing_data_query(sample, relevant_contexts)
    
    # Create a simple query based on the question for comparison
    simple_query = question.replace('?', '').strip()
    if len(simple_query) > 50:
        # Take first part of long questions
        simple_query = ' '.join(simple_query.split()[:8]) + '...'
    
    # Create few-shot example
    few_shot_example = {
        'id': sample['id'],
        'question': question,
        'answer': answer,
        'type': question_type,
        'level': level,
        'relevant_contexts': relevant_contexts,
        'irrelevant_contexts': irrelevant_contexts,
        'query': missing_data_query,  # Main generated query
        'simple_query': simple_query,  # Simple fallback query
        'num_supporting_facts': len(sample['supporting_facts']['title']),
        'total_context_docs': len(sample['context']['title'])
    }
    
    few_shot_examples.append(few_shot_example)
    
    # Print progress info
    print(f"  Question type: {question_type}, Level: {level}")
    print(f"  Supporting facts: {len(relevant_contexts)}, Irrelevant contexts: {len(irrelevant_contexts)}")
    print(f"  Generated query: {missing_data_query}")
    print()

print(f"Generated {len(few_shot_examples)} few-shot examples!")

## 8. Display Generated Few-Shot Examples

Let's examine the generated few-shot examples to verify their quality.

In [None]:
# Display the generated few-shot examples
for i, example in enumerate(few_shot_examples[:3]):  # Show first 3 examples
    print(f"\n{'='*60}")
    print(f"EXAMPLE {i+1}")
    print(f"{'='*60}")
    
    print(f"\nID: {example['id']}")
    print(f"Type: {example['type']} | Level: {example['level']}")
    print(f"Supporting Facts: {example['num_supporting_facts']} | Total Context Docs: {example['total_context_docs']}")
    
    print(f"\nQuestion: {example['question']}")
    print(f"Answer: {example['answer']}")
    
    print(f"\n--- RELEVANT CONTEXTS ({len(example['relevant_contexts'])}) ---")
    for j, ctx in enumerate(example['relevant_contexts']):
        print(f"  {j+1}. [{ctx['title']}] (Sent {ctx['sentence_id']})")
        print(f"     {ctx['sentence']}")
        print()
    
    print(f"--- IRRELEVANT CONTEXTS ({len(example['irrelevant_contexts'])}) ---")
    for j, ctx in enumerate(example['irrelevant_contexts']):
        sentence_preview = ctx['sentence'][:100] + '...' if len(ctx['sentence']) > 100 else ctx['sentence']
        print(f"  {j+1}. [{ctx['title']}] {sentence_preview}")
    
    print(f"\n--- GENERATED QUERIES ---")
    print(f"Main Query: {example['query']}")
    print(f"Simple Query: {example['simple_query']}")

print(f"\n\nShowing 3 out of {len(few_shot_examples)} generated examples.")

## 9. Save Few-Shot Examples

Save the generated few-shot examples to a JSON file for future use.

In [None]:
# Save few-shot examples to JSON file
output_file = 'hotpotqa_fewshot_examples.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(few_shot_examples, f, indent=2, ensure_ascii=False)

print(f"Few-shot examples saved to {output_file}")
print(f"Total examples: {len(few_shot_examples)}")

# Display comprehensive summary statistics
print("\n" + "="*50)
print("SUMMARY STATISTICS")
print("="*50)

print(f"\n📊 Dataset Overview:")
print(f"- Total examples generated: {len(few_shot_examples)}")
print(f"- Average relevant contexts per example: {np.mean([len(ex['relevant_contexts']) for ex in few_shot_examples]):.2f}")
print(f"- Average irrelevant contexts per example: {np.mean([len(ex['irrelevant_contexts']) for ex in few_shot_examples]):.2f}")
print(f"- Average supporting facts per example: {np.mean([ex['num_supporting_facts'] for ex in few_shot_examples]):.2f}")
print(f"- Average total context docs per example: {np.mean([ex['total_context_docs'] for ex in few_shot_examples]):.2f}")

# Question type distribution
print(f"\n🔍 Question Types:")
question_types = defaultdict(int)
for ex in few_shot_examples:
    question_types[ex['type']] += 1

for qtype, count in sorted(question_types.items()):
    percentage = (count / len(few_shot_examples)) * 100
    print(f"  - {qtype}: {count} ({percentage:.1f}%)")

# Question level distribution  
print(f"\n📈 Difficulty Levels:")
levels = defaultdict(int)
for ex in few_shot_examples:
    levels[ex['level']] += 1

for level, count in sorted(levels.items()):
    percentage = (count / len(few_shot_examples)) * 100
    print(f"  - {level}: {count} ({percentage:.1f}%)")

# Query analysis
print(f"\n📝 Query Analysis:")
query_lengths = [len(ex['query'].split()) for ex in few_shot_examples]
print(f"- Average query length: {np.mean(query_lengths):.1f} words")
print(f"- Query length range: {min(query_lengths)}-{max(query_lengths)} words")

# Sample some queries by type
print(f"\n🔍 Sample Queries by Type:")
for qtype in question_types.keys():
    examples_of_type = [ex for ex in few_shot_examples if ex['type'] == qtype]
    if examples_of_type:
        sample_ex = examples_of_type[0]
        print(f"\n  {qtype.upper()}:")
        print(f"    Question: {sample_ex['question'][:80]}...")
        print(f"    Query: {sample_ex['query']}")

print(f"\n✅ Few-shot dataset generation completed successfully!")

In [None]:
# Create plain examples from the HotpotQA few-shot dataset
plain_examples = []

# Select up to 3 examples
for i, example in enumerate(few_shot_examples[:3]):
    # Format relevant contexts as a single string
    relevant_data = " ".join([f"[{ctx['title']}] {ctx['sentence']}" for ctx in example['relevant_contexts']])
    
    # Format irrelevant contexts as a single string
    irrelevant_data = " ".join([f"[{ctx['title']}] {ctx['sentence']}" for ctx in example['irrelevant_contexts']])
    
    # Create plain example tuple
    plain_example = {
        'question': example['question'],
        'relevant_data': relevant_data,
        'irrelevant_data': irrelevant_data,
        'query': example['query']
    }
    
    plain_examples.append(plain_example)

# Display the plain examples
print("Plain Few-Shot Examples (Question, Relevant Data, Irrelevant Data, Query):")
print("=" * 80)

for i, example in enumerate(plain_examples):
    print(f"\nExample {i+1}:")
    print(f"Question: {example['question']}")
    print(f"Relevant Data: {example['relevant_data'][:200]}...")
    print(f"Irrelevant Data: {example['irrelevant_data'][:200]}...")
    print(f"Query: {example['query']}")
    print("-" * 80)

print(f"\n✅ Created {len(plain_examples)} plain examples!")

## 10. Create Simplified Few-Shot Examples for Query Generation

Create a simplified version of the few-shot examples specifically designed for query generation training.

In [None]:
# Create simplified few-shot examples for query generation
query_fewshot_examples = []

for example in few_shot_examples:
    # Create a simplified version focused on query generation
    simplified_example = {
        'question': example['question'],
        'query': example['query'],
        'type': example['type'],
        'context': ' '.join([ctx['sentence'] for ctx in example['relevant_contexts'][:2]])  # Use first 2 relevant contexts
    }
    query_fewshot_examples.append(simplified_example)

# Save simplified examples
simplified_output_file = 'fewshot_examples.json'
with open(simplified_output_file, 'w', encoding='utf-8') as f:
    json.dump(query_fewshot_examples, f, indent=2, ensure_ascii=False)

print(f"Simplified few-shot examples saved to {simplified_output_file}")
print(f"Format suitable for query generation training: {len(query_fewshot_examples)} examples")

# Show a few examples of the simplified format
print("\n" + "="*40)
print("SIMPLIFIED FORMAT EXAMPLES")
print("="*40)

for i, ex in enumerate(query_fewshot_examples[:2]):
    print(f"\nExample {i+1}:")
    print(f"Question: {ex['question']}")
    print(f"Context: {ex['context'][:100]}...")
    print(f"Query: {ex['query']}")
    print(f"Type: {ex['type']}")

print(f"\n✅ Both detailed and simplified few-shot datasets created successfully!")
print(f"- Detailed: {output_file} ({len(few_shot_examples)} examples)")
print(f"- Simplified: {simplified_output_file} ({len(query_fewshot_examples)} examples)")

In [None]:
def format_simple_fewshot_examples(examples, num_examples=3):
    """Create simple few-shot examples with question, contexts, and query."""
    formatted_examples = []
    
    # Select random examples
    selected = random.sample(examples, min(num_examples, len(examples)))
    
    for example in selected:
        # Format relevant contexts
        relevant_contexts = []
        for ctx in example['relevant_contexts']:
            relevant_contexts.append(f"[{ctx['title']}] {ctx['sentence']}")
        
        # Format irrelevant contexts  
        irrelevant_contexts = []
        for ctx in example['irrelevant_contexts']:
            irrelevant_contexts.append(f"[{ctx['title']}] {ctx['sentence']}")
        
        # Create formatted example
        formatted_example = {
            'question': example['question'],
            'relevant_contexts': relevant_contexts,
            'irrelevant_contexts': irrelevant_contexts,
            'query': example['query']
        }
        
        formatted_examples.append(formatted_example)
    
    return formatted_examples

# Generate simple few-shot examples
simple_fewshot_examples = format_simple_fewshot_examples(fewshot_data, 3)

# Display the examples
print("Simple Few-Shot Examples:")
print("=" * 50)

for i, example in enumerate(simple_fewshot_examples):
    print(f"\nExample {i+1}:")
    print("-" * 30)
    print(f"Question: {example['question']}")
    
    print(f"\nRelevant Contexts:")
    for j, ctx in enumerate(example['relevant_contexts'], 1):
        print(f"  {j}. {ctx}")
    
    print(f"\nIrrelevant Contexts:")
    for j, ctx in enumerate(example['irrelevant_contexts'], 1):
        print(f"  {j}. {ctx}")
    
    print(f"\nQuery: {example['query']}")
    print("=" * 50)

print(f"\n✅ Generated {len(simple_fewshot_examples)} simple few-shot examples!")

FileNotFoundError: [Errno 2] No such file or directory: 'hotpotqa_fewshot_examples.json'