# 01 - Data Exploration

This notebook explores the TruthfulQA and HotpotQA datasets to understand their structure and content.

## Setup

In [None]:
# Add project root to path
import sys
sys.path.insert(0, '..')

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import our modules
from src.data import TruthfulQADataset, HotpotQADataset, DistortionType

# Set display options
pd.set_option('display.max_colwidth', 100)

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship the un-prefixed names. Use whichever one this
# installation provides so the setup cell does not fail on the style lookup.
# If neither is present, Matplotlib's current style is left untouched.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

print("Setup complete!")

## 1. TruthfulQA Dataset

TruthfulQA measures whether language models mimic human falsehoods. It contains 817 questions designed to induce false answers.

In [None]:
# Locate the TruthfulQA CSV and load it, or explain how to fetch it
truthfulqa_path = Path('../data/raw/TruthfulQA.csv')

if not truthfulqa_path.exists():
    # File missing: `truthfulqa` stays undefined, which the guarded cells
    # further down use as their signal to skip.
    print(
        f"Dataset not found at {truthfulqa_path}",
        "Please download it using: ",
        "wget -O data/raw/TruthfulQA.csv https://raw.githubusercontent.com/sylinrl/TruthfulQA/main/TruthfulQA.csv",
        sep="\n",
    )
else:
    truthfulqa = TruthfulQADataset(str(truthfulqa_path))
    print(f"Loaded TruthfulQA:  {len(truthfulqa)} questions")

In [None]:
# Headline statistics for TruthfulQA (skipped when the dataset was not loaded)
if 'truthfulqa' in globals():
    stats = truthfulqa.get_statistics()
    summary_lines = [
        f"Total questions: {stats['total_examples']}",
        f"Number of categories: {stats['num_categories']}",
        f"\nCategories: {truthfulqa.get_categories()}",
    ]
    print("\n".join(summary_lines))

In [None]:
# Show the first 15 rows of the per-category summary
# (skipped when the dataset was not loaded)
if 'truthfulqa' in globals():
    category_summary = truthfulqa.get_categories_summary()
    print("Category Distribution:", category_summary.head(15), sep="\n")

In [None]:
# Plot category distribution as a horizontal bar chart of the first 15 rows
# of the category summary (skipped when the dataset was not loaded)
if 'truthfulqa' in dir():
    fig, ax = plt.subplots(figsize=(12, 8))

    summary = truthfulqa.get_categories_summary().head(15)
    colors = sns.color_palette('husl', len(summary))

    bars = ax.barh(summary['Category'], summary['Count'], color=colors)
    # barh places the first row at the bottom of the axis; flip it so the
    # chart reads top-to-bottom in the same order as the summary table.
    ax.invert_yaxis()
    ax.set_xlabel('Number of Questions')
    ax.set_title('TruthfulQA:  Top 15 Categories by Question Count')

    # Annotate each bar with its count; the padding is in points, so the gap
    # does not depend on the scale of the x-axis.
    ax.bar_label(bars, padding=3)

    plt.tight_layout()
    plt.show()

In [None]:
# Print five seeded sample questions with their answers
# (skipped when the dataset was not loaded)
if 'truthfulqa' in globals():
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("Sample Questions from TruthfulQA:\n")
    print(heavy_rule)

    for item in truthfulqa.sample(5, seed=42):
        card = [
            f"Category: {item.category}",
            f"Question: {item.question}",
            f"Correct Answer:  {item.correct_answer}",
        ]
        # Only show a wrong answer when the example actually carries one
        if item.incorrect_answers:
            card.append(f"Common Wrong Answer: {item.incorrect_answers[0]}")
        card.append(light_rule)
        print("\n".join(card))

In [None]:
# Collect the adversarial pairs (correct vs incorrect answers) and show the
# first three (skipped when the dataset was not loaded)
if 'truthfulqa' in globals():
    pairs = truthfulqa.get_adversarial_pairs()
    print(f"Found {len(pairs)} adversarial pairs\n")

    print("Example Adversarial Pairs:")
    for entry in pairs[:3]:
        # The trailing empty string produces the blank line between pairs
        print(
            f"Q:  {entry['question']}",
            f"✓ Correct:  {entry['correct_answer']}",
            f"✗ Incorrect: {entry['incorrect_answer']}",
            "",
            sep="\n",
        )

## 2. HotpotQA Dataset

HotpotQA is a dataset for multi-hop question answering that requires reasoning over multiple documents.

In [None]:
# Load HotpotQA
hotpotqa_path = Path('../data/raw/hotpot_dev_distractor_v1.json')

# Cap on how many examples are loaded for exploration. Named so it can be
# tuned in one place instead of being buried in the constructor call.
HOTPOTQA_MAX_EXAMPLES = 500

if hotpotqa_path.exists():
    # Load only a subset of the examples for exploration
    hotpotqa = HotpotQADataset(str(hotpotqa_path), max_examples=HOTPOTQA_MAX_EXAMPLES)
    print(f"Loaded HotpotQA: {len(hotpotqa)} questions")
else:
    # File missing: `hotpotqa` stays undefined, which the guarded cells
    # further down use as their signal to skip.
    print(f"Dataset not found at {hotpotqa_path}")
    print("Please download it using:")
    print("wget -O data/raw/hotpot_dev_distractor_v1.json http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json")

In [None]:
# Summarise the loaded HotpotQA examples (skipped when the dataset was not loaded)
if 'hotpotqa' in globals():
    stats = hotpotqa.get_statistics()
    # 'total_examples' is read directly; the remaining keys fall back to
    # empty/zero defaults when absent.
    report = [
        "HotpotQA Statistics:",
        f"Total questions: {stats['total_examples']}",
        f"\nQuestion types: {stats.get('question_types', {})}",
        f"Difficulty levels: {stats.get('difficulty_levels', {})}",
        f"\nAverage context length: {stats.get('avg_context_length', 0):.0f} characters",
    ]
    print("\n".join(report))

In [None]:
# Pie charts of the question-type and difficulty breakdowns, side by side
# (skipped when the dataset was not loaded)
if 'hotpotqa' in globals():
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    stats = hotpotqa.get_statistics()

    # One entry per panel: (axis, statistics key, title, slice colours)
    panels = [
        (axes[0], 'question_types', 'Question Types', ['#3498db', '#e74c3c']),
        (axes[1], 'difficulty_levels', 'Difficulty Levels', ['#2ecc71', '#f1c40f', '#e74c3c']),
    ]

    for panel_ax, stats_key, panel_title, palette in panels:
        breakdown = stats.get(stats_key, {})
        # A missing or empty breakdown leaves that panel undrawn
        if not breakdown:
            continue
        panel_ax.pie(breakdown.values(), labels=breakdown.keys(), autopct='%1.1f%%', colors=palette)
        panel_ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

In [None]:
# Look at a sample HotpotQA question
# Skipped when the dataset was not loaded, and also when it loaded with zero
# examples, since indexing position 0 of an empty dataset would presumably
# raise rather than show a sample.
if 'hotpotqa' in dir() and len(hotpotqa) > 0:
    example = hotpotqa[0]

    print("Sample HotpotQA Question:")
    print("=" * 80)
    print(f"ID: {example.id}")
    print(f"Type: {example.category}")
    print(f"Difficulty: {example.difficulty}")
    print(f"\nQuestion: {example.question}")
    print(f"\nAnswer: {example.correct_answer}")
    print("\nSupporting Facts:")
    # supporting_facts may be None/empty; iterate over an empty list then
    for i, fact in enumerate(example.supporting_facts or [], 1):
        print(f"  {i}.{fact}")
    print("\nContext (first 500 chars):")
    # Truncate long contexts; show a placeholder when there is none
    print(example.context[:500] if example.context else "N/A")

In [None]:
# Get multi-hop examples (skipped when the dataset was not loaded)
if 'hotpotqa' in dir():
    # Single source of truth for the threshold, so the printed message
    # cannot drift from the value actually passed to the query.
    min_hops = 2
    multi_hop = hotpotqa.get_multi_hop_examples(min_hops=min_hops)
    print(f"Found {len(multi_hop)} multi-hop questions (requiring {min_hops}+ sources)")

    # Only announce an example when there is one to show
    if multi_hop:
        print("\nExample Multi-Hop Question:")
        ex = multi_hop[0]
        print(f"Q: {ex.question}")
        print(f"A: {ex.correct_answer}")

        # Tolerate a missing metadata mapping the same way the other optional
        # fields are handled elsewhere in this notebook.
        raw_facts = (ex.metadata or {}).get('raw_supporting_facts', [])
        # Each raw fact is unpacked as a pair; the first element is presumably
        # the source title (TODO confirm against the dataset loader).
        distinct_sources = {source for source, _ in raw_facts}
        print(f"Supporting facts from {len(distinct_sources)} different sources")

## 3. Summary

### Dataset Comparison

| Feature | TruthfulQA | HotpotQA |
|---------|------------|----------|
| Purpose | Test factual accuracy | Test multi-hop reasoning |
| Size | 817 questions | 7,405 dev questions (this notebook loads only the first 500) |
| Context | No context | Multi-paragraph context |
| Categories | 38 categories | 2 types (bridge, comparison) |
| Difficulty | N/A | Easy, Medium, Hard |
| Use in project | Misconception testing | Perturbation experiments |

In [None]:
# Closing message with pointers to the follow-up notebooks
closing_lines = [
    "Data exploration complete! ",
    "\nNext steps:",
    "1. Run notebook 02_api_setup_test.ipynb to test API connections",
    "2. Then proceed to baseline experiments",
]
print("\n".join(closing_lines))