# 03 - Data Preparation for Training

This notebook prepares the synthetic BRD data for fine-tuning.

## What we'll do:
1. Load the generated BRD dataset
2. Define the Pydantic schema
3. Validate labels against the schema
4. Format data for instruction tuning
5. Create train/validation/test splits
6. Analyze label and text statistics
7. Save formatted datasets, schema, and metadata

## 1. Load Generated Dataset

In [None]:
import json
import pandas as pd
from pathlib import Path
import random
from typing import Dict, List
from pydantic import BaseModel, Field, field_validator
from sklearn.model_selection import train_test_split

# Seed Python's RNG so any later use of `random` in this notebook is reproducible
random.seed(42)

# Collect every complete dataset written by the generation notebook
data_dir = Path("../data/synthetic_brds")
dataset_files = list(data_dir.glob("complete_dataset_*.json"))

if not dataset_files:
    raise FileNotFoundError("No complete dataset found. Run 02_data_generation.ipynb first.")

# "Most recent" is decided by filesystem modification time, not by file name
latest_file = max(dataset_files, key=lambda p: p.stat().st_mtime)
print(f"Loading dataset: {latest_file.name}")

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which would garble or reject non-ASCII BRD text on
# systems whose locale encoding is not UTF-8.
# NOTE(review): assumes 02_data_generation writes UTF-8 (or pure ASCII) — confirm.
with open(latest_file, "r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"✓ Loaded {len(dataset)} samples")

## 2. Define Pydantic Schema

This schema will be used for validation and later with Pydantic AI.

In [None]:
class ProjectEstimation(BaseModel):
    """Three numeric estimates extracted from a BRD: effort, timeline and cost.

    All three fields must be strictly positive. Two extra rules are enforced
    by validators:

    * ``timeline_weeks`` may not exceed 104 (two years);
    * ``cost_usd`` divided by ``effort_hours`` may not fall below 10.
    """

    # Field declaration order matters: ``cost_usd`` is validated after
    # ``effort_hours`` so its validator can read the already-validated effort.
    effort_hours: float = Field(gt=0, description="Total project effort in hours")
    # NOTE(review): the ``le=520`` bound (10 years) is effectively superseded
    # by the stricter 104-week cap in ``validate_timeline`` below.
    timeline_weeks: int = Field(gt=0, le=520, description="Project timeline in weeks")
    cost_usd: float = Field(gt=0, description="Estimated project cost in USD")

    @field_validator('timeline_weeks')
    @classmethod
    def validate_timeline(cls, v):
        """Accept timelines of at most 104 weeks; reject anything longer."""
        if not v > 104:
            return v
        raise ValueError('Timeline exceeds reasonable range for typical projects')

    @field_validator('cost_usd')
    @classmethod
    def validate_cost(cls, v, info):
        """Reject costs that imply an hourly rate under $10.

        The check is skipped when ``effort_hours`` is not available in the
        already-validated data (for example because it failed validation).
        """
        effort = info.data.get('effort_hours')
        if not effort:
            return v
        if v / effort < 10:
            raise ValueError('Cost per hour too low (minimum $10/hour)')
        return v

# Smoke-test the schema with one known-good estimate (100 USD per hour)
test_estimation = ProjectEstimation(effort_hours=480.0, timeline_weeks=12, cost_usd=48000.0)

# Show the validated instance serialised as indented JSON
schema_preview = test_estimation.model_dump_json(indent=2)
print("Pydantic Schema:")
print(schema_preview)
print("\n✓ Schema defined and validated")

## 3. Validate All Labels

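Check every ground-truth label against the Pydantic schema and keep only the samples that pass; samples that fail are reported and excluded from the rest of the pipeline.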

In [None]:
print("Validating all labels...\n")

valid_samples = []
invalid_samples = []

for sample in dataset:
    try:
        # Constructing the model runs every field constraint and validator
        ProjectEstimation(**sample["labels"])
    except (ValueError, TypeError, KeyError) as e:
        # ValueError covers pydantic's ValidationError (a ValueError subclass
        # in Pydantic v2); TypeError covers labels that are not a mapping;
        # KeyError covers a sample with no "labels" entry at all.
        # Use .get() here so a sample missing "id" or "labels" is recorded as
        # invalid instead of raising a second error inside this handler.
        invalid_samples.append({
            "sample_id": sample.get("id"),
            "error": str(e),
            "labels": sample.get("labels"),
        })
    else:
        valid_samples.append(sample)

print(f"Valid samples: {len(valid_samples)}")
print(f"Invalid samples: {len(invalid_samples)}")

if invalid_samples:
    print("\nFirst few invalid samples:")
    for inv in invalid_samples[:3]:
        print(f"  ID {inv['sample_id']}: {inv['error']}")

# Fail here with a clear message rather than with an IndexError or a
# train_test_split error further down the notebook.
if not valid_samples:
    raise ValueError("No samples passed validation; cannot build a training set.")

# Use only valid samples
dataset = valid_samples
print(f"\n✓ Using {len(dataset)} validated samples")

## 4. Format Data for Instruction Tuning

Convert to the instruction-following format expected by the model.

In [None]:
def format_for_training(sample: Dict) -> Dict:
    """
    Turn one raw BRD sample into an instruction-tuning record.

    The prompt uses the Alpaca/Llama layout: an "### Instruction:" section,
    an "### Input:" section holding ``sample["brd_text"]``, and an
    "### Output:" section holding ``sample["labels"]`` serialised as JSON.
    Sections are separated by a blank line.

    Returns a dict with the assembled prompt under "text", the sample's "id",
    and its "metadata" (an empty dict when the sample has none).
    """
    instruction = """Extract the project estimation fields from the following Business Requirements Document.
Return a JSON object with these exact fields: effort_hours (number), timeline_weeks (number), cost_usd (number).
Return ONLY the JSON object, no additional text."""

    # (header, body) pairs in the order they appear in the prompt
    sections = (
        ("### Instruction:", instruction),
        ("### Input:", sample["brd_text"]),
        ("### Output:", json.dumps(sample["labels"], ensure_ascii=False)),
    )
    prompt = "\n\n".join(f"{header}\n{body}" for header, body in sections)

    record = {"text": prompt, "id": sample["id"]}
    record["metadata"] = sample.get("metadata", {})
    return record

# Run the formatter over every validated sample, preserving order
formatted_dataset = list(map(format_for_training, dataset))

# Preview the first record between two horizontal rules
separator = "=" * 80
print("Example formatted sample:")
print(separator)
print(formatted_dataset[0]["text"])
print(separator)
print(f"\n✓ Formatted {len(formatted_dataset)} samples")

## 5. Create Train/Val/Test Splits

Split: 80% train, 10% validation, 10% test

In [None]:
# Stage 1: hold out 20% of the records; the other 80% is the training set
train_data, temp_data = train_test_split(formatted_dataset, test_size=0.2, random_state=42)

# Stage 2: cut the hold-out in half, giving validation and test sets
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Report each split's size and its share of the full formatted dataset
n_total = len(formatted_dataset)
n_train = len(train_data)
n_val = len(val_data)
n_test = len(test_data)

print("Dataset Splits:")
print(f"  Training:   {n_train:4d} samples ({n_train/n_total*100:.1f}%)")
print(f"  Validation: {n_val:4d} samples ({n_val/n_total*100:.1f}%)")
print(f"  Test:       {n_test:4d} samples ({n_test/n_total*100:.1f}%)")
print(f"  Total:      {n_total:4d} samples")

print("\n✓ Data split created")

## 6. Analyze Data Statistics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tabulate the labels of the validated samples, one row per sample
df = pd.DataFrame([sample["labels"] for sample in dataset])

print("Label Statistics:")
print("=" * 80)
print(df.describe())
print("=" * 80)

# Length, in characters, of each fully formatted training prompt
text_lengths = [len(sample["text"]) for sample in formatted_dataset]
print(f"\nText Length Statistics:")
print(f"  Mean:   {sum(text_lengths) / len(text_lengths):.0f} characters")
print(f"  Min:    {min(text_lengths)} characters")
print(f"  Max:    {max(text_lengths)} characters")
# For an even number of samples this reports the upper of the two middle values
print(f"  Median: {sorted(text_lengths)[len(text_lengths)//2]} characters")

# One histogram per label, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, title, x-axis label, extra hist kwargs); the first panel keeps
# matplotlib's default bar colour
histogram_specs = [
    ("effort_hours", "Effort Hours Distribution", "Effort (hours)", {}),
    ("timeline_weeks", "Timeline Distribution", "Timeline (weeks)", {"color": "green"}),
    ("cost_usd", "Cost Distribution", "Cost (USD)", {"color": "orange"}),
]
for ax, (column, title, xlabel, extra) in zip(axes, histogram_specs):
    ax.hist(df[column], bins=30, edgecolor='black', alpha=0.7, **extra)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")

plt.tight_layout()

# The processed-data directory is otherwise only created by the later save
# step, so create it here; without this, savefig fails on a first run.
figure_path = Path("../data/processed/label_distributions.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Statistics visualized and saved")

## 7. Save Formatted Datasets

In [None]:
from datetime import datetime

# Destination folder for all processed artefacts (its parent must already exist)
output_dir = Path("../data/processed")
output_dir.mkdir(exist_ok=True)

# Split name -> list of formatted records
splits = {"train": train_data, "validation": val_data, "test": test_data}

# Pass 1: one indented JSON array per split
for split_name, split_data in splits.items():
    output_file = output_dir / f"{split_name}.json"
    output_file.write_text(json.dumps(split_data, indent=2))
    print(f"✓ Saved {split_name}: {len(split_data)} samples -> {output_file}")

# Pass 2: the same records as JSONL (one JSON object per line, efficient for training)
for split_name, split_data in splits.items():
    output_file = output_dir / f"{split_name}.jsonl"
    with open(output_file, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in split_data)
    print(f"✓ Saved {split_name}: {len(split_data)} samples -> {output_file} (JSONL)")

## 8. Save Pydantic Schema

In [None]:
# Write a standalone module holding the schema so later notebooks and scripts
# can import it without re-running this notebook.
schema_file = output_dir / "pydantic_schema.py"

# Source text of the generated module. The JSON Schema uses the numeric form
# of "exclusiveMinimum" (draft-06 and later): the old draft-04 boolean form
# ("minimum": 0, "exclusiveMinimum": True) is invalid in current drafts, and
# validators that read True as the number 1 would enforce "value > 1".
schema_code = '''"""Pydantic schema for BRD project estimation extraction."""

from pydantic import BaseModel, Field, field_validator

class ProjectEstimation(BaseModel):
    """Schema for project estimation extraction from BRDs."""
    
    effort_hours: float = Field(
        gt=0,
        description="Total project effort in hours"
    )
    timeline_weeks: int = Field(
        gt=0,
        le=520,
        description="Project timeline in weeks"
    )
    cost_usd: float = Field(
        gt=0,
        description="Estimated project cost in USD"
    )
    
    @field_validator("timeline_weeks")
    @classmethod
    def validate_timeline(cls, v):
        if v > 104:  # 2 years
            raise ValueError("Timeline exceeds reasonable range for typical projects")
        return v
    
    @field_validator("cost_usd")
    @classmethod
    def validate_cost(cls, v, info):
        effort = info.data.get("effort_hours")
        if effort and v / effort < 10:
            raise ValueError("Cost per hour too low (minimum $10/hour)")
        return v

# JSON Schema (for use with outlines/instructor)
ESTIMATION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "effort_hours": {
            "type": "number",
            "exclusiveMinimum": 0
        },
        "timeline_weeks": {
            "type": "integer",
            "minimum": 1,
            "maximum": 520
        },
        "cost_usd": {
            "type": "number",
            "exclusiveMinimum": 0
        }
    },
    "required": ["effort_hours", "timeline_weeks", "cost_usd"],
    "additionalProperties": False
}
'''

# Python source files are UTF-8 by default, so write the module as UTF-8
with open(schema_file, "w", encoding="utf-8") as f:
    f.write(schema_code)

print(f"✓ Pydantic schema saved to: {schema_file}")

## 9. Create Dataset Metadata

In [None]:
# Timestamp first, so it reflects when metadata assembly started
created_at = datetime.now().isoformat()

# Converter applied to each label column's min/max; mean and std are always
# stored as floats. Dict order fixes the column order in the output.
extreme_casts = {"effort_hours": float, "timeline_weeks": int, "cost_usd": float}
label_statistics = {
    column: {
        "mean": float(df[column].mean()),
        "std": float(df[column].std()),
        "min": cast(df[column].min()),
        "max": cast(df[column].max()),
    }
    for column, cast in extreme_casts.items()
}

# Summary of the run: sample counts, label and text statistics, prompt template
metadata = {
    "created_at": created_at,
    "total_samples": len(formatted_dataset),
    "splits": {
        "train": len(train_data),
        "validation": len(val_data),
        "test": len(test_data),
    },
    "label_statistics": label_statistics,
    "text_statistics": {
        "mean_length": sum(text_lengths) / len(text_lengths),
        "min_length": min(text_lengths),
        "max_length": max(text_lengths),
    },
    "format": "instruction_tuning",
    "instruction_template": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n{output}",
}

# Persist next to the processed splits
metadata_file = output_dir / "dataset_metadata.json"
with open(metadata_file, "w") as f:
    json.dump(metadata, f, indent=2)

# Echo the same content to the notebook output
rule = "=" * 80
print("Dataset Metadata:")
print(rule)
print(json.dumps(metadata, indent=2))
print(rule)
print(f"\n✓ Metadata saved to: {metadata_file}")

## 10. Verify Data is Ready for Training

In [None]:
print("Data Preparation Checklist:")
print("=" * 80)

# Summary lines for the steps run above. Every line is printed with a tick;
# nothing is re-verified in this cell.
check_descriptions = [
    f"Total samples: {len(formatted_dataset)}",
    f"Training samples: {len(train_data)}",
    f"Validation samples: {len(val_data)}",
    f"Test samples: {len(test_data)}",
    "All labels validated with Pydantic",
    "Instruction tuning format applied",
    "Train/val/test splits created (80/10/10)",
    "Data saved in JSON and JSONL formats",
    "Pydantic schema saved",
    "Dataset metadata created",
    "Statistics and visualizations generated",
]
checks = [("✓", description) for description in check_descriptions]

for symbol, check in checks:
    # print() joins its arguments with a single space
    print(symbol, check)

print("=" * 80)
print("\n✅ Data preparation complete! Ready for training.")

## Summary

### What we've done:
- ✓ Loaded synthetic BRD dataset
- ✓ Defined Pydantic schema for validation
- ✓ Validated all ground truth labels
- ✓ Formatted data for instruction tuning
- ✓ Created train/val/test splits (80/10/10)
- ✓ Generated dataset statistics and visualizations
- ✓ Saved data in multiple formats (JSON, JSONL)
- ✓ Created metadata and documentation

### Files Created:
- `data/processed/train.json` and `train.jsonl`
- `data/processed/validation.json` and `validation.jsonl`
- `data/processed/test.json` and `test.jsonl`
- `data/processed/pydantic_schema.py`
- `data/processed/dataset_metadata.json`
- `data/processed/label_distributions.png`

### Next Steps:
Move on to `04_training.ipynb` to fine-tune the model with QLoRA!

### Notes:
- Data is in instruction-tuning format for optimal learning
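- All retained labels pass Pydantic validation (samples that fail are dropped in step 3)
- Label distributions are plotted in step 6; review them to confirm the dataset is balanced and diverse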
- Ready for efficient training with TRL's SFTTrainer