# 02 - Data Preparation

This notebook handles data cleaning, preprocessing, and train/val/test splitting.

## Goals
- Clean and filter raw datasets
- Normalize text (whitespace, encoding)
- Create train/validation/test splits
- Export in a format suitable for fine-tuning (JSONL)


In [None]:
# Setup
import json
import re
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Directory layout: the project root is the parent of the current working directory
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Show the resolved locations so a wrong working directory is spotted early
print(f"Raw data: {DATA_RAW}")
print(f"Processed data: {DATA_PROCESSED}")


## 1. Text Cleaning Functions


In [None]:
def clean_text(text: str) -> str:
    """Clean and normalize text into a single whitespace-normalized line.

    Args:
        text: Raw text. Any non-string value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        The text with every whitespace run (including tabs and newlines)
        collapsed to one space, control characters removed, and no leading
        or trailing whitespace. Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Collapse whitespace first so that whitespace-like control characters
    # (e.g. \x0b, \x0c) act as word separators instead of being deleted.
    # Note: this also turns newlines and tabs into plain spaces.
    text = re.sub(r'\s+', ' ', text)

    # Remove the remaining (non-whitespace) control characters
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

    # Deleting a control character can leave two spaces side by side or
    # expose a space at either end, so collapse and trim once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()


def filter_example(
    row: pd.Series,
    min_source_len: int = 20,
    max_source_len: int = 2000,
    max_target_ratio: float = 1.5,
) -> bool:
    """Return True if example should be kept.

    Args:
        row: One example; read through ``row.get`` for the keys ``"source"``
            and ``"target"``.
        min_source_len: Minimum allowed source length in characters (inclusive).
        max_source_len: Maximum allowed source length in characters (inclusive).
        max_target_ratio: The target may be at most this many times as long
            as the source.

    Returns:
        False when either text is missing, empty or not a string, when the
        source length is out of bounds, when the target is too long relative
        to the source, or when both texts are equal after stripping;
        True otherwise.
    """
    source = row.get("source", "")
    target = row.get("target", "")

    # Missing values may arrive as None or NaN; NaN is truthy, so a plain
    # truthiness test would let it through and len() would then raise.
    if not isinstance(source, str) or not isinstance(target, str):
        return False

    # Must have both source and target
    if not source or not target:
        return False

    # Length constraints
    if not (min_source_len <= len(source) <= max_source_len):
        return False

    # Target should be shorter or similar length (not way longer)
    if len(target) > len(source) * max_target_ratio:
        return False

    # Source and target should be different
    if source.strip() == target.strip():
        return False

    return True


# Quick visual check: the input has padded ends and repeated inner spaces
test_text = "  This   has   extra   whitespace.  "
cleaned_text = clean_text(test_text)
print(f"Original: '{test_text}'")
print(f"Cleaned: '{cleaned_text}'")


## 2. Load and Clean Data


In [None]:
# TODO: Replace the placeholder records below with your actual dataset.
# Two sample records stand in for real data:

sample_data = [
    {"source": "Complex legal text here...", "target": "Simple version.", "lang": "de", "level": "easy"},
    {"source": "Another complex text...", "target": "Easier to read.", "lang": "de", "level": "easy"},
]

df = pd.DataFrame(sample_data)

# Normalize both text columns with the same cleaning function
for text_column in ("source", "target"):
    df[text_column] = df[text_column].apply(clean_text)

# Evaluate every row once, then keep only the rows that passed
keep_mask = df.apply(filter_example, axis=1)
print(f"Before filter: {len(df)}")
df = df[keep_mask]
print(f"After filter: {len(df)}")


## 3. Train/Val/Test Split


In [None]:
def create_splits(df: pd.DataFrame, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    """Split data into train/val/test sets.

    Row counts are computed explicitly (fractions are floored) and then
    clamped so that every subset receives at least one row. Without the
    clamp, small frames (e.g. 3-5 rows with the default ratios) leave a
    single hold-out row that cannot be divided into val and test.

    Args:
        df: Frame to split; needs at least 3 rows.
        train_ratio: Fraction of rows for training.
        val_ratio: Fraction of rows for validation.
        test_ratio: Fraction of rows for testing.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        AssertionError: If the ratios do not sum to 1 (within 0.001).
        ValueError: If any ratio is not positive or ``df`` has fewer than 3 rows.
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 0.001, \
        "train_ratio + val_ratio + test_ratio must sum to 1"
    if min(train_ratio, val_ratio, test_ratio) <= 0:
        raise ValueError("train_ratio, val_ratio and test_ratio must all be > 0")

    n_total = len(df)
    if n_total < 3:
        raise ValueError(
            f"Need at least 3 rows to create train/val/test splits, got {n_total}"
        )

    # First split: train vs (val + test).
    # Leave at least two rows for the hold-out part, and at least one for train.
    n_train = max(1, min(int(n_total * train_ratio), n_total - 2))
    train_df, temp_df = train_test_split(df, train_size=n_train, random_state=seed)

    # Second split: val vs test, each getting at least one row
    relative_val = val_ratio / (val_ratio + test_ratio)
    n_temp = len(temp_df)
    n_val = max(1, min(int(n_temp * relative_val), n_temp - 1))
    val_df, test_df = train_test_split(temp_df, train_size=n_val, random_state=seed)

    return train_df, val_df, test_df

# Only split when there is at least one row available for each of the three subsets
if len(df) < 3:
    print("Not enough data for splitting. Add more examples.")
else:
    train_df, val_df, test_df = create_splits(df)
    print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


## 4. Export to JSONL


In [None]:
def _json_default(value):
    """Fallback for json.dumps: unwrap scalar wrappers that expose ``.item()``."""
    unwrap = getattr(value, "item", None)
    if callable(unwrap):
        return unwrap()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def export_jsonl(df: pd.DataFrame, path: Path):
    """Export dataframe to JSONL format (one JSON object per line, UTF-8).

    Args:
        df: Frame to export; each row becomes one JSON object keyed by column name.
        path: Destination file. Missing parent directories are created and an
            existing file is overwritten.

    Notes:
        - Records are built per column rather than via ``iterrows()``, which
          upcasts each row to a single dtype and would write integer columns
          as floats whenever a float column is present.
        - Float NaN values are written as ``null``; the bare token ``NaN``
          is not valid JSON.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        for record in df.to_dict(orient="records"):
            # value != value is true only for NaN
            record = {
                key: None if isinstance(value, float) and value != value else value
                for key, value in record.items()
            }
            f.write(json.dumps(record, ensure_ascii=False, default=_json_default))
            f.write('\n')
    print(f"Exported {len(df)} examples to {path}")

# Export (uncomment when you have real data)
# NOTE: train_df, val_df and test_df only exist if the split cell above had
# at least 3 rows and therefore called create_splits.
# export_jsonl(train_df, DATA_PROCESSED / "train.jsonl")
# export_jsonl(val_df, DATA_PROCESSED / "val.jsonl")
# export_jsonl(test_df, DATA_PROCESSED / "test.jsonl")

print("Ready to export. Uncomment the lines above when you have real data.")
