# 01 - Exploratory Data Analysis (EDA)

This notebook explores datasets for training KlarText simplification models.

## Goals
- Understand available datasets (German & English)
- Analyze text length distributions
- Check language quality and edge cases
- Identify potential issues before training

## Datasets to Explore
- DEplain (German plain language)
- Klexikon (German children's encyclopedia; simplified counterpart to German Wikipedia articles)
- Newsela (English multi-level news)
- WikiLarge (English Wikipedia simplification)


In [None]:
# Setup and imports
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path

# Optional: HuggingFace datasets
# from datasets import load_dataset

# Project layout: the root is the parent of the current working directory,
# and every data folder lives under <root>/data/.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW, DATA_PROCESSED, DATA_SAMPLES = (
    PROJECT_ROOT / "data" / subdir for subdir in ("raw", "processed", "samples")
)

# Report where we are and whether the main data folders are present.
print(f"Project root: {PROJECT_ROOT}")
print(
    "Data directories exist: "
    f"raw={DATA_RAW.exists()}, processed={DATA_PROCESSED.exists()}"
)


## 1. Load Datasets

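Load datasets from the Hugging Face Hub or from local files. Until the full datasets are downloaded, the cell below builds a small hand-written sample so the rest of the analysis can be demonstrated.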


In [None]:
# Template for pulling real corpora from the Hugging Face Hub.
# Uncomment and adapt once the `datasets` package is available.

# from datasets import load_dataset

# # German: DEplain corpus
# deplain = load_dataset("DEplain/DEplain-web", split="train")
# print(f"DEplain: {len(deplain)} examples")
# print(deplain[0])

# # English: WikiLarge
# # NOTE(review): "wiki_lingua" is a summarization corpus, not WikiLarge;
# # confirm the correct dataset id before enabling this line.
# wikilarge = load_dataset("wiki_lingua", "english", split="train")

# Placeholder data: one German and one English complex/simple pair, so the
# analysis cells below can run before any real dataset is downloaded.
sample_data = [
    {"source": complex_text, "target": simple_text, "level": level, "lang": lang}
    for complex_text, simple_text, level, lang in (
        (
            "Der Antragsteller muss die erforderlichen Unterlagen innerhalb der gesetzlich vorgeschriebenen Frist einreichen, andernfalls wird der Antrag als unvollständig abgelehnt.",
            "Sie müssen die Papiere rechtzeitig abgeben. Sonst wird Ihr Antrag abgelehnt.",
            "easy",
            "de",
        ),
        (
            "The implementation of the proposed regulatory framework necessitates comprehensive stakeholder engagement and iterative refinement of operational procedures.",
            "We need to talk to everyone involved. Then we can improve how things work step by step.",
            "easy",
            "en",
        ),
    )
]

# One row per pair; columns follow the key order of the records.
df = pd.DataFrame(data=sample_data)
print(f"Sample data: {df.shape[0]} examples")
df.head()


## 2. Text Length Analysis

Analyze the length of source and target texts.


In [None]:
def analyze_text_lengths(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with length and compression-ratio columns added.

    Args:
        df: Frame with string columns ``source`` and ``target``.

    Returns:
        A new frame (the input is left untouched) with these extra columns:
        ``source_chars`` / ``target_chars`` (character counts),
        ``source_words`` / ``target_words`` (whitespace-separated token
        counts) and ``compression_ratio`` (``target_chars / source_chars``).
        The ratio is NaN for rows whose source text is empty.
    """
    df = df.copy()
    df["source_chars"] = df["source"].str.len()
    df["target_chars"] = df["target"].str.len()
    df["source_words"] = df["source"].str.split().str.len()
    df["target_words"] = df["target"].str.split().str.len()

    # Dividing by a zero-length source would give +inf, which distorts
    # summary statistics such as mean(); mask those rows so they yield NaN.
    nonzero_source_chars = df["source_chars"].where(df["source_chars"] > 0)
    df["compression_ratio"] = df["target_chars"] / nonzero_source_chars
    return df

# Attach the derived length columns, then summarise them; the final
# expression is left bare so the notebook renders the table.
df = analyze_text_lengths(df)
print("Length statistics:")
df.loc[
    :,
    ["source_chars", "target_chars", "source_words", "target_words", "compression_ratio"],
].describe()


In [None]:
# Three side-by-side panels: length scatter, compression ratio per language,
# and overlaid word-count histograms.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))

# Panel 1: target length against source length. The dashed diagonal marks
# pairs where both texts have the same number of words.
axes[0].scatter(df["source_words"], df["target_words"], alpha=0.6)
axes[0].plot(
    [0, df["source_words"].max()],
    [0, df["source_words"].max()],
    "r--",
    label="Equal length",
)
axes[0].set(
    xlabel="Source words",
    ylabel="Target words",
    title="Source vs Target Length",
)
axes[0].legend()

# Panel 2: mean compression ratio per language (skipped for a single-row frame).
if df.shape[0] > 1:
    df.groupby("lang")["compression_ratio"].mean().plot.bar(ax=axes[1])
    axes[1].set(
        title="Avg Compression Ratio by Language",
        ylabel="Ratio (target/source)",
    )

# Panel 3: source and target word counts, drawn semi-transparent so both show.
axes[2].hist(df["source_words"], bins=20, alpha=0.5, label="Source")
axes[2].hist(df["target_words"], bins=20, alpha=0.5, label="Target")
axes[2].set(xlabel="Word count", title="Word Count Distribution")
axes[2].legend()

plt.tight_layout()
plt.show()


## 3. Vocabulary Analysis

Analyze word frequency and vocabulary differences between source and target.


In [None]:
def get_vocabulary_stats(texts: pd.Series, name: str = ""):
    """Print and return word-frequency statistics for a collection of texts.

    Tokens are lower-cased and split on whitespace only, so punctuation stays
    attached to the neighbouring word (e.g. ``"word."`` and ``"word"`` are
    counted separately).

    Args:
        texts: Iterable of strings, typically a DataFrame column. Entries
            that are not strings (such as NaN/None for missing rows) are
            skipped.
        name: Label used in the printed header.

    Returns:
        A ``Counter`` mapping each lower-cased token to its occurrence count.
    """
    word_freq = Counter()
    for text in texts:
        # Missing values arrive as NaN/None; joining them into one string
        # would raise TypeError, so count text by text and skip non-strings.
        if isinstance(text, str):
            word_freq.update(text.lower().split())

    total_words = sum(word_freq.values())

    print(f"\n{name} Vocabulary Stats:")
    print(f"  Total words: {total_words:,}")
    print(f"  Unique words: {len(word_freq):,}")
    print(f"  Top 10 words: {word_freq.most_common(10)}")

    return word_freq

source_vocab = get_vocabulary_stats(df["source"], "Source")
target_vocab = get_vocabulary_stats(df["target"], "Target")

# Words only in source (complex words that got simplified)
source_only = set(source_vocab) - set(target_vocab)

# Set iteration order is arbitrary (string hashing is randomized per process),
# so sort before slicing to keep the printed sample reproducible across runs.
print(f"\nWords in source but not target (simplified away): {sorted(source_only)[:20]}")


## 4. Quality Checks

Check for potential data quality issues.


In [None]:
def quality_checks(df: pd.DataFrame) -> dict:
    """Count common data-quality problems in a simplification dataset.

    Only the ``source`` and ``target`` columns are read; text lengths are
    computed here, so the frame does not need any precomputed length columns.
    Missing values are treated as empty strings.

    Args:
        df: Frame with text columns ``source`` and ``target``.

    Returns:
        A dict mapping check name to the number of affected rows (plain
        ``int``), with these keys: ``empty_source``, ``empty_target``,
        ``short_source``, ``short_target``, ``target_longer``,
        ``duplicate_source``, ``duplicate_pairs``, ``identical_pairs``.
    """
    # Normalise missing values to "" so they are counted by the emptiness and
    # length checks instead of being silently ignored.
    texts = df[["source", "target"]].fillna("")
    source, target = texts["source"], texts["target"]
    source_len, target_len = source.str.len(), target.str.len()

    issues = {
        # Empty (or whitespace-only) texts
        "empty_source": source.str.strip().eq("").sum(),
        "empty_target": target.str.strip().eq("").sum(),
        # Very short texts (< 10 chars)
        "short_source": (source_len < 10).sum(),
        "short_target": (target_len < 10).sum(),
        # Target longer than source (unusual for simplification)
        "target_longer": (target_len > source_len).sum(),
        # Duplicates (every repeat after the first occurrence is counted)
        "duplicate_source": source.duplicated().sum(),
        "duplicate_pairs": texts.duplicated().sum(),
        # Source == Target (no simplification)
        "identical_pairs": (source == target).sum(),
    }

    # pandas returns NumPy integers; convert to built-in ints so the result
    # can be serialised (e.g. with json) without extra handling.
    return {check: int(count) for check, count in issues.items()}

# Run the checks and print one line per check: a tick when nothing was
# found, a warning sign otherwise.
issues = quality_checks(df)
print("Quality Check Results:")
for check, count in issues.items():
    if count == 0:
        status = "✅"
    else:
        status = "⚠️"
    print(f"  {status} {check}: {count}")


## 5. Next Steps

Based on this EDA:
- [ ] Download full datasets (see `data/README.md`)
- [ ] Filter out low-quality examples
- [ ] Proceed to `02_data_prep.ipynb` for preprocessing
- [ ] Consider which simplification levels to support
