# Model Evaluation Pipeline

This notebook evaluates both the baseline Whisper-small model and the fine-tuned model on the Hindi FLEURS test dataset.

## Steps:
1. Load FLEURS Hindi test dataset
2. Evaluate baseline Whisper-small
3. Load fine-tuned model from Hugging Face
4. Evaluate fine-tuned model
5. Compare results and generate report

In [1]:
!pip install -q "datasets==2.16.0" "huggingface-hub==0.20.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pathos 0.3.2 requires dill>=0.3.8, but you have dill 0.3.7 which is incompatible.
pathos 0.3.2 r

In [2]:
!pip install -U -q transformers  accelerate evaluate jiwer  soundfile librosa

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0

In [3]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import load_dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    pipeline
)
import evaluate
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')

load_dotenv()
torch.manual_seed(42)

2025-11-02 07:14:13.185692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762067653.384879      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762067653.441085      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<torch._C.Generator at 0x7cccb653b2d0>

## 1. Configuration

In [None]:
CONFIG = {
    'baseline_model': 'openai/whisper-small',
    'finetuned_model': 'datafreak/whisper-hindi',
    'language': 'hi',
    'batch_size': 4,
    'results_dir': '../results',
}

# Login to Hugging Face
HF_TOKEN = os.environ.get("HF_TOKEN", "<HF_TOKEN_PLACEHOLDER>")
if HF_TOKEN and HF_TOKEN != "<HF_TOKEN_PLACEHOLDER>":
    login(token=HF_TOKEN)
    print("✓ Logged in to Hugging Face")
else:
    print("⚠ HF_TOKEN not provided. Set the environment variable before publishing artifacts.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create results directory
os.makedirs(CONFIG['results_dir'], exist_ok=True)

✓ Logged in to Hugging Face
Using device: cuda


## 2. Load FLEURS Hindi Test Dataset

In [5]:
from datasets import load_dataset, Dataset

print("Loading FLEURS Hindi dataset...")

# Stream and collect test data
fleurs_asr = load_dataset("google/fleurs", "hi_in", streaming=True)

print("Converting test split to dataset...")
test_samples = list(fleurs_asr["test"])
fleurs_dataset = Dataset.from_list(test_samples)

print(f"✓ Loaded {len(fleurs_dataset)} test samples")
print(f"\nDataset features: {fleurs_dataset.features}")

# Now use it like normal
audio_input = fleurs_dataset[0]["audio"]
transcription = fleurs_dataset[0]["transcription"]
print(f"\nFirst transcription: {transcription}")

Loading FLEURS Hindi dataset...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Converting test split to dataset...
✓ Loaded 418 test samples

Dataset features: {'id': Value(dtype='int64', id=None), 'num_samples': Value(dtype='int64', id=None), 'path': Value(dtype='null', id=None), 'audio': {'array': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'path': Value(dtype='string', id=None), 'sampling_rate': Value(dtype='int64', id=None)}, 'transcription': Value(dtype='string', id=None), 'raw_transcription': Value(dtype='string', id=None), 'gender': Value(dtype='int64', id=None), 'lang_id': Value(dtype='int64', id=None), 'language': Value(dtype='string', id=None), 'lang_group_id': Value(dtype='int64', id=None)}

First transcription: कुछ अणुओं में अस्थिर केंद्रक होता है जिसका मतलब यह है कि उनमें थोड़े या बिना किसी झटके से टूटने की प्रवृत्ति होती है


## 3. Evaluation Function

In [6]:
def evaluate_model(model_name, dataset, batch_size=16):
    """
    Evaluate a Whisper model on the given dataset.
    """
    print(f"\nEvaluating model: {model_name}")
    print("="*60)
    
    # Load model and processor
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model.to(device)
    model.eval()
    
    # Configure generation
    model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
        language="hindi", 
        task="transcribe"
    )
    
    # Prepare predictions and references
    predictions = []
    references = []
    
    # Whisper expects 30 seconds of audio at 16kHz = 480,000 samples
    MAX_AUDIO_LENGTH = 30 * 16000  # 30 seconds at 16kHz sampling rate
    
    # Process in batches
    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluating"):
            batch = dataset[i:i+batch_size]
            
            # Extract audio
            audio_arrays = [item['array'] for item in batch['audio']]
            
            # Process audio with correct max_length for audio samples
            inputs = processor(
                audio_arrays,
                sampling_rate=16000,
                return_tensors="pt",
                padding="max_length",
                max_length=MAX_AUDIO_LENGTH,  # This is in audio samples, not mel frames
                truncation=True
            )
            inputs = inputs.to(device)
            
            # Generate transcriptions
            generated_ids = model.generate(inputs.input_features)
            transcriptions = processor.batch_decode(
                generated_ids, 
                skip_special_tokens=True
            )
            
            predictions.extend(transcriptions)
            references.extend(batch['transcription'])
    
    # Calculate WER
    wer_metric = evaluate.load("wer")
    wer = 100 * wer_metric.compute(predictions=predictions, references=references)
    
    print(f"\n✓ Evaluation complete")
    print(f"  WER: {wer:.2f}%")
    print(f"  Total samples: {len(predictions)}")
    
    return {
        'model_name': model_name,
        'wer': wer,
        'predictions': predictions,
        'references': references,
        'num_samples': len(predictions)
    }

## 4. Evaluate Baseline Model

In [7]:
baseline_results = evaluate_model(
    CONFIG['baseline_model'],
    fleurs_dataset,
    batch_size=CONFIG['batch_size']
)


Evaluating model: openai/whisper-small


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

Evaluating:   0%|          | 0/105 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Evaluating: 100%|██████████| 105/105 [09:00<00:00,  5.15s/it]


Downloading builder script: 0.00B [00:00, ?B/s]


✓ Evaluation complete
  WER: 84.64%
  Total samples: 418


## 5. Evaluate Fine-tuned Model

In [8]:
finetuned_results = evaluate_model(
    CONFIG['finetuned_model'],
    fleurs_dataset,
    batch_size=CONFIG['batch_size']
)


Evaluating model: datafreak/whisper-hindi


preprocessor_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

Evaluating:   0%|          | 0/105 [00:00<?, ?it/s]`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generat


✓ Evaluation complete
  WER: 38.20%
  Total samples: 418


## 6. Compare Results

In [9]:
# Create comparison table
comparison_df = pd.DataFrame([
    {
        'Model': 'Baseline (whisper-small)',
        'WER (%)': f"{baseline_results['wer']:.2f}",
        'Test Samples': baseline_results['num_samples']
    },
    {
        'Model': 'Fine-tuned (whisper-hindi)',
        'WER (%)': f"{finetuned_results['wer']:.2f}",
        'Test Samples': finetuned_results['num_samples']
    }
])

# Calculate improvement
wer_improvement = baseline_results['wer'] - finetuned_results['wer']
relative_improvement = (wer_improvement / baseline_results['wer']) * 100

print("\n" + "="*60)
print("EVALUATION RESULTS - FLEURS Hindi Test Set")
print("="*60)
print("\n" + comparison_df.to_string(index=False))
print("\n" + "="*60)
print(f"WER Improvement: {wer_improvement:.2f} percentage points")
print(f"Relative Improvement: {relative_improvement:.2f}%")
print("="*60)


EVALUATION RESULTS - FLEURS Hindi Test Set

                     Model WER (%)  Test Samples
  Baseline (whisper-small)   84.64           418
Fine-tuned (whisper-hindi)   38.20           418

WER Improvement: 46.45 percentage points
Relative Improvement: 54.87%


## 7. Save Results

In [10]:
# Save comparison table
comparison_df.to_csv(f"{CONFIG['results_dir']}/wer_comparison.csv", index=False)
print(f"✓ Comparison table saved to: {CONFIG['results_dir']}/wer_comparison.csv")

# Save detailed results
detailed_results = pd.DataFrame({
    'reference': baseline_results['references'],
    'baseline_prediction': baseline_results['predictions'],
    'finetuned_prediction': finetuned_results['predictions']
})
detailed_results.to_csv(f"{CONFIG['results_dir']}/detailed_predictions.csv", index=False)
print(f"✓ Detailed predictions saved to: {CONFIG['results_dir']}/detailed_predictions.csv")

✓ Comparison table saved to: ../results/wer_comparison.csv
✓ Detailed predictions saved to: ../results/detailed_predictions.csv


In [None]:
from huggingface_hub import HfApi, HfFolder
from pathlib import Path

api = HfApi()
token = HF_TOKEN if HF_TOKEN and HF_TOKEN != "<HF_TOKEN_PLACEHOLDER>" else HfFolder.get_token()
if not token:
    raise RuntimeError("No Hugging Face token available. Run the earlier login cell with a valid token.")

user_info = api.whoami(token)
username = user_info.get("name") or user_info.get("displayName")
if not username:
    raise RuntimeError("Unable to determine Hugging Face username from the configured token.")

repo_name = "whisper-hindi-eval-results"
repo_id = f"{username}/{repo_name}"
repo_type = "dataset"

api.create_repo(
    repo_id=repo_id,
    repo_type=repo_type,
    token=token,
    private=False,
    exist_ok=True
)

files_to_upload = {
    Path("../results/wer_comparison.csv"): "results/wer_comparison.csv",
    Path("../results/detailed_predictions.csv"): "results/detailed_predictions.csv",
}

for local_path, repo_path in files_to_upload.items():
    if not local_path.exists():
        raise FileNotFoundError(f"Missing artifact: {local_path}")
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=repo_path,
        repo_id=repo_id,
        repo_type=repo_type,
        token=token
    )
    print(f"✓ Uploaded {local_path} to hf://{repo_id}/{repo_path}")

print(f"All artifacts available at: https://huggingface.co/datasets/{repo_id}")

✓ Uploaded ../results/wer_comparison.csv to hf://datafreak/whisper-hindi-eval-results/results/wer_comparison.csv
✓ Uploaded ../results/detailed_predictions.csv to hf://datafreak/whisper-hindi-eval-results/results/detailed_predictions.csv
All artifacts available at: https://huggingface.co/datasets/datafreak/whisper-hindi-eval-results


## 8. Sample Predictions Analysis

In [23]:
print("\n" + "="*60)
print("SAMPLE PREDICTIONS")
print("="*60)

# Show 5 random samples
sample_indices = np.random.choice(len(detailed_results), 5, replace=False)

for idx in sample_indices:
    print(f"\nSample {idx + 1}:")
    print("-" * 60)
    print(f"Reference:\n  {detailed_results.iloc[idx]['reference']}")
    print(f"\nBaseline:\n  {detailed_results.iloc[idx]['baseline_prediction']}")
    print(f"\nFine-tuned:\n  {detailed_results.iloc[idx]['finetuned_prediction']}")
    print("-" * 60)


SAMPLE PREDICTIONS

Sample 270:
------------------------------------------------------------
Reference:
  तूफान और बवंडर की तरह आंधी ओले भारी बारिश और जंगल की आग तीव्र मौसम का हिस्सा और असर हैं

Baseline:
   तूफान और भवन्दर की तड़ा आन्दि ओले भारी भारिश और जंगल की आख तीव्र मोसम का हिस्था और असर है

Fine-tuned:
  तुफान और भवंदर की तरह आंधी ओले भारी भारिश और जंगल की आग तीव्र मौसम का हिस्सा और असर है
------------------------------------------------------------

Sample 396:
------------------------------------------------------------
Reference:
  उदाहरण के लिए दुनिया में सबसे ज़्यादा इस्तेमाल होने वाला फ़ोटोग्राफ़ी फ़ॉर्मेट 35mm है यह एनालॉग फिल्म दौर के आखिर में काफी प्रचलित था

Baseline:
   उदाहरन के लिए तुन्यमें सब सी जाडा इस्तमाल होनी वाला फोटोग्राफी फोरमेट पैटिस मेंव है यहा एना लोग फिर्मे दोर के आखिर में काफी प्रचलीते ते ता

Fine-tuned:
  उदाहरन के लिए तूनिया में सबसी ज्यादा इस्तमाल होनी वाला फोटोग्राफी फॉर्मेट पैंतिस एमम है यहां एना लोग फिर में दौड़ के आखीर में काफी प्रचलीते था
-----

## 9. Generate Final Report

In [24]:
report = f"""
# Whisper-Small Hindi ASR Evaluation Report

## Dataset
- **Test Set**: FLEURS Hindi (google/fleurs, hi_in split)
- **Number of Samples**: {baseline_results['num_samples']}

## Models Evaluated
1. **Baseline**: {CONFIG['baseline_model']}
2. **Fine-tuned**: {CONFIG['finetuned_model']}

## Results

| Model | WER (%) |
|-------|--------|
| Baseline (whisper-small) | {baseline_results['wer']:.2f} |
| Fine-tuned (whisper-hindi) | {finetuned_results['wer']:.2f} |

## Performance Improvement
- **Absolute WER Reduction**: {wer_improvement:.2f} percentage points
- **Relative Improvement**: {relative_improvement:.2f}%

## Conclusion
The fine-tuned model {'improved' if wer_improvement > 0 else 'did not improve'} over the baseline model on the FLEURS Hindi test set.

---
Generated using evaluation_pipeline.ipynb
"""

with open(f"{CONFIG['results_dir']}/evaluation_report.md", 'w', encoding='utf-8') as f:
    f.write(report)

print("\n✓ Final report saved to:", f"{CONFIG['results_dir']}/evaluation_report.md")
print("\n" + report)


✓ Final report saved to: ../results/evaluation_report.md


# Whisper-Small Hindi ASR Evaluation Report

## Dataset
- **Test Set**: FLEURS Hindi (google/fleurs, hi_in split)
- **Number of Samples**: 418

## Models Evaluated
1. **Baseline**: openai/whisper-small
2. **Fine-tuned**: datafreak/whisper-hindi

## Results

| Model | WER (%) |
|-------|--------|
| Baseline (whisper-small) | 84.64 |
| Fine-tuned (whisper-hindi) | 38.20 |

## Performance Improvement
- **Absolute WER Reduction**: 46.45 percentage points
- **Relative Improvement**: 54.87%

## Conclusion
The fine-tuned model improved over the baseline model on the FLEURS Hindi test set.

---
Generated using evaluation_pipeline.ipynb



In [13]:
from huggingface_hub import hf_hub_download
import pandas as pd
import shutil

download_cache = Path("../artifacts/hf_downloads")
download_cache.mkdir(parents=True, exist_ok=True)

hf_comparison_path = hf_hub_download(
    repo_id=repo_id,
    repo_type=repo_type,
    filename="results/wer_comparison.csv",
    token=token,
    local_dir=download_cache
 )
hf_detailed_path = hf_hub_download(
    repo_id=repo_id,
    repo_type=repo_type,
    filename="results/detailed_predictions.csv",
    token=token,
    local_dir=download_cache
 )

hf_comparison_df = pd.read_csv(hf_comparison_path)
hf_detailed_results = pd.read_csv(hf_detailed_path)

print("Loaded artifacts from Hugging Face dataset repo:")
print(f"- {hf_comparison_path}")
print(f"- {hf_detailed_path}")

# Optional: clean up HF cache directory to avoid stale files on reruns
shutil.rmtree(download_cache, ignore_errors=True)

wer_comparison.csv:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

detailed_predictions.csv: 0.00B [00:00, ?B/s]

Loaded artifacts from Hugging Face dataset repo:
- ../artifacts/hf_downloads/results/wer_comparison.csv
- ../artifacts/hf_downloads/results/detailed_predictions.csv


In [14]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from jiwer import wer as jiwer_wer

plt.style.use("ggplot")

graphs_dir = Path("../graphs")
graphs_dir.mkdir(parents=True, exist_ok=True)

if 'hf_detailed_results' not in globals() or 'hf_comparison_df' not in globals():
    raise RuntimeError("Run the Hugging Face download cell before generating plots.")

analysis_df = hf_detailed_results.copy()
analysis_df["reference"] = analysis_df["reference"].fillna("")
analysis_df["baseline_prediction"] = analysis_df["baseline_prediction"].fillna("")
analysis_df["finetuned_prediction"] = analysis_df["finetuned_prediction"].fillna("")

analysis_df["reference_word_count"] = (
    analysis_df["reference"].str.split().str.len().fillna(0).astype(int)
)

def safe_wer(reference: str, hypothesis: str) -> float:
    reference = reference or ""
    hypothesis = hypothesis or ""
    if not reference.strip():
        return np.nan
    return float(jiwer_wer(reference, hypothesis))

analysis_df["baseline_wer"] = analysis_df.apply(
    lambda row: safe_wer(row["reference"], row["baseline_prediction"]), axis=1
)
analysis_df["finetuned_wer"] = analysis_df.apply(
    lambda row: safe_wer(row["reference"], row["finetuned_prediction"]), axis=1
)
analysis_df["wer_improvement"] = analysis_df["baseline_wer"] - analysis_df["finetuned_wer"]

comparison_df = hf_comparison_df.copy()
comparison_df["WER (%)"] = pd.to_numeric(comparison_df["WER (%)"], errors="coerce")
baseline_row = comparison_df[comparison_df["Model"].str.contains("Baseline", case=False, na=False)]
finetuned_row = comparison_df[comparison_df["Model"].str.contains("Fine", case=False, na=False)]

if baseline_row.empty or finetuned_row.empty:
    raise RuntimeError("Could not locate baseline/fine-tuned rows in the comparison table.")
baseline_wer = float(baseline_row.iloc[0]["WER (%)"])
finetuned_wer = float(finetuned_row.iloc[0]["WER (%)"])

saved_plots: list[str] = []

# 1. Overall WER comparison
fig, ax = plt.subplots(figsize=(6, 4))
models = ["Baseline", "Fine-tuned"]
wer_values = [baseline_wer, finetuned_wer]
bars = ax.bar(models, wer_values, color=["#d62728", "#2ca02c"])
ax.set_ylabel("WER (%)")
ax.set_title("Word Error Rate Comparison (Hub Data)")
for bar, value in zip(bars, wer_values):
    ax.text(bar.get_x() + bar.get_width() / 2, value + 0.5, f"{value:.2f}", ha="center", va="bottom")
fig.tight_layout()
fig.savefig(graphs_dir / "wer_comparison.png", dpi=200)
plt.close(fig)
saved_plots.append("wer_comparison.png")

# 2. Per-sample WER distribution
valid_mask = analysis_df[["baseline_wer", "finetuned_wer"]].notna().all(axis=1)
if valid_mask.any():
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(
        analysis_df.loc[valid_mask, "baseline_wer"] * 100,
        bins=30,
        alpha=0.6,
        label="Baseline",
        color="#ff7f0e"
    )
    ax.hist(
        analysis_df.loc[valid_mask, "finetuned_wer"] * 100,
        bins=30,
        alpha=0.6,
        label="Fine-tuned",
        color="#1f77b4"
    )
    ax.set_xlabel("Per-sample WER (%)")
    ax.set_ylabel("Number of Utterances")
    ax.set_title("Distribution of Per-sample WER (Hub Data)")
    ax.legend()
    fig.tight_layout()
    fig.savefig(graphs_dir / "wer_distribution.png", dpi=200)
    plt.close(fig)
    saved_plots.append("wer_distribution.png")

# 3. Top improvements
if analysis_df["wer_improvement"].notna().any():
    top_improvements = analysis_df.sort_values("wer_improvement", ascending=False).head(20)
    fig, ax = plt.subplots(figsize=(10, 6))
    improvement_percent = top_improvements["wer_improvement"] * 100
    bars = ax.barh(range(len(top_improvements)), improvement_percent, color="#2ca02c")
    ax.set_yticks(range(len(top_improvements)))
    ax.set_yticklabels([f"Sample {idx}" for idx in top_improvements.index])
    ax.set_xlabel("WER Improvement (percentage points)")
    ax.set_title("Top 20 Utterances with Largest WER Gains (Hub Data)")
    ax.invert_yaxis()
    for bar, value in zip(bars, improvement_percent):
        ax.text(value + 0.2, bar.get_y() + bar.get_height() / 2, f"{value:.2f}", va="center")
    fig.tight_layout()
    fig.savefig(graphs_dir / "top_wer_improvements.png", dpi=200)
    plt.close(fig)
    saved_plots.append("top_wer_improvements.png")

# 4. Reference length vs improvement
if analysis_df["reference_word_count"].notna().any() and analysis_df["wer_improvement"].notna().any():
    fig, ax = plt.subplots(figsize=(8, 5))
    scatter = ax.scatter(
        analysis_df["reference_word_count"],
        analysis_df["wer_improvement"] * 100,
        c=analysis_df["wer_improvement"] * 100,
        cmap="RdYlGn",
        alpha=0.7
    )
    ax.set_xlabel("Reference Word Count")
    ax.set_ylabel("WER Improvement (percentage points)")
    ax.set_title("Effect of Utterance Length on WER Improvement (Hub Data)")
    fig.colorbar(scatter, ax=ax, label="WER Improvement (pp)")
    fig.tight_layout()
    fig.savefig(graphs_dir / "length_vs_improvement.png", dpi=200)
    plt.close(fig)
    saved_plots.append("length_vs_improvement.png")

print("Saved evaluation graphs based on Hugging Face data:")
for name in saved_plots:
    print(f"- {name}")
if not saved_plots:
    print("(No plots generated — check input data)")

Saved evaluation graphs based on Hugging Face data:
- wer_comparison.png
- wer_distribution.png
- top_wer_improvements.png
- length_vs_improvement.png


In [15]:
from pathlib import Path

graphs_dir = Path("../graphs")
if not graphs_dir.exists():
    raise FileNotFoundError("Graphs directory not found. Run the visualization cell before uploading charts.")

image_paths = sorted(graphs_dir.glob("*.png"))
if not image_paths:
    raise FileNotFoundError("No PNG files detected in ../graphs. Generate the plots first.")

for image_path in image_paths:
    api.upload_file(
        path_or_fileobj=image_path,
        path_in_repo=f"graphs/{image_path.name}",
        repo_id=repo_id,
        repo_type=repo_type,
        token=token
    )
    print(f"✓ Uploaded {image_path} to hf://{repo_id}/graphs/{image_path.name}")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded ../graphs/length_vs_improvement.png to hf://datafreak/whisper-hindi-eval-results/graphs/length_vs_improvement.png


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded ../graphs/top_wer_improvements.png to hf://datafreak/whisper-hindi-eval-results/graphs/top_wer_improvements.png


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded ../graphs/wer_comparison.png to hf://datafreak/whisper-hindi-eval-results/graphs/wer_comparison.png


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded ../graphs/wer_distribution.png to hf://datafreak/whisper-hindi-eval-results/graphs/wer_distribution.png


## Summary

This notebook has:
1. ✓ Loaded the FLEURS Hindi test dataset
2. ✓ Evaluated the baseline Whisper-small model
3. ✓ Evaluated the fine-tuned model from Hugging Face
4. ✓ Compared the results
5. ✓ Generated a detailed report

All results have been saved to the `results/` directory.