# Whisper-Small Hindi Fine-tuning Pipeline

This notebook fine-tunes the Whisper-small model on Hindi ASR data and pushes it to Hugging Face Hub.

## Steps:
1. Load and preprocess the dataset
2. Setup Whisper model and feature extractor
3. Create data collator and training arguments
4. Fine-tune the model
5. Push to Hugging Face Hub

In [1]:
!pip install -q transformers datasets accelerate evaluate jiwer huggingface-hub soundfile librosa

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from pathlib import Path
import librosa  # <-- ADDED THIS
from datasets import Dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from huggingface_hub import HfApi, login
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

2025-11-01 19:02:29.781736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762023749.997164      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762023750.059468      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 1. Configuration and Setup

In [None]:
# Configuration
CONFIG = {
    'model_name': 'openai/whisper-small',
    'language': 'hindi',
    'language_code': 'hi',
    'task': 'transcribe',
    'data_dir': '/kaggle/input/whisper-hindhi/data', # Corrected this path
    'output_dir': '/kaggle/working/models/whisper-small-hindi',   
    'hf_repo': 'datafreak/whisper-hindi',
    
    # Training hyperparameters
    'learning_rate': 1e-5,
    'batch_size': 4,
    'gradient_accumulation_steps': 2,
    'num_epochs': 3,
    'warmup_steps': 500,
    'max_duration': 30.0,
    'eval_steps': 500,
    'save_steps': 500,
    'logging_steps': 100,
    
    # Data processing
    'sampling_rate': 16000,
    'train_split': 0.95,
    'max_label_length': 448,
}

# Get HuggingFace token
HF_TOKEN = os.environ.get("HF_TOKEN", "<HF_TOKEN_PLACEHOLDER>")
if HF_TOKEN and HF_TOKEN != "<HF_TOKEN_PLACEHOLDER>":
    login(token=HF_TOKEN)
    print("✓ Logged in to Hugging Face Hub")
else:
    print("⚠ HF_TOKEN not provided. Set the environment variable before pushing to Hub.")

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

✓ Logged in to Hugging Face Hub
Using device: cuda
GPU: Tesla T4


## 2. Load Dataset

In [4]:
import pandas as pd
from pathlib import Path
import json

def load_downloaded_data(data_dir):
    data_dir = Path(data_dir)
    base_dir = data_dir.parent 
    
    results_file = data_dir / 'download_results.csv'

    if not results_file.exists():
        print(f"Error: 'download_results.csv' not found at {results_file}")
        return []

    df = pd.read_csv(results_file)
    print(f"Found {len(df)} total rows in download_results.csv")

    df = df[(df['audio_success'] == True) & (df['transcription_success'] == True)]
    print(f"Found {len(df)} rows after filtering for success=True")
    
    if len(df) == 0:
        print("No successful rows to process. Stopping.")
        return []

    # --- THIS IS THE NEW LOGIC ---
    data = []
    for index, row in df.iterrows():
        csv_audio_path = str(row['audio_path']).replace('\\', '/')
        audio_path = base_dir / csv_audio_path
        csv_trans_path = str(row['transcription_path']).replace('\\', '/')
        trans_path = base_dir / csv_trans_path

        if not audio_path.exists() or not trans_path.exists():
            print(f"--- SKIPPING ROW {index}: File not found ---")
            continue

        try:
            # Load the transcription file
            with open(trans_path, 'r', encoding='utf-8') as f:
                raw_transcription = f.read().strip()
            
            # Parse the JSON (assuming it's a list of segments as you showed)
            segments = json.loads(raw_transcription)
            
            # Handle cases where it's a dict like {'segments': [...]}
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']
            
            # Ensure segments is a list
            if not isinstance(segments, list):
                print(f"--- SKIPPING ROW {index}: Tx format not a list of segments ---")
                continue

            # NOW, create one entry FOR EACH SEGMENT
            for seg in segments:
                if 'text' in seg and 'start' in seg and 'end' in seg:
                    text = seg['text'].strip()
                    start_time = float(seg['start'])
                    end_time = float(seg['end'])
                    
                    # Ignore empty segments
                    if not text or text == "REDACTED":
                        continue
                        
                    data.append({
                        'audio_path': str(audio_path), # Path to the full audio
                        'start_time': start_time,      # Start of this chunk
                        'end_time': end_time,        # End of this chunk
                        'transcription': text          # Text for *only* this chunk
                    })

        except Exception as e:
            print(f"--- SKIPPING ROW {index}: Error parsing {trans_path}: {e} ---")
    # --- END OF NEW LOGIC ---

    print(f"\nLoaded {len(data)} individual segments (chunks) from {len(df)} files")
    return data

In [5]:
data = load_downloaded_data(CONFIG['data_dir'])

Found 104 total rows in download_results.csv
Found 104 rows after filtering for success=True

Loaded 5732 individual segments (chunks) from 104 files


In [6]:
df = pd.DataFrame(data)

# Check if data loading failed
if len(df) == 0:
    raise ValueError("Data loading resulted in 0 samples. Cannot proceed.")

train_size = int(len(df) * CONFIG['train_split'])
train_df = df[:train_size]
val_df = df[train_size:]

# Create dataset from pandas WITHOUT casting the audio column
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

print(f"Train chunks: {len(train_dataset)}, Val chunks: {len(val_dataset)}")

Train chunks: 5445, Val chunks: 287


## 3. Setup Model Components

In [7]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(CONFIG['model_name'])
tokenizer = WhisperTokenizer.from_pretrained(CONFIG['model_name'], language='hindi', task='transcribe')
processor = WhisperProcessor.from_pretrained(CONFIG['model_name'], language='hindi', task='transcribe')

MAX_DURATION_SEC = CONFIG['max_duration']
MAX_LABEL_LENGTH = CONFIG['max_label_length']
SAMPLING_RATE = CONFIG['sampling_rate']

def prepare_dataset(batch):
    try:
        # Load the *entire* audio file
        # We use 'offset' and 'duration' to load only the part we need
        start_time = batch['start_time']
        end_time = batch['end_time']
        duration = end_time - start_time
        
        # --- FILTERING ---
        # 1. Check audio duration from timestamps
        if duration > MAX_DURATION_SEC:
            print(f"SKIPPING: Segment duration {duration}s > {MAX_DURATION_SEC}s")
            return {"input_features": None, "labels": None}

        # Load only the required audio chunk
        audio_array, sampling_rate = librosa.load(
            batch["audio_path"], 
            sr=SAMPLING_RATE, 
            offset=start_time, 
            duration=duration
        )
        
        # 2. Check text length
        tokenized_labels = tokenizer(batch["transcription"]).input_ids
        if len(tokenized_labels) > MAX_LABEL_LENGTH:
            print(f"SKIPPING: Text length {len(tokenized_labels)} > {MAX_LABEL_LENGTH}")
            return {"input_features": None, "labels": None}
        # --- END OF FILTERING ---

        # If both are fine, process the audio
        batch["input_features"] = feature_extractor(audio_array, sampling_rate=SAMPLING_RATE).input_features[0]
        batch["labels"] = tokenized_labels
        
    except Exception as e:
        print(f"Error processing segment from {batch['audio_path']} ({batch['start_time']}s): {e}")
        batch["input_features"] = None
        batch["labels"] = None
        
    return batch

print("Mapping train dataset...")
train_dataset = train_dataset.map(
    prepare_dataset, 
    remove_columns=train_dataset.column_names
)
print("Mapping validation dataset...")
val_dataset = val_dataset.map(
    prepare_dataset, 
    remove_columns=val_dataset.column_names
)

# Filter out all the samples that were marked for skipping
train_dataset = train_dataset.filter(lambda example: example["input_features"] is not None)
val_dataset = val_dataset.filter(lambda example: example["input_features"] is not None)

print("--- Dataset after mapping and filtering ---")
print(f"Train samples remaining: {len(train_dataset)}")
print(f"Validation samples remaining: {len(val_dataset)}")
print(train_dataset)
print(val_dataset)

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Mapping train dataset...


Map:   0%|          | 0/5445 [00:00<?, ? examples/s]

Mapping validation dataset...


Map:   0%|          | 0/287 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5445 [00:00<?, ? examples/s]

Filter:   0%|          | 0/287 [00:00<?, ? examples/s]

--- Dataset after mapping and filtering ---
Train samples remaining: 5445
Validation samples remaining: 287
Dataset({
    features: ['input_features', 'labels'],
    num_rows: 5445
})
Dataset({
    features: ['input_features', 'labels'],
    num_rows: 287
})


## 4. Data Collator & Metrics

In [8]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=tokenizer.bos_token_id,
)

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

Downloading builder script: 0.00B [00:00, ?B/s]

## 5. Initialize Model & Trainer

In [9]:
model = WhisperForConditionalGeneration.from_pretrained(CONFIG['model_name'])
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False
model.generation_config.language = 'hi'
model.generation_config.task = 'transcribe'

training_args = Seq2SeqTrainingArguments(
    output_dir=CONFIG['output_dir'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    learning_rate=CONFIG['learning_rate'],
    warmup_steps=CONFIG['warmup_steps'],
    num_train_epochs=CONFIG['num_epochs'],
    
    # RENAMED from evaluation_strategy to eval_strategy
    eval_strategy="steps", 
    eval_steps=CONFIG['eval_steps'],
    
    # Explicitly set save_strategy to match
    save_strategy="steps",
    save_steps=CONFIG['save_steps'],
    
    logging_steps=CONFIG['logging_steps'],
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    report_to=["tensorboard"],
    predict_with_generate=True,
    generation_max_length=448,
    remove_unused_columns=False,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

## 6. Train

In [10]:
print("Starting training...")
train_result = trainer.train()
print(f"Training complete! Loss: {train_result.metrics['train_loss']:.4f}")

Starting training...


Step,Training Loss,Validation Loss,Wer
500,0.3384,0.382823,43.531094
1000,0.202,0.323355,37.080057


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Training complete! Loss: 0.4620


## 7. Evaluate

In [11]:
eval_results = trainer.evaluate()
print(f"Validation WER: {eval_results['eval_wer']:.2f}%")

Validation WER: 37.08%


## 8. Save & Push to Hub

In [None]:
trainer.save_model(CONFIG['output_dir'])
processor.save_pretrained(CONFIG['output_dir'])

if HF_TOKEN and HF_TOKEN != "<HF_TOKEN_PLACEHOLDER>":
    print(f"Pushing to {CONFIG['hf_repo']}...")
    model.push_to_hub(CONFIG['hf_repo'], token=HF_TOKEN)
    processor.push_to_hub(CONFIG['hf_repo'], token=HF_TOKEN)
    print(f"✓ Model pushed to https://huggingface.co/{CONFIG['hf_repo']}")
else:
    print("⚠ HF_TOKEN not set. Skipping push to Hub.")

Pushing to datafreak/whisper-hindi...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

✓ Model pushed to https://huggingface.co/datafreak/whisper-hindi
