In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using, so the run starts clean.
torch.cuda.empty_cache()

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


We'll employ several popular Python packages to fine-tune the Whisper model.

In [None]:
!apt update -qq && apt install -y ffmpeg
!pip install --upgrade pip
!pip install --upgrade datasets==3.5.1 transformers accelerate evaluate jiwer tensorboard gradio trackio peft bitsandbytes soundfile librosa
!pip install optipfair  # For post-training pruning
!pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q accelerate -U

39 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Looking in indexes: https://download.pytorch.org/whl/cu121


Log in to the Hugging Face Hub so that model checkpoints can be uploaded directly while training.

In [None]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget; the training configuration later in the
# notebook sets push_to_hub=True, which needs an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Imports and setup

In [None]:
import gc
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Restrict the process to the first GPU (set before CUDA is initialised).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import evaluate
import gradio as gr
import jiwer
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from datasets import (
    Audio,
    DatasetDict,
    DownloadConfig,
    Features,
    Value,
    concatenate_datasets,
    load_dataset,
)
from huggingface_hub import hf_hub_download  # For pawa-min-alpha if bonus
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor,
    WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
)
print("Setup complete.")

Setup complete.


Load and Prepare Training/Validation Data (Common Voice 17.0 Swahili)
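Uses a custom schema to avoid a CastError and streams the data to keep memory usage low. The train and validation splits are combined for more training data (~80K samples), and the test split is used for validation. For quick experiments, subsample the training stream (e.g. to 20K samples); use the full stream for the final run.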

In [None]:
# 1. Define a schema that matches the *actual* Arrow types in CV-17
cv_sw_features = Features({
    "client_id": Value("string"),
    "path": Value("string"),
    "sentence_id": Value("string"),
    "sentence": Value("string"),
    "sentence_domain": Value("string"),
    "up_votes": Value("string"),  # <- string, not int64
    "down_votes": Value("string"),  # <- string, not int64
    "age": Value("string"),
    "gender": Value("string"),
    "variant": Value("string"),
    "locale": Value("string"),
    "segment": Value("string"),
    "accent": Value("string"),
    # keep audio decoded so we get "array" + "sampling_rate"
    "audio": Audio(sampling_rate=16_000, mono=True, decode=True),
})

# Metadata columns dropped right after loading; only the audio and transcript are needed downstream.
CV_UNUSED_COLUMNS = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]

# Download settings: always re-fetch and keep the cache under /tmp (low persistent storage).
# `DownloadConfig` has no `use_cache` field, so that keyword must not be passed.
download_config = DownloadConfig(force_download=True, cache_dir="/tmp/hf_cache/")


def stream_cv_split(split):
    """Stream one Common Voice 17.0 Swahili split with the custom schema.

    Args:
        split: Name of a single split ("train", "validation" or "test").

    Returns:
        The streamed split with the unused metadata columns removed.
    """
    stream = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        "sw",
        split=split,
        streaming=True,
        features=cv_sw_features,  # <- custom schema solves CastError
        download_config=download_config,
    )
    return stream.remove_columns(CV_UNUSED_COLUMNS)


# 2. Training stream = train + validation for more data. Streaming mode does not accept the
#    "train+validation" split expression, so each split is streamed and then concatenated.
cv_17 = concatenate_datasets([stream_cv_split("train"), stream_cv_split("validation")])
print(cv_17)

# Stream the validation split (CV test) with the same schema
cv_17_val = stream_cv_split("test")
print(cv_17_val)

# Combine into DatasetDict for consistency
common_voice = DatasetDict({
    "train": cv_17,
    "validation": cv_17_val
})

print(common_voice)  # Check overall structure

# 3. Grab the first row from train for verification
first_row = next(iter(common_voice["train"]))

# 4. Save the audio clip
audio_array = first_row["audio"]["array"]
sr = first_row["audio"]["sampling_rate"]
sf.write("first_row_audio.wav", audio_array, sr)
print("Saved:", first_row["sentence"])
print("‚Üí first_row_audio.wav | shape:", audio_array.shape, " sr:", sr)

In [None]:
# Import IPython's audio widget under an alias: importing it as plain `Audio` would rebind
# the name that the notebook also uses for the `datasets.Audio` feature type.
from IPython.display import Audio as IPythonAudio, display
display(IPythonAudio("first_row_audio.wav"))

In [None]:
# Checkpoint and decoding configuration shared by the preprocessing and model-loading cells.
model_name_or_path = "openai/whisper-tiny"  # Tiny for <4GB; swap to "base" if WER needs boost
language = "Swahili"
task = "transcribe"

# Load the three preprocessing objects from the same checkpoint; the tokenizer and the
# processor are both configured with the language and task defined above.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``, then stores the extracted ``"input_features"`` and the
    tokenized ``"labels"`` back on the same mapping, which is returned.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    # The extractor returns a batch of one; keep only that single entry.
    batch["input_features"] = extracted.input_features[0]

    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids
    return batch

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Load Processor and Prepare Dataset

Define Data Collator and Metrics

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into a padded batch of tensors.

    Each feature must provide ``"input_features"``. When the features also carry
    ``"labels"`` (training / evaluation) the labels are padded, padding positions are
    replaced by -100, and a leading BOS token shared by every row is stripped. When no
    feature carries ``"labels"`` (inference on untranscribed audio) only the padded
    ``"input_features"`` are returned.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)` / `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the given examples and return the batch mapping.

        Args:
            features: Examples with ``"input_features"`` and, optionally, ``"labels"``.

        Returns:
            A batch with ``"input_features"`` and, when labels were supplied, ``"labels"``.

        Raises:
            KeyError: If only some of the examples carry ``"labels"``.
        """
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Inference batches have no transcripts: return the audio features alone.
        if not any("labels" in feature for feature in features):
            return batch

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding positions with -100 (the same sentinel compute_metrics later maps back).
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Drop the leading BOS token when every row starts with it.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance handed to the trainer (and reused by the test DataLoader later on).
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Word-error-rate metric used by compute_metrics below.
metric = evaluate.load("wer")

def compute_metrics(eval_pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids`` token-id arrays.
            ``label_ids`` is modified in place: every -100 entry is overwritten with
            the tokenizer's pad token id.

    Returns:
        ``{"wer": <100 * word error rate>}``.
    """
    predicted_ids, reference_ids = eval_pred.predictions, eval_pred.label_ids

    # -100 marks masked label positions; swap in the pad id so the ids can be decoded.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    error_rate = metric.compute(predictions=predicted_text, references=reference_text)
    return {"wer": 100 * error_rate}

Downloading builder script: 0.00B [00:00, ?B/s]

Load Model with LoRA + 4-bit Quantization

In [None]:
from transformers import BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training

# 1. 4-bit quantization settings, passed through `BitsAndBytesConfig` (the supported way to
#    request 4-bit loading) with fp16 compute and double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# 2. Load the quantized base model.
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=quantization_config,
    device_map="auto",
    dtype=torch.float16
)
# The generation KV cache is not useful during training and conflicts with gradient checkpointing.
model.config.use_cache = False

# 3. Standard preparation for training on top of a k-bit base model: freezes the base weights,
#    upcasts the remaining half-precision parameters and enables gradient checkpointing.
#    Non-reentrant checkpointing lets gradients reach the adapters even though the frozen
#    encoder inputs do not require grad.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# 4. LoRA adapters on the attention projections.
#    No `task_type` is set on purpose: SEQ_2_SEQ_LM selects a PEFT wrapper whose forward()
#    passes `input_ids` to the base model, which Whisper (it takes `input_features`) rejects.
lora_config = LoraConfig(
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("Model loaded successfully!")

trainable params: 442,368 || all params: 38,203,008 || trainable%: 1.1579
Model loaded successfully!


Training Configuration and Run

In [None]:
# The training data is streamed and therefore has no length, so the Trainer needs an explicit
# step budget (`max_steps`) instead of `num_train_epochs`.
# Budget: about 3 passes over the ~80K combined train+validation clips.
PER_DEVICE_TRAIN_BATCH_SIZE = 2   # Small for <4GB
GRADIENT_ACCUMULATION_STEPS = 4   # Effective batch=8
APPROX_TRAIN_SAMPLES = 80_000
NUM_PASSES = 3                    # 3 passes for <13% WER
MAX_TRAIN_STEPS = NUM_PASSES * APPROX_TRAIN_SAMPLES // (
    PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./swahili-whisper-tiny-lora",
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=1e-4,
    warmup_steps=500,
    max_steps=MAX_TRAIN_STEPS,
    # Evaluate and save on the same step schedule: `load_best_model_at_end` requires the
    # two strategies to match, and "epoch" is not meaningful for a length-less stream.
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=100,
    fp16=True,  # Mixed precision
    # compute_metrics decodes token ids, so evaluation must generate instead of returning logits.
    predict_with_generate=True,
    report_to="trackio",  # Local logging
    run_name="swahili-asr-tiny-lora",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    dataloader_num_workers=0,  # Colab stable
    # Keep every column and name the label column explicitly (the model is PEFT-wrapped).
    remove_unused_columns=False,
    label_names=["labels"],
    hub_model_id="marcoharuni95/asr_starter",
    push_to_hub=True,
)


In [None]:
# Apply the feature/label preparation lazily to both streams. The raw columns are dropped
# afterwards because the collator only reads "input_features" and "labels".
RAW_COLUMNS = ["audio", "sentence"]
train_stream = common_voice["train"].map(prepare_dataset, remove_columns=RAW_COLUMNS)
eval_stream = common_voice["validation"].map(prepare_dataset, remove_columns=RAW_COLUMNS)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    # `common_voice` is the dict holding both splits; `cv_17` itself is only the training stream.
    train_dataset=train_stream,
    eval_dataset=eval_stream,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` is the current name of the former `tokenizer` argument.
    processing_class=processor.tokenizer,
)

In [None]:
# Store the processor files in the training output directory, next to the checkpoints.
processor.save_pretrained(training_args.output_dir)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import locale
# Monkey-patch the locale lookup so it always reports UTF-8, then call it to show the result.
# NOTE(review): presumably a workaround for Colab shell commands failing on a non-UTF-8
# locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
locale.getpreferredencoding()

In [None]:
# Baseline GPU memory figures (in GiB); `max_memory` and `start_reserved` are read again by
# the memory report at the end of the notebook.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_reserved = round(torch.cuda.max_memory_reserved() / 1024**3, 3)

# NOTE(review): the label below says "before Inference" but this cell runs before training.
print(f"üñ•Ô∏è GPU: {gpu_stats.name}")
print(f"üìä Max memory: {max_memory} GB")
print(f"üîπ Reserved before Inference: {start_reserved} GB")

In [None]:
# Run fine-tuning, then save the resulting model to a separate directory.
trainer.train()
trainer.save_model("./swahili-whisper-tiny-finetuned")
print("Training done.")

Pruning

In [None]:
# optipfair is installed by the dependency cell at the top of the notebook.
from optipfair import prune_model

# Prune 20% MLP neurons (minimal WER drop)
# NOTE(review): `model` here is the PEFT-wrapped, 4-bit Whisper model from training.
# Whisper's feed-forward blocks are, as far as I know, plain two-layer MLPs rather than gated
# (GLU) ones — confirm that optipfair's "MLP_GLU" mode supports this architecture before
# relying on this cell.
pruned_model, stats = prune_model(
    model=model,
    pruning_type="MLP_GLU",  # NOTE(review): assumes a GLU-style MLP — verify for Whisper
    neuron_selection_method="MAW",  # Magnitude-based
    pruning_percentage=20,
    show_progress=True,
    return_stats=True
)
print(f"Pruned: {stats['percentage_reduction']:.2f}% params reduced")
pruned_model.save_pretrained("./swahili-whisper-tiny-pruned")
# Re-load for inference if needed; improves RTF for edge
# NOTE(review): the inference cell below still calls `model`, not `pruned_model` — confirm
# which of the two is meant to produce the submission.

Load Zindi Test Dataset and Generate Submission

In [None]:
# Load test set (no transcripts; 4089 samples)
# The datasets feature type is imported under a private alias so this cell does not depend on
# whether the bare name `Audio` currently refers to `datasets.Audio` or `IPython.display.Audio`.
from datasets import Audio as HFAudio

# Whisper's decoder is limited to 448 positions *including* the start-of-transcript prompt
# tokens, so the budget for newly generated tokens must stay below that.
MAX_NEW_TOKENS = 440

test_ds = load_dataset("sartifyllc/Sartify_ITU_Zindi_Testdataset", split="test")
test_ds = test_ds.cast_column("audio", HFAudio(decode=True, sampling_rate=16000))  # Decode and resample


def prepare_test(batch):
    """Add "input_features" for one test example (same extraction as training, no labels)."""
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    return batch


def collate_test(features):
    """Pad the "input_features" of a list of unlabeled examples into one tensor batch."""
    return processor.feature_extractor.pad(
        [{"input_features": feature["input_features"]} for feature in features],
        return_tensors="pt",
    )


test_ds = test_ds.map(prepare_test, num_proc=1)
# Iterate only over the precomputed features so the audio is not decoded again per batch.
test_dataloader = torch.utils.data.DataLoader(
    test_ds.select_columns(["input_features"]),
    batch_size=4,  # Batch=4 safe <4GB
    collate_fn=collate_test,
)

# Inference
model.eval()
predictions = []
# Autocast reconciles the float32 input features with the half-precision model weights.
with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
    for batch in tqdm(test_dataloader):
        input_features = batch["input_features"].to("cuda")
        generated_ids = model.generate(
            input_features=input_features,  # keyword form works for every PEFT wrapper class
            language=language.lower(),      # decode in the language/task the model was tuned for
            task=task,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # No beam for speed
        )
        pred_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend(pred_str)

In [None]:
# Model configuration
model_name = "sartifyllc/pawa-min-alpha"
max_seq_length = 2048

# Load model and tokenizer
# NOTE: this rebinds the globals `model` and `tokenizer`, which until here referred to the
# Whisper model and tokenizer; cells above must be re-run before using those again.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,  # bfloat16 weights for efficiency
    device_map="auto",  # Automatically handle device placement
    trust_remote_code=True  # In case the model requires custom code
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def apply_chatml_template(messages):
    """Render a conversation as a ChatML prompt ending with an open assistant turn.

    Each message is a mapping whose speaker is read from ``"role"`` (falling back to
    ``"from"``) and whose text is read from ``"content"`` (falling back to ``"value"``).
    The speakers ``"human"`` and ``"gpt"`` are renamed to ``"user"`` and ``"assistant"``;
    any other speaker is emitted unchanged.
    """
    turns = []
    for entry in messages:
        speaker = entry["role"] if "role" in entry else entry.get("from", "")
        text = entry["content"] if "content" in entry else entry.get("value", "")

        # Normalise alternative speaker names
        if speaker in ("human", "user"):
            speaker = "user"
        elif speaker in ("gpt", "assistant"):
            speaker = "assistant"

        turns.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")

    # The trailing open assistant header cues the model to produce the reply
    turns.append("<|im_start|>assistant\n")
    return "".join(turns)

def generate_response(messages, max_new_tokens=64):
    """Sample a reply for a conversation with the globally loaded model.

    Args:
        messages: Either a list of message mappings or a single string, which is
            treated as one user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded output sequences (special tokens kept), as returned by
        ``tokenizer.batch_decode``.
    """
    # A bare string is shorthand for a single user turn
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]

    prompt = apply_chatml_template(messages)

    # Tokenize the prompt, truncated to the configured context length
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_seq_length,
        truncation=True,
        padding=True
    )

    # Place every input tensor on the model's device
    model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    with torch.no_grad():
        generated = model.generate(**model_inputs, **sampling_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=False)


# Example usage:
# FIXME(review): `asr_model` and `calculate_batch_wer` are not defined anywhere in the visible
# notebook, so this cell raises NameError as written. In a notebook `__name__` is "__main__",
# so the guarded body always runs. Define both names (or remove this cell) before running.
if __name__ == "__main__":
    # Method 1: Manual approach
    # Transcribe the clip saved by the data-loading cell.
    output = asr_model.transcribe(['./first_row_audio.wav'])
    message = output[0].text

    # Compare the transcript with the reference sentence (each pair is listed twice to form a batch).
    refs = [first_row["sentence"], first_row["sentence"]]
    hyps = [message, message]
    batch_wer = calculate_batch_wer(refs, hyps)
    print(f"Batch WER: {batch_wer:.3f}")

In [None]:
# Create submission CSV
# Read the id column directly: iterating over whole rows would decode every audio clip again.
record_ids = test_ds["record_id"]
if len(record_ids) != len(predictions):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(record_ids)} test clips; "
        "re-run the inference cell before building the submission."
    )

submission = pd.DataFrame({"filename": [f"{record_id}.wav" for record_id in record_ids], "text": predictions})
submission.to_csv("submission.csv", index=False)
print("Submission saved: submission.csv | Sample:")
print(submission.head())
# Expected WER: Compute on CV val; for Zindi, submit to leaderboard

In [None]:
# Peak reserved GPU memory (GiB) compared with the baseline cell's `start_reserved`,
# plus both figures as a percentage of the device total (`max_memory`).
end_reserved = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
delta_reserved = round(end_reserved - start_reserved, 3)
percent_used = round(end_reserved / max_memory * 100, 3)
percent_delta = round(delta_reserved / max_memory * 100, 3)

# NOTE(review): the peak covers everything run since the baseline cell (training included),
# not inference alone — confirm the "Inference" wording is intended.
print(f"\nüìà Peak reserved memory after Inference: {end_reserved} GB")
print(f"üìâ Additional memory used for Inference: {delta_reserved} GB")
print(f"üíØ Total memory used (%): {percent_used} %")
print(f"üß† Inference memory usage (%): {percent_delta} %")

def measure_rtf(audio_duration, inference_time):
    """Print and return the inverse real-time factor (RTFx) of a transcription run.

    RTFx is ``audio_duration / inference_time``: values above 1 mean the audio was
    processed faster than real time. (The plain real-time factor is the reciprocal.)

    Args:
        audio_duration: Length of the processed audio, in the same unit as ``inference_time``.
        inference_time: Time spent on inference.

    Returns:
        The RTFx value.

    Raises:
        ZeroDivisionError: If ``inference_time`` is zero.
    """
    rtfx = audio_duration / inference_time
    # The label names the quantity actually computed (the inverse factor).
    print(f"Inverse Real-Time Factor (RTFx): {rtfx:.3f}")
    return rtfx