# Nigerian Accented English ASR - Main Notebook

This notebook handles:
1. Setup and Authentication
2. Downloading YouTube videos as audio
3. Loading the NCAIR1/NigerianAccentedEnglish model
4. Testing the model
5. Quantization and ONNX Conversion

In [None]:
# Install dependencies
!pip install -q yt-dlp torch torchaudio transformers librosa optimum onnx onnxruntime accelerate

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; the model loaded below is gated, so
# downloads fail without a valid token.
# NOTE(review): new_session=False is presumably meant to reuse a token already
# saved on this machine instead of prompting again - confirm against the
# installed huggingface_hub version.
login(new_session=False)

In [None]:
import torch
import librosa
import yt_dlp
import os

# Pick the compute device: prefer CUDA when PyTorch can see a GPU, else fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Initialize pipeline (optional usage)
# pipe = pipeline("automatic-speech-recognition", model="NCAIR1/NigerianAccentedEnglish", device=device)

In [None]:
# Load the processor and the seq2seq speech model straight from the Hub
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Single place for the checkpoint id so processor and model cannot drift apart.
model_id = "NCAIR1/NigerianAccentedEnglish"

print("Loading model...")
processor = AutoProcessor.from_pretrained(model_id)
# nn.Module.to() returns the module itself, so chaining keeps `model` bound to
# the loaded model, now placed on `device`.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
print("Model loaded.")

In [None]:
def download_youtube_audio(url, output_name="test_audio"):
    """Download the audio track of a single video and convert it to WAV.

    Args:
        url: Video URL understood by yt-dlp.
        output_name: Output path without extension; the result is written to
            ``f"{output_name}.wav"``.

    Returns:
        The path of the extracted WAV file, ``f"{output_name}.wav"``.

    Notes:
        The conversion is done by yt-dlp's ``FFmpegExtractAudio`` postprocessor,
        so the ``ffmpeg`` binary must be available on PATH.
    """
    print(f"Downloading audio from {url}...")
    ydl_opts = {
        'format': 'bestaudio/best',
        # A watch URL that carries a `list=` parameter would otherwise pull the
        # whole playlist, every entry overwriting the same output file.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        # Give the intermediate download its real extension; the postprocessor
        # then swaps that extension for ".wav", so the final path is unchanged.
        'outtmpl': f'{output_name}.%(ext)s',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return f"{output_name}.wav"

def transcribe_audio(audio_path, chunk_length_s=30.0):
    """Transcribe an audio file with the globally loaded ``processor`` / ``model``.

    The audio is resampled to 16 kHz and transcribed in consecutive windows of
    ``chunk_length_s`` seconds, and the per-window texts are joined with spaces.
    Audio that fits in one window is handled exactly as a single
    ``generate`` call, and its decoded text is returned unmodified.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.
        chunk_length_s: Window length in seconds. The default of 30 assumes a
            Whisper-style checkpoint whose feature extractor pads/truncates each
            input to 30 s - TODO confirm for this model.

    Returns:
        The transcription as a single string.

    Raises:
        ValueError: If ``chunk_length_s`` is not positive.

    Notes:
        Windows are cut at fixed offsets without overlap, so a word that
        straddles a boundary may be transcribed imperfectly.
    """
    if chunk_length_s <= 0:
        raise ValueError("chunk_length_s must be positive")

    print(f"Transcribing {audio_path}...")
    # Load audio, resampled to the 16 kHz rate passed to the processor below.
    audio, sr = librosa.load(audio_path, sr=16000)

    chunk_samples = max(int(chunk_length_s * sr), 1)
    # Follow the model's current device rather than the global `device`: the
    # model may have been moved (e.g. to CPU) after it was loaded.
    target_device = model.device

    texts = []
    # max(..., 1) keeps one (empty) window for zero-length audio, so that case
    # still goes through the processor as a single call.
    for start in range(0, max(len(audio), 1), chunk_samples):
        window = audio[start:start + chunk_samples]

        # Process
        inputs = processor(window, sampling_rate=sr, return_tensors="pt").to(target_device)

        # Generate
        with torch.no_grad():
            generated_ids = model.generate(inputs.input_features)

        # Decode
        texts.append(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

    if len(texts) == 1:
        # Single window: return the decoded text untouched.
        return texts[0]
    return " ".join(piece.strip() for piece in texts if piece.strip())

In [None]:
# Example usage:
# url = "YOUR_YOUTUBE_URL_HERE"
# audio_file = download_youtube_audio(url)
# text = transcribe_audio(audio_file)
# print("Transcription:", text)

## Quantization and ONNX Conversion

In [None]:
# Dynamic Quantization (PyTorch)
import copy

print("Quantizing model (PyTorch)...")
# nn.Module.cpu() moves the module in place and returns the same object, so
# calling it on `model` directly would leave the shared model on the CPU and
# break later GPU inference that sends inputs to `device`. Quantize a CPU copy
# instead so `model` stays where it was loaded.
model_cpu = copy.deepcopy(model).cpu()
# Replace nn.Linear layers with dynamically quantized int8 versions.
quantized_model = torch.quantization.quantize_dynamic(
    model_cpu,
    {torch.nn.Linear},
    dtype=torch.qint8
)
# Only the state dict is saved: to reload it, rebuild the model and apply the
# same quantize_dynamic call before load_state_dict.
torch.save(quantized_model.state_dict(), "quantized_model.pth")
print("Saved quantized_model.pth")

In [None]:
# Export the checkpoint to ONNX through Optimum
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq

# Directory that receives both the ONNX export and the processor files.
onnx_export_dir = "onnx_models"

print("Converting to ONNX...")
# export=True asks Optimum to convert the Hub checkpoint on load.
ort_model = ORTModelForSpeechSeq2Seq.from_pretrained(
    "NCAIR1/NigerianAccentedEnglish", export=True, provider="CPUExecutionProvider"
)

# Store the processor next to the exported model so the folder is self-contained.
ort_model.save_pretrained(onnx_export_dir)
processor.save_pretrained(onnx_export_dir)
print("Saved ONNX models to onnx_models/")

In [None]:
# Quantize ONNX for Mobile
from onnxruntime.quantization import quantize_dynamic, QuantType
from pathlib import Path

print("Quantizing ONNX models for mobile...")
onnx_dir = Path("onnx_models")
encoder_path = onnx_dir / "encoder_model.onnx"
decoder_path = onnx_dir / "decoder_model.onnx"

# NOTE(review): depending on the Optimum version, the export may also write
# other decoder files (e.g. decoder_with_past_model.onnx); add them here if the
# mobile runtime needs them - confirm against the contents of onnx_models/.
for component, source_path in (("encoder", encoder_path), ("decoder", decoder_path)):
    if not source_path.exists():
        # Report the missing file instead of silently doing nothing, so a
        # skipped export step is visible in the cell output.
        print(f"WARNING: {source_path} not found; skipping {component}. "
              "Run the ONNX export cell first.")
        continue

    # Written next to the source as <stem>_quantized.onnx.
    quantized_path = source_path.with_name(f"{source_path.stem}_quantized.onnx")
    quantize_dynamic(
        str(source_path),
        str(quantized_path),
        weight_type=QuantType.QUInt8
    )
    print(f"Quantized {component}.")