<a href="https://colab.research.google.com/github/e-apprentice/Fire-Detection/blob/main/ASRModelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import random
import string
import numpy as np
import torch
import torchaudio
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
from pathlib import Path

In [None]:
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
import evaluate

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/ASR_Project/asr_nepali.zip -d /content/drive/MyDrive/ASR_Project/


unzip:  cannot find or open /content/drive/MyDrive/ASR_Project/asr_nepali.zip, /content/drive/MyDrive/ASR_Project/asr_nepali.zip.zip or /content/drive/MyDrive/ASR_Project/asr_nepali.zip.ZIP.


In [None]:
# Step 1b: Find the ZIP file location

from google.colab import drive
import zipfile
import os

# Mount Google Drive
drive.mount('/content/drive')

# Let's search for the zip file in Google Drive
def find_zip_file(directory, filename='asr_nepali_0.zip'):
    """Recursively search ``directory`` for a file called ``filename``.

    Args:
        directory: Root folder to walk.
        filename: Exact file name to look for. Defaults to
            'asr_nepali_0.zip', the archive this notebook expects.

    Returns:
        The path of the first match found by ``os.walk``, or ``None``
        when no file with that name exists under ``directory``.
    """
    for root, _dirs, files in os.walk(directory):
        # A membership test replaces the manual per-file comparison loop.
        if filename in files:
            return os.path.join(root, filename)
    return None

print("Searching for asr_nepali_0.zip in Google Drive...")
zip_path = find_zip_file('/content/drive')

if zip_path:
    print(f"✓ Found ZIP file at: {zip_path}")

    # Extract the zip file to the current directory
    print("Extracting ZIP file...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('/content/')

    # List the contents to see what was extracted
    print("\nContents of /content/:")
    for item in os.listdir('/content/'):
        print(f"  {item}")

    # Check if the extraction created the expected folder
    if os.path.exists('/content/asr_nepali'):
        print("\n✓ asr_nepali folder found!")
        print("Contents of asr_nepali folder:")
        for item in os.listdir('/content/asr_nepali'):
            print(f"  {item}")
    else:
        print("\n❌ asr_nepali folder not found. Let's check what folders were created:")
        for item in os.listdir('/content/'):
            if os.path.isdir(f'/content/{item}') and not item.startswith('.'):
                print(f"  Directory: {item}")
else:
    print("❌ ZIP file not found in Google Drive")
    print("Please check that you've uploaded 'asr_nepali_0.zip' to your Google Drive")
    print("\nLet's see what files are in your Google Drive root:")
    for item in os.listdir('/content/drive/MyDrive'):
        print(f"  {item}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Searching for asr_nepali_0.zip in Google Drive...
✓ Found ZIP file at: /content/drive/MyDrive/ASR_Project/asr_nepali_0.zip
Extracting ZIP file...

Contents of /content/:
  .config
  asr_nepali
  drive
  sample_data

✓ asr_nepali folder found!
Contents of asr_nepali folder:
  LICENSE
  data
  utt_spk_text.tsv


In [None]:
# Step 2: Replace your NepaliASRDataset class with this corrected version

SAMPLING_RATE = 16000
MAX_FRAMES = SAMPLING_RATE * 5  # 5 seconds max
UNK_TOKEN = '__UNK__'
PAD_TOKEN = '__PAD__'
WORD_DELIMITER = '|'
LARGE_NEG = -100

class NepaliASRDataset:
    """Custom dataset class for loading local Nepali ASR data.

    The data folder is expected to contain one ``*.tsv`` file whose rows are
    ``file_id<TAB>speaker_id<TAB>transcription`` and, anywhere below it, one
    audio file per ``file_id`` named ``<file_id>.flac``/``.wav``/``.mp3``.

    After construction ``self.data`` is a list of dicts with the keys
    'utterance_id', 'speaker_id', 'transcription' and 'audio_path'.
    """

    # Supported audio extensions, in lookup priority order.
    AUDIO_EXTENSIONS = ('.flac', '.wav', '.mp3')

    def __init__(self, data_folder: str):
        self.data_folder = Path(data_folder)
        self.data = []
        self._audio_index = None  # built lazily by find_audio_file
        self.load_data()

    def load_data(self):
        """Load audio files and transcriptions from the data folder.

        Raises:
            ValueError: if the data folder contains no TSV file.
        """
        print(f"Loading data from {self.data_folder}")

        # Look for TSV file containing transcriptions (sorted so the choice is
        # deterministic when several are present)
        tsv_files = sorted(self.data_folder.glob("*.tsv"))
        if not tsv_files:
            raise ValueError("No TSV file found in the data folder")

        tsv_file = tsv_files[0]
        print(f"Found TSV file: {tsv_file}")

        missing_audio = 0
        with open(tsv_file, 'r', encoding='utf-8') as f:
            # Every row is considered. A header row, if the file has one, has
            # no matching audio file and is dropped by the check below, so a
            # header-less TSV no longer loses its first utterance.
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 3:
                    continue

                file_id, speaker_id, transcription = parts[0], parts[1], parts[2]

                audio_file = self.find_audio_file(file_id)
                if audio_file is None:
                    missing_audio += 1
                    continue

                self.data.append({
                    'utterance_id': file_id,
                    'speaker_id': speaker_id,
                    'transcription': transcription,
                    'audio_path': str(audio_file)
                })

        print(f"Loaded {len(self.data)} samples")
        if missing_audio:
            print(f"Skipped {missing_audio} rows without a matching audio file")

    def _build_audio_index(self):
        """Walk the data folder once and map ``file_id`` -> audio path.

        When the same id exists with several extensions, the extension that
        comes first in ``AUDIO_EXTENSIONS`` wins.
        """
        rank = {ext: pos for pos, ext in enumerate(self.AUDIO_EXTENSIONS)}
        index = {}
        for path in self.data_folder.rglob('*'):
            ext = path.suffix
            if ext not in rank:
                continue
            key = path.name[:-len(ext)]
            current = index.get(key)
            if current is None or rank[ext] < rank[current.suffix]:
                index[key] = path
        return index

    def find_audio_file(self, file_id: str):
        """Find audio file for given file_id.

        Returns the matching ``Path`` or ``None``. The folder tree is scanned
        only on the first call; later calls are dictionary lookups instead of
        one recursive glob per extension per utterance.
        """
        # getattr keeps this usable on instances created without __init__.
        if getattr(self, '_audio_index', None) is None:
            self._audio_index = self._build_audio_index()
        return self._audio_index.get(file_id)

    def to_hf_dataset(self):
        """Convert to Hugging Face dataset format.

        Each example gains an 'utterance' dict ('array', 'sampling_rate') and
        a 'num_frames' count; the 'audio_path' column is removed.
        """
        def load_audio(example):
            waveform, sample_rate = torchaudio.load(example['audio_path'])

            # Resample if necessary
            if sample_rate != SAMPLING_RATE:
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, SAMPLING_RATE
                )

            # Convert to mono if stereo
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # No VAD/trimming is applied; the waveform is used as is.
            # Only the channel axis is dropped so the result is always 1-D,
            # even for a one-sample clip.
            waveform = waveform.squeeze(0)

            example['utterance'] = {
                'array': waveform.numpy(),
                'sampling_rate': SAMPLING_RATE
            }
            example['num_frames'] = waveform.shape[0]

            return example

        dataset = Dataset.from_list(self.data)
        dataset = dataset.map(load_audio, remove_columns=['audio_path'])

        return dataset

In [None]:
# Step 3: Replace your preprocessing functions with these corrected versions

def preprocess_text(dataset):
    """Clean transcriptions and drop unusable examples.

    Three passes are applied in order: examples whose transcription contains
    any ASCII letter are removed, a fixed set of punctuation / invisible
    characters is stripped from the remaining transcriptions, and examples
    with ``num_frames`` at or above ``MAX_FRAMES`` are removed.

    Returns the filtered dataset.
    """
    print("Preprocessing text...")

    # Characters deleted from every transcription
    unwanted = {'!', '%', '.', ';', '?', '\\', '।', '\xa0', '\u200c', '\u200d', '\u200e', '\u200f', '"'}

    def has_no_ascii_letters(example):
        text = example['transcription']
        return not any(letter in text for letter in string.ascii_letters)

    def strip_unwanted(example):
        kept = (ch for ch in example['transcription'] if ch not in unwanted)
        example['transcription'] = ''.join(kept).strip()
        return example

    def is_short_enough(example):
        return example['num_frames'] < MAX_FRAMES

    dataset = dataset.filter(has_no_ascii_letters, desc="Filtering English characters")
    dataset = dataset.map(strip_unwanted, desc="Removing special characters")
    dataset = dataset.filter(is_short_enough, desc="Filtering by audio length")

    print(f"After preprocessing: {len(dataset)} samples")
    return dataset

def create_vocabulary(dataset):
    """Create a character-level vocabulary from the transcriptions.

    Args:
        dataset: Dataset with a 'transcription' column.

    Returns:
        Dict mapping token -> id. ``PAD_TOKEN`` is id 0, ``UNK_TOKEN`` is id 1,
        the remaining characters follow in sorted order, and the space
        character is stored under ``WORD_DELIMITER``.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    print("Creating vocabulary...")

    if len(dataset) == 0:
        raise ValueError("Cannot create a vocabulary from an empty dataset")

    def extract_all_chars(batch):
        all_text = " ".join(batch["transcription"])
        vocab = list(set(all_text))
        return {"vocab": [vocab]}

    vocab_all = dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=dataset.column_names
    )

    chars = set(vocab_all["vocab"][0])
    # Always reserve a slot for the word boundary, even if no transcription
    # contains a space (previously a KeyError).
    chars.add(" ")
    # If the delimiter character itself occurs in the text it must share the
    # space slot; keeping both used to leave an unused id in the middle of the
    # table, so the largest id could exceed the number of entries.
    chars.discard(WORD_DELIMITER)

    vocab_list = [PAD_TOKEN, UNK_TOKEN, *sorted(chars)]
    vocab_dict = {
        (WORD_DELIMITER if token == " " else token): index
        for index, token in enumerate(vocab_list)
    }

    print(f"Vocabulary size: {len(vocab_dict)}")
    return vocab_dict

def create_processor(vocab_dict):
    """Build a Wav2Vec2Processor from a token -> id vocabulary.

    The vocabulary is written to 'vocab.json' in the working directory, a CTC
    tokenizer is loaded from that directory, and it is paired with a feature
    extractor configured for ``SAMPLING_RATE`` mono audio.
    """
    print("Creating processor...")

    # The tokenizer is loaded from disk, so the vocabulary has to be saved first
    with open('vocab.json', 'w') as handle:
        json.dump(vocab_dict, handle)

    special_tokens = dict(
        unk_token=UNK_TOKEN,
        pad_token=PAD_TOKEN,
        word_delimiter_token=WORD_DELIMITER,
    )
    ctc_tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", **special_tokens)

    extractor_options = dict(
        feature_size=1,
        sampling_rate=SAMPLING_RATE,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    extractor = Wav2Vec2FeatureExtractor(**extractor_options)

    return Wav2Vec2Processor(feature_extractor=extractor, tokenizer=ctc_tokenizer)

In [None]:
# Step 4: Initialize and prepare the dataset
print("Step 4: Loading and preparing dataset...")

# Load your dataset
dataset = NepaliASRDataset('/content/asr_nepali').to_hf_dataset()

# Preprocess the dataset
dataset = preprocess_text(dataset)

# Create vocabulary
vocab_dict = create_vocabulary(dataset)

# Create processor
processor = create_processor(vocab_dict)

# Save processor for later use
processor.save_pretrained('./nepali_wav2vec2_processor')

print(f"Dataset loaded with {len(dataset)} samples")
print("Processor saved successfully!")

# Step 5: Split dataset into train/validation
print("\nStep 5: Splitting dataset...")

# Split dataset (80% train, 20% validation)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

# Step 6: Prepare data for training
print("\nStep 6: Preparing data for training...")

def prepare_dataset(batch):
    """Turn a batch of raw examples into model inputs and label ids.

    Reads ``batch["utterance"]`` (dicts with an "array" entry) and
    ``batch["transcription"]``; writes ``batch["input_values"]`` and
    ``batch["labels"]``.

    Nothing is padded here. Padding inside ``Dataset.map`` would store every
    utterance at the length of the longest clip in its map batch, and the
    data collator, which only reads "input_values" and "labels", would then
    treat that zero padding as real audio. Per-batch padding, the attention
    mask and the -100 label masking are all produced by the data collator.
    """
    # Extract audio arrays
    audio_arrays = [example["array"] for example in batch["utterance"]]

    # Normalise each utterance at its own length (no padding, no tensors)
    batch["input_values"] = processor.feature_extractor(
        audio_arrays,
        sampling_rate=SAMPLING_RATE
    ).input_values

    # Tokenize transcriptions into variable-length id lists
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids

    return batch

# Apply preprocessing to datasets
train_dataset = train_dataset.map(
    prepare_dataset,
    batch_size=8,
    batched=True,
    remove_columns=train_dataset.column_names
)

eval_dataset = eval_dataset.map(
    prepare_dataset,
    batch_size=8,
    batched=True,
    remove_columns=eval_dataset.column_names
)

print("Data preparation completed!")

# Step 7: Initialize the model
print("\nStep 7: Initializing Wav2Vec2 model...")

# Load pre-trained Wav2Vec2 model with a CTC head sized to the tokenizer
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the convolutional feature encoder (optional - helps with training stability).
# freeze_feature_encoder() replaces the deprecated freeze_feature_extractor().
model.freeze_feature_encoder()

print("Model initialized successfully!")
print(f"Model has {model.num_parameters()} parameters")

# Step 8: Setup training arguments
print("\nStep 8: Setting up training configuration...")

training_args = TrainingArguments(
    # Output directory
    output_dir="./wav2vec2-nepali-asr",

    # Training hyperparameters
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    weight_decay=0.005,
    warmup_steps=500,

    # Logging and evaluation
    logging_steps=50,
    eval_steps=500,
    save_steps=500,
    # `eval_strategy` is the current argument name (the other configurations
    # in this notebook already use it); `evaluation_strategy` is the
    # deprecated spelling.
    eval_strategy="steps",

    # Other settings
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Mixed precision only when a CUDA device is present; requesting fp16 on
    # a CPU-only runtime is rejected when the arguments are built.
    fp16=torch.cuda.is_available(),

    # Disable wandb logging
    report_to=[],

    # Data loading
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

print("Training arguments configured!")

# Step 9: Setup data collator
print("\nStep 9: Setting up data collator...")

@dataclass
class DataCollatorCTCWithPadding:
    """Data collator for CTC training.

    Pads the audio inputs and the label sequences of a batch independently
    (they have different lengths and different padding values) and replaces
    label padding with -100 so it is ignored by the loss.
    """

    # Processor providing the feature extractor (audio) and tokenizer (labels)
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad() calls
    padding: Union[bool, str] = True
    # Optional maximum length for the audio inputs / for the labels
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    # Optional padding multiples for the audio inputs / for the labels
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with 'input_values' and 'labels')."""
        # Split inputs and labels
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad labels with the tokenizer directly instead of going through the
        # deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

print("Data collator ready!")

# Step 10: Setup evaluation metrics
print("\nStep 10: Setting up evaluation metrics...")

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    ``pred.predictions`` holds the logits and ``pred.label_ids`` the reference
    ids, with -100 marking ignored positions. Returns ``{"wer": value}``.
    """
    def collapse_whitespace(text):
        # Collapse runs of whitespace and trim both ends
        return " ".join(text.split())

    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Ignored positions become pad tokens (written into pred.label_ids itself)
    reference_ids = pred.label_ids
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # References are not CTC output, so repeated tokens must not be merged
    references = processor.batch_decode(reference_ids, group_tokens=False)

    hypotheses = [collapse_whitespace(text) for text in hypotheses]
    references = [collapse_whitespace(text) for text in references]

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

print("Evaluation metrics configured!")

# Step 11: Initialize trainer
print("\nStep 11: Initializing trainer...")

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

print("Trainer initialized successfully!")
print("\nAll setup complete! Ready to start training.")
print("Use: trainer.train() to begin training")

# Step 12: Start training (uncomment when ready)
print("\nStep 12: Starting training...")
print("Note: This will take several hours depending on your hardware")

# Uncomment the following line to start training:
# trainer.train()

print("\nTraining commands:")
print("1. To start training: trainer.train()")
print("2. To save the model: trainer.save_model('./final_model')")
print("3. To evaluate: trainer.evaluate()")

Step 4: Loading and preparing dataset...
Loading data from /content/asr_nepali
Found TSV file: /content/asr_nepali/utt_spk_text.tsv


KeyboardInterrupt: 

In [None]:
# Check GPU memory and adjust batch size if needed
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Clear cache
    torch.cuda.empty_cache()

    # Check available memory
    memory_allocated = torch.cuda.memory_allocated(0) / 1024**3
    memory_reserved = torch.cuda.memory_reserved(0) / 1024**3
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3

    print(f"Memory allocated: {memory_allocated:.2f} GB")
    print(f"Memory reserved: {memory_reserved:.2f} GB")
    print(f"Available memory: {total_memory - memory_reserved:.2f} GB")

    # Recommend batch size based on available memory
    available_memory = total_memory - memory_reserved
    if available_memory < 8:
        print("Recommendation: Use batch_size=2 and gradient_accumulation_steps=8")
        recommended_batch_size = 2
        recommended_grad_accum = 8
    elif available_memory < 12:
        print("Recommendation: Use batch_size=4 and gradient_accumulation_steps=4")
        recommended_batch_size = 4
        recommended_grad_accum = 4
    else:
        print("Recommendation: Use batch_size=8 and gradient_accumulation_steps=2")
        recommended_batch_size = 8
        recommended_grad_accum = 2

    print(f"Effective batch size will be: {recommended_batch_size * recommended_grad_accum}")
else:
    print("No GPU available - training will be very slow on CPU")
    recommended_batch_size = 1
    recommended_grad_accum = 1

No GPU available - training will be very slow on CPU


In [None]:
# CPU-Optimized Training Configuration
print("Setting up CPU-optimized training...")

# Reduce dataset size for faster training on CPU
def reduce_dataset_for_cpu(dataset, max_samples=1000, seed=42):
    """Reduce dataset size for CPU training.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        max_samples: Maximum number of examples to keep.
        seed: Seed for the subset selection, so re-running the cell yields
            the same subset. Pass ``None`` for a different subset each call.

    Returns:
        ``dataset`` itself when it already has at most ``max_samples``
        examples, otherwise a random subset of exactly ``max_samples``
        examples in their original relative order.
    """
    if len(dataset) <= max_samples:
        return dataset

    # A local generator is used so the result does not depend on, or disturb,
    # NumPy's global random state.
    rng = np.random.default_rng(seed)
    indices = rng.choice(len(dataset), size=max_samples, replace=False)
    # Plain sorted ints keep the subset in dataset order.
    return dataset.select(sorted(indices.tolist()))

# Reduce training and validation datasets
print("Reducing dataset size for CPU training...")
train_dataset_small = reduce_dataset_for_cpu(train_dataset, max_samples=800)
eval_dataset_small = reduce_dataset_for_cpu(eval_dataset, max_samples=200)

print(f"Reduced training samples: {len(train_dataset_small)}")
print(f"Reduced validation samples: {len(eval_dataset_small)}")

# CPU-optimized training arguments
training_args = TrainingArguments(
    # Output directory
    output_dir="./wav2vec2-nepali-asr-cpu",

    # Reduced training parameters for CPU
    num_train_epochs=5,  # Much fewer epochs
    per_device_train_batch_size=1,  # Smallest batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients
    learning_rate=5e-4,  # Slightly higher learning rate
    weight_decay=0.01,
    warmup_steps=100,  # Fewer warmup steps

    # Logging and evaluation
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,

    # Other settings
    save_total_limit=1,  # Save only best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # CPU settings
    fp16=False,  # No mixed precision on CPU
    dataloader_num_workers=0,
    remove_unused_columns=False,

    # Disable wandb logging, consistent with the other training
    # configurations in this notebook
    report_to=[],

    # Additional CPU optimizations
    seed=42,
    push_to_hub=False,
    max_steps=500,  # Limit total training steps
)

print("CPU-optimized training configuration ready!")
print("Warning: This will still take several hours and results may not be optimal.")
print("Consider getting GPU access for better performance.")

# Use smaller datasets for CPU training
print("\nTo use CPU training, replace train_dataset and eval_dataset with:")
print("train_dataset = train_dataset_small")
print("eval_dataset = eval_dataset_small")

Setting up CPU-optimized training...
Reducing dataset size for CPU training...


NameError: name 'train_dataset' is not defined

In [None]:
# Continue with CPU training setup
print("Continuing with CPU-optimized training setup...")

# Replace datasets with smaller versions
train_dataset = train_dataset_small
eval_dataset = eval_dataset_small

print(f"Using reduced datasets - Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# Step 9: Setup data collator (CPU-optimized)
print("\nStep 9: Setting up data collator...")

@dataclass
class DataCollatorCTCWithPadding:
    """Data collator for CTC training - CPU optimized.

    Pads the audio inputs and the label sequences of a batch independently
    and replaces label padding with -100 so it is ignored by the loss.
    """

    # Processor providing the feature extractor (audio) and tokenizer (labels)
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad() calls
    padding: Union[bool, str] = True
    # Optional maximum length for the audio inputs / for the labels
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    # Optional padding multiples for the audio inputs / for the labels
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with 'input_values' and 'labels')."""
        # Split inputs and labels
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad labels with the tokenizer directly instead of going through the
        # deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
print("Data collator ready!")

# Step 10: Setup evaluation metrics
print("\nStep 10: Setting up evaluation metrics...")

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    ``pred.predictions`` holds the logits and ``pred.label_ids`` the reference
    ids, with -100 marking ignored positions. Returns ``{"wer": value}``.
    """
    def collapse_whitespace(text):
        # Collapse runs of whitespace and trim both ends
        return " ".join(text.split())

    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Ignored positions become pad tokens (written into pred.label_ids itself)
    reference_ids = pred.label_ids
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # References are not CTC output, so repeated tokens must not be merged
    references = processor.batch_decode(reference_ids, group_tokens=False)

    hypotheses = [collapse_whitespace(text) for text in hypotheses]
    references = [collapse_whitespace(text) for text in references]

    # Calculate WER
    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

print("Evaluation metrics configured!")

# Step 11: Initialize trainer
print("\nStep 11: Initializing trainer...")

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

print("Trainer initialized successfully!")

# Step 12: Optional - Test training with a few steps first
print("\nStep 12: Testing training setup...")

# Test with just a few steps to ensure everything works
print("Running a quick test with 5 training steps...")

# Create a test training args with very few steps
test_training_args = TrainingArguments(
    output_dir="./test_run",
    max_steps=5,
    per_device_train_batch_size=1,
    logging_steps=1,
    eval_strategy="no",  # No evaluation for test
    save_strategy="no",   # No saving for test
    report_to=[],
    dataloader_num_workers=0,
    fp16=False,
)

# Create test trainer
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=test_training_args,
    train_dataset=train_dataset.select(range(5)),  # Just 5 samples
    tokenizer=processor.feature_extractor,
)

print("Starting test run...")
try:
    test_trainer.train()
    print("✅ Test run successful! Your setup is working correctly.")

    # Now you can proceed with full training
    print("\n" + "="*50)
    print("READY FOR FULL TRAINING!")
    print("="*50)
    print("\nTo start full training, run:")
    print("trainer.train()")
    print("\nThis will take several hours on CPU.")
    print("The training will run for 5 epochs with 500 max steps.")

except Exception as e:
    print(f"❌ Test run failed with error: {e}")
    print("Please check your setup before proceeding.")

# Additional helpful commands
print("\nHelpful commands after training:")
print("1. Save model: trainer.save_model('./final_nepali_asr_model')")
print("2. Evaluate model: trainer.evaluate()")
print("3. Test prediction: See prediction code below")

# Step 13: Prediction function for testing
print("\nStep 13: Prediction function ready...")

def predict_audio(audio_path):
    """Transcribe one audio file with the current ``model`` and ``processor``.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The greedy (argmax) CTC transcription as a string.
    """
    # Load audio
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if needed
    if sample_rate != SAMPLING_RATE:
        waveform = torchaudio.functional.resample(waveform, sample_rate, SAMPLING_RATE)

    # Convert to mono
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Process with feature extractor (drop only the channel axis)
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt"
    )

    # The model may live on a GPU after training; inputs must be on the same device
    input_values = inputs.input_values.to(model.device)

    # Inference must run in eval mode so dropout, layerdrop and time masking
    # are disabled; the previous mode is restored afterwards so calling this
    # between training runs does not change training behaviour.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_values).logits
    finally:
        model.train(was_training)

    # Decode prediction
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

print("Prediction function ready!")
print("\nExample usage after training:")
print("transcription = predict_audio('path_to_audio.wav')")
print("print(transcription)")

Continuing with CPU-optimized training setup...


NameError: name 'train_dataset_small' is not defined

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


In [None]:
# Step 4: Initialize and prepare the dataset (with size reduction)
print("Step 4: Loading and preparing dataset...")

# Load your dataset
dataset = NepaliASRDataset('/content/asr_nepali').to_hf_dataset()

# ===== DATASET SIZE REDUCTION OPTIONS =====

# Option 1: Take only a subset of the data (e.g., 10% of original)
DATASET_FRACTION = 0.1  # Adjust this value (0.1 = 10%, 0.5 = 50%, etc.)
dataset = dataset.select(range(int(len(dataset) * DATASET_FRACTION)))

# Option 2: Filter by audio length (remove very long audio files)
MAX_AUDIO_LENGTH = 10.0  # seconds
def filter_by_length(example):
    audio_length = len(example["utterance"]["array"]) / example["utterance"]["sampling_rate"]
    return audio_length <= MAX_AUDIO_LENGTH

dataset = dataset.filter(filter_by_length)

# Option 3: Filter by text length (remove very long transcriptions)
MAX_TEXT_LENGTH = 100  # characters
def filter_by_text_length(example):
    return len(example["transcription"]) <= MAX_TEXT_LENGTH

dataset = dataset.filter(filter_by_text_length)

# Preprocess the dataset
dataset = preprocess_text(dataset)

# Create vocabulary
vocab_dict = create_vocabulary(dataset)

# Create processor
processor = create_processor(vocab_dict)

# Save processor for later use
processor.save_pretrained('./nepali_wav2vec2_processor')

print(f"Dataset loaded with {len(dataset)} samples")
print("Processor saved successfully!")

# Step 5: Split dataset into train/validation (with smaller validation set)
print("\nStep 5: Splitting dataset...")

# Use smaller validation set to speed up evaluation
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)  # Reduced from 0.2 to 0.1
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Further reduce eval dataset for faster evaluation
if len(eval_dataset) > 500:  # Keep max 500 samples for evaluation
    eval_dataset = eval_dataset.select(range(500))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

# Step 6: Prepare data for training (optimized)
print("\nStep 6: Preparing data for training...")

def prepare_dataset(batch):
    """Turn a batch of raw examples into model inputs and label ids.

    Reads ``batch["utterance"]`` (dicts with an "array" entry) and
    ``batch["transcription"]``; writes ``batch["input_values"]`` and
    ``batch["labels"]``.

    Nothing is padded here. Padding inside ``Dataset.map`` would store every
    utterance at the length of the longest clip in its map batch, and the
    data collator, which only reads "input_values" and "labels", would then
    treat that zero padding as real audio. Per-batch padding, the attention
    mask and the -100 label masking are all produced by the data collator.
    """
    # Extract audio arrays
    audio_arrays = [example["array"] for example in batch["utterance"]]

    # Normalise each utterance at its own length (no padding, no tensors)
    batch["input_values"] = processor.feature_extractor(
        audio_arrays,
        sampling_rate=SAMPLING_RATE
    ).input_values

    # Tokenize transcriptions into variable-length id lists
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids

    return batch

# Apply preprocessing to datasets with larger batch size for efficiency
train_dataset = train_dataset.map(
    prepare_dataset,
    batch_size=16,  # Increased from 8 to 16
    batched=True,
    remove_columns=train_dataset.column_names
)

eval_dataset = eval_dataset.map(
    prepare_dataset,
    batch_size=16,  # Increased from 8 to 16
    batched=True,
    remove_columns=eval_dataset.column_names
)

print("Data preparation completed!")

# Step 7: Initialize the model (unchanged)
print("\nStep 7: Initializing Wav2Vec2 model...")

# Load pre-trained Wav2Vec2 model with a CTC head sized to the tokenizer
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the convolutional feature encoder (optional - helps with training stability).
# freeze_feature_encoder() replaces the deprecated freeze_feature_extractor().
model.freeze_feature_encoder()

print("Model initialized successfully!")
print(f"Model has {model.num_parameters()} parameters")

# Step 8: Setup training arguments (optimized for speed)
print("\nStep 8: Setting up training configuration...")

training_args = TrainingArguments(
    # Output directory
    output_dir="./wav2vec2-nepali-asr",

    # Training hyperparameters (reduced for faster training)
    num_train_epochs=10,  # Reduced from 30 to 10
    per_device_train_batch_size=16,  # Increased from 8 to 16 (if GPU memory allows)
    per_device_eval_batch_size=16,   # Increased from 8 to 16
    gradient_accumulation_steps=1,   # Reduced from 2 to 1
    learning_rate=5e-4,  # Slightly increased for faster convergence
    weight_decay=0.01,
    warmup_steps=100,    # Reduced from 500 to 100

    # Logging and evaluation (less frequent)
    logging_steps=100,   # Increased from 50 to 100
    eval_steps=1000,     # Increased from 500 to 1000
    save_steps=1000,     # Increased from 500 to 1000
    # `eval_strategy` is the current argument name (the CPU configuration in
    # this notebook already uses it); `evaluation_strategy` is the deprecated
    # spelling. Set it to "no" to skip evaluation entirely for maximum speed.
    eval_strategy="steps",

    # Other settings
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Mixed precision only when a CUDA device is present; requesting fp16 on
    # a CPU-only runtime is rejected when the arguments are built.
    fp16=torch.cuda.is_available(),

    # Disable wandb logging
    report_to=[],

    # Data loading (optimized)
    dataloader_num_workers=4,  # Increased from 2 to 4
    dataloader_pin_memory=True,
)

print("Training arguments configured!")

# Step 9: Setup data collator (unchanged)
print("\nStep 9: Setting up data collator...")

@dataclass
class DataCollatorCTCWithPadding:
    """Collate variable-length CTC examples into a single padded batch.

    Audio inputs and label ids are padded separately: audio through the
    processor's feature extractor, labels through its tokenizer. Label
    positions that are padding are then set to -100 so the loss ignores them.
    """

    # Processor bundling the feature extractor (audio) and the tokenizer (labels).
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both `pad` calls.
    padding: Union[bool, str] = True
    # Optional maximum length for the padded audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the padded label sequences.
    max_length_labels: Optional[int] = None
    # Optional multiple to which audio inputs are padded.
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple to which label sequences are padded.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch as PyTorch tensors.

        Args:
            features: Examples, each providing "input_values" and "labels".

        Returns:
            The padded audio batch with an added "labels" entry in which
            padding positions are -100.
        """
        # Inputs and labels have different lengths and padding values, so they
        # are split and padded independently.
        audio_inputs = [{"input_values": example["input_values"]} for example in features]
        label_inputs = [{"input_ids": example["labels"]} for example in features]

        # Pad the audio with the feature extractor.
        batch = self.processor.feature_extractor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly. This replaces the
        # deprecated `processor.as_target_processor()` context manager, which
        # only redirected `processor.pad` to the tokenizer.
        labels_batch = self.processor.tokenizer.pad(
            label_inputs,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions where the attention mask is not 1 are padding: mark them
        # with -100 so they are ignored by the loss.
        batch["labels"] = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )
        return batch

# Instantiate the batch collator around the shared processor.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
)
print("Data collator ready!")

# Step 10: Evaluation metric setup
print("\nStep 10: Setting up evaluation metrics...")

# Word-error-rate scorer loaded through `evaluate`; compute_metrics calls it.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits whose last
            axis is the vocabulary) and ``label_ids`` (reference token ids in
            which -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is whatever
        ``wer_metric.compute`` returns for the decoded strings.
    """
    # Greedy decoding: take the highest-scoring token id at every position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 is not a real token id, so it is swapped for the pad id before
    # decoding. np.where builds a new array, leaving the caller's label_ids
    # untouched (the previous in-place assignment modified them).
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # Reference ids are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Normalize whitespace: collapse runs and strip leading/trailing blanks.
    pred_str = [" ".join(text.split()) for text in pred_str]
    label_str = [" ".join(text.split()) for text in label_str]

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

print("Evaluation metrics configured!")

# Step 11: Initialize trainer (unchanged)
print("\nStep 11: Initializing trainer...")

# NOTE(review): the feature extractor is handed over through `tokenizer=`;
# newer transformers releases appear to prefer a `processing_class=` argument
# for this - confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

print("Trainer initialized successfully!")
print("\nAll setup complete! Ready to start training.")

# Recap of the speed-oriented choices made in this cell.
print("\n=== SPEED OPTIMIZATION SUMMARY ===")
print(f"Dataset reduced to: {len(train_dataset)} training samples")
print(f"Validation set: {len(eval_dataset)} samples")
print("Key optimizations applied:")
print("\n".join([
    "1. Reduced dataset size",
    "2. Shorter audio files only",
    "3. Reduced epochs (30 → 10)",
    "4. Increased batch size (8 → 16)",
    "5. Less frequent evaluation and saving",
    "6. Reduced warmup steps",
]))

# Step 12: Start training
print("\nStep 12: Starting training...")
print("Note: Training should be much faster now!")

# Training is intentionally not launched here; uncomment to start it:
# trainer.train()

print("\nTraining commands:")
print("\n".join([
    "1. To start training: trainer.train()",
    "2. To save the model: trainer.save_model('./final_model')",
    "3. To evaluate: trainer.evaluate()",
]))

# Quick training option (for testing)
print("\n=== QUICK TEST TRAINING ===")
print("For an even quicker test, you can:")
print("1. Set DATASET_FRACTION = 0.01 (1% of data)")
print("2. Set num_train_epochs = 1")
# The argument is named `eval_strategy` in the installed transformers release
# (`evaluation_strategy` raises a TypeError), and disabling evaluation also
# requires turning off load_best_model_at_end.
print("3. Set eval_strategy = 'no' (and load_best_model_at_end = False)")
print("This will give you a very fast training run to test the pipeline!")

Step 4: Loading and preparing dataset...
Loading data from /content/asr_nepali
Found TSV file: /content/asr_nepali/utt_spk_text.tsv
Loaded 9935 samples


Map:   0%|          | 0/9935 [00:00<?, ? examples/s]

Filter:   0%|          | 0/993 [00:00<?, ? examples/s]

Filter:   0%|          | 0/986 [00:00<?, ? examples/s]

Preprocessing text...


Filtering English characters:   0%|          | 0/986 [00:00<?, ? examples/s]

Removing special characters:   0%|          | 0/985 [00:00<?, ? examples/s]

Filtering by audio length:   0%|          | 0/985 [00:00<?, ? examples/s]

After preprocessing: 883 samples
Creating vocabulary...


Map:   0%|          | 0/883 [00:00<?, ? examples/s]

Vocabulary size: 68
Creating processor...
Dataset loaded with 883 samples
Processor saved successfully!

Step 5: Splitting dataset...
Training samples: 794
Validation samples: 89

Step 6: Preparing data for training...


Map:   0%|          | 0/794 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Data preparation completed!

Step 7: Initializing Wav2Vec2 model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized successfully!
Model has 94425542 parameters

Step 8: Setting up training configuration...




TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# Step 4: Build the dataset and shrink it for a fast pipeline check
print("Step 4: Loading and preparing dataset...")

# Read the corpus and convert it to a Hugging Face dataset
dataset = NepaliASRDataset('/content/asr_nepali').to_hf_dataset()

# ===== DATASET SIZE REDUCTION OPTIONS =====
# Note: the options below are all applied, one after another.

# Option 1: keep only the leading rows - a fixed fraction, capped at 500
DATASET_FRACTION = 0.005  # 0.5% of the loaded data
subset_size = min(500, int(len(dataset) * DATASET_FRACTION))
dataset = dataset.select(range(subset_size))

# Option 2: drop long recordings (threshold in seconds, used by filter_by_length)
MAX_AUDIO_LENGTH = 3.0
def filter_by_length(example):
    """Return True when the example's audio lasts at most MAX_AUDIO_LENGTH seconds.

    The duration is the number of samples in ``example["utterance"]["array"]``
    divided by ``example["utterance"]["sampling_rate"]``.
    """
    utterance = example["utterance"]
    sample_count = len(utterance["array"])
    duration_seconds = sample_count / utterance["sampling_rate"]
    return duration_seconds <= MAX_AUDIO_LENGTH

# Keep only the examples accepted by the audio-duration predicate
dataset = dataset.filter(function=filter_by_length)

# Option 3: drop long transcriptions (threshold in characters, used by filter_by_text_length)
MAX_TEXT_LENGTH = 50
def filter_by_text_length(example):
    """Return True when the transcription has at most MAX_TEXT_LENGTH characters."""
    transcription = example["transcription"]
    return len(transcription) <= MAX_TEXT_LENGTH

# Keep only the examples accepted by the transcription-length predicate
dataset = dataset.filter(function=filter_by_text_length)

# Text cleanup, then vocabulary and processor construction from the cleaned data
dataset = preprocess_text(dataset)
vocab_dict = create_vocabulary(dataset)
processor = create_processor(vocab_dict)

# Persist the processor so it can be reloaded later
processor.save_pretrained('./nepali_wav2vec2_processor')

print(f"Dataset loaded with {len(dataset)} samples")
print("Processor saved successfully!")

# Step 5: Train/validation split (validation kept deliberately tiny)
print("\nStep 5: Splitting dataset...")

# 95% / 5% split with a fixed seed
train_test_split = dataset.train_test_split(seed=42, test_size=0.05)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

# Cap the validation set at 50 examples to keep evaluation fast
if len(eval_dataset) > 50:
    eval_dataset = eval_dataset.select(range(50))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

# Step 6: Turn raw examples into model inputs
print("\nStep 6: Preparing data for training...")

def prepare_dataset(batch):
    """Convert a batch of raw examples into model inputs for CTC training.

    Written for ``Dataset.map(..., batched=True)``. Audio arrays go through
    the processor (feature extraction) and transcriptions through its
    tokenizer. Nothing is padded here: padding inside ``map`` would store
    every example padded to the longest item of its map batch, while
    DataCollatorCTCWithPadding already pads each training batch and masks
    label padding with -100.

    Args:
        batch: Column dict; reads "utterance" (items with an "array" entry)
            and "transcription" (list of strings).

    Returns:
        The same dict with "input_values" and "labels" added, plus
        "attention_mask" when the feature extractor produces one.
    """
    # Pull the raw waveform out of every utterance entry.
    audio_arrays = [utterance["array"] for utterance in batch["utterance"]]

    # Feature extraction without padding: one variable-length entry per example.
    features = processor(audio_arrays, sampling_rate=SAMPLING_RATE)
    batch["input_values"] = features["input_values"]

    # Not every feature-extractor configuration returns an attention mask, so
    # copy it only when present instead of failing on a missing attribute.
    if "attention_mask" in features:
        batch["attention_mask"] = features["attention_mask"]

    # Unpadded label ids; the data collator pads them and applies -100.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids

    return batch

# Run prepare_dataset over both splits: 32 examples per call, 4 worker
# processes, and the original columns dropped from the result.
print("Processing training data...")
train_dataset = train_dataset.map(
    prepare_dataset,
    batched=True,
    batch_size=32,
    num_proc=4,
    remove_columns=train_dataset.column_names,
)

print("Processing validation data...")
eval_dataset = eval_dataset.map(
    prepare_dataset,
    batched=True,
    batch_size=32,
    num_proc=4,
    remove_columns=eval_dataset.column_names,
)

print("Data preparation completed!")

# Step 7: Initialize the model (unchanged)
print("\nStep 7: Initializing Wav2Vec2 model...")

# Load the pre-trained Wav2Vec2 checkpoint with a CTC head sized to the
# tokenizer's vocabulary; the pad token id is taken from the tokenizer too.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the convolutional feature encoder (optional - helps with training stability).
# `freeze_feature_encoder` is the current name of this method; the older
# `freeze_feature_extractor` alias is deprecated in transformers.
model.freeze_feature_encoder()

print("Model initialized successfully!")
print(f"Model has {model.num_parameters()} parameters")

# Step 8: Setup training arguments (optimized for speed)
print("\nStep 8: Setting up training configuration...")

# NOTE(review): with a heavily reduced dataset the whole run may contain fewer
# than 50 optimizer steps, in which case warmup never completes and no
# step-based evaluation or checkpoint fires - confirm against
# len(train_dataset) and lower the step counts if needed.
training_args = TrainingArguments(
    # Output directory
    output_dir="./wav2vec2-nepali-asr",

    # Training hyperparameters (reduced for faster training)
    num_train_epochs=3,  # Reduced to just 3 epochs for quick testing
    per_device_train_batch_size=8,   # Reduced to 8 for stability
    per_device_eval_batch_size=8,    # Reduced to 8 for stability
    gradient_accumulation_steps=1,   # Keep at 1
    learning_rate=5e-4,  # Slightly increased for faster convergence
    weight_decay=0.01,
    warmup_steps=50,     # Reduced to 50 for small dataset

    # Logging, evaluation and checkpointing cadence
    logging_steps=10,    # Very frequent logging for small dataset
    eval_steps=50,       # Evaluate every 50 steps
    save_steps=50,       # Save every 50 steps
    # Must be set explicitly: the default is "no", which does not match the
    # "steps" save strategy and makes load_best_model_at_end=True raise a
    # ValueError when the arguments are constructed.
    eval_strategy="steps",

    # Other settings
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Mixed precision training: only enabled when a CUDA GPU is present,
    # because requesting fp16 on a CPU-only runtime fails at construction time.
    fp16=torch.cuda.is_available(),

    # Disable wandb logging
    report_to=[],

    # Data loading (optimized)
    dataloader_num_workers=4,  # Increased from 2 to 4
    dataloader_pin_memory=True,

    # Skip evaluation during training for maximum speed (optional):
    # set eval_strategy="no" above AND load_best_model_at_end=False.
)

print("Training arguments configured!")

# Step 9: Setup data collator (unchanged)
print("\nStep 9: Setting up data collator...")

@dataclass
class DataCollatorCTCWithPadding:
    """Collate variable-length CTC examples into a single padded batch.

    Audio inputs and label ids are padded separately: audio through the
    processor's feature extractor, labels through its tokenizer. Label
    positions that are padding are then set to -100 so the loss ignores them.
    """

    # Processor bundling the feature extractor (audio) and the tokenizer (labels).
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both `pad` calls.
    padding: Union[bool, str] = True
    # Optional maximum length for the padded audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the padded label sequences.
    max_length_labels: Optional[int] = None
    # Optional multiple to which audio inputs are padded.
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple to which label sequences are padded.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch as PyTorch tensors.

        Args:
            features: Examples, each providing "input_values" and "labels".

        Returns:
            The padded audio batch with an added "labels" entry in which
            padding positions are -100.
        """
        # Inputs and labels have different lengths and padding values, so they
        # are split and padded independently.
        audio_inputs = [{"input_values": example["input_values"]} for example in features]
        label_inputs = [{"input_ids": example["labels"]} for example in features]

        # Pad the audio with the feature extractor.
        batch = self.processor.feature_extractor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly. This replaces the
        # deprecated `processor.as_target_processor()` context manager, which
        # only redirected `processor.pad` to the tokenizer.
        labels_batch = self.processor.tokenizer.pad(
            label_inputs,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions where the attention mask is not 1 are padding: mark them
        # with -100 so they are ignored by the loss.
        batch["labels"] = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )
        return batch

# Instantiate the batch collator around the shared processor.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
)
print("Data collator ready!")

# Step 10: Evaluation metric setup
print("\nStep 10: Setting up evaluation metrics...")

# Word-error-rate scorer loaded through `evaluate`; compute_metrics calls it.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits whose last
            axis is the vocabulary) and ``label_ids`` (reference token ids in
            which -100 marks positions to ignore).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is whatever
        ``wer_metric.compute`` returns for the decoded strings.
    """
    # Greedy decoding: take the highest-scoring token id at every position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 is not a real token id, so it is swapped for the pad id before
    # decoding. np.where builds a new array, leaving the caller's label_ids
    # untouched (the previous in-place assignment modified them).
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # Reference ids are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Normalize whitespace: collapse runs and strip leading/trailing blanks.
    pred_str = [" ".join(text.split()) for text in pred_str]
    label_str = [" ".join(text.split()) for text in label_str]

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

print("Evaluation metrics configured!")

# Step 11: Initialize trainer (unchanged)
print("\nStep 11: Initializing trainer...")

# NOTE(review): the feature extractor is handed over through `tokenizer=`;
# newer transformers releases appear to prefer a `processing_class=` argument
# for this - confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

print("Trainer initialized successfully!")
print("\nAll setup complete! Ready to start training.")

# Recap of the speed-oriented choices made in this cell.
print("\n=== SPEED OPTIMIZATION SUMMARY ===")
print(f"Dataset reduced to: {len(train_dataset)} training samples")
print(f"Validation set: {len(eval_dataset)} samples")
print("Key optimizations applied:")
print("\n".join([
    "1. EXTREMELY reduced dataset size (0.5% of original, max 500 samples)",
    "2. Very short audio files only (3 seconds max)",
    "3. Very short text only (50 characters max)",
    "4. Reduced epochs (30 → 3)",
    "5. Smaller batch size for stability",
    "6. Frequent logging for small dataset",
    "7. Tiny validation set (50 samples max)",
]))

print("\nExpected training time: 5-15 minutes!")
print("This is a minimal test to verify the pipeline works.")

# Step 12: Start training
print("\nStep 12: Starting training...")
print("Note: Training should be much faster now!")

# Training is intentionally not launched here; uncomment to start it:
# trainer.train()

print("\nTraining commands:")
print("\n".join([
    "1. To start training: trainer.train()",
    "2. To save the model: trainer.save_model('./final_model')",
    "3. To evaluate: trainer.evaluate()",
]))

# Quick training option (for testing)
print("\n=== QUICK TEST TRAINING ===")
print("For an even quicker test, you can:")
# The previous hint suggested 0.01, which is larger than the fraction this
# cell already uses, so it would have made the run slower rather than quicker.
print(f"1. Lower DATASET_FRACTION below its current value ({DATASET_FRACTION})")
print("2. Set num_train_epochs = 1")
# The argument is named `eval_strategy` in the installed transformers release
# (`evaluation_strategy` raises a TypeError), and disabling evaluation also
# requires turning off load_best_model_at_end.
print("3. Set eval_strategy = 'no' (and load_best_model_at_end = False)")
print("This will give you a very fast training run to test the pipeline!")

Step 4: Loading and preparing dataset...
Loading data from /content/asr_nepali
Found TSV file: /content/asr_nepali/utt_spk_text.tsv
