In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torchaudio
import pandas as pd
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, get_scheduler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Get audio directory and cv-valid-train.csv
audio_dir = "../datasets/"
train_df = pd.read_csv('../datasets/cv-valid-train.csv')
train_df = train_df[['filename', 'text']]

In [None]:
# Initialize the processor and model
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### This section is to preprocess the training dataset

In [None]:
# 1. Optimized Audio Preprocessing (Using torchaudio + GPU Support)
def preprocess_audio(file_path):
    """Load, resample, and convert audio to tensor format"""
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample if necessary
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    waveform = waveform.squeeze()  # Remove extra channel dimension if necessary
    waveform = waveform.to("cuda" if torch.cuda.is_available() else "cpu")  # Move to GPU if available

    # Process with Hugging Face Wav2Vec2 processor
    audio = processor(waveform.to("cpu"), sampling_rate=16000, return_tensors="pt", padding=True)

    return audio.input_values.squeeze()  # Return processed tensor

# 2. Optimized Text Preprocessing
def preprocess_text(text):
    """Tokenize text labels"""
    # Convert text to uppercase as processor's vocab labels are uppercase
    labels = processor.tokenizer(text.upper(), return_tensors="pt").input_ids.squeeze()
    return labels

# 3. Mapping Function for Multiprocessing (no lambda here)
def map_to_dataset(row, audio_dir):
    """Process audio and text for parallel execution"""
    #print(f"Processing row: {row['filename']}")
    audio_path = os.path.join(audio_dir, row['filename'])
    input_values = preprocess_audio(audio_path)  # Audio Processing
    labels = preprocess_text(row['text'])  # Text Tokenization (of given transcript)

    return {'input_values': input_values, 'labels': labels}

# 4. Parallel Processing using ProcessPoolExecutor
def preprocess_data_parallel(data, audio_dir, num_workers=4):
    """Process data in parallel using multiprocessing"""
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(executor.map(process_row, data, [audio_dir]*len(data)))

    return results

# Helper function to avoid lambda
def process_row(row, audio_dir):
    return map_to_dataset(row, audio_dir)

In [None]:
# Process a subset of 10k rows of data
df_subset = train_df[:10000].copy()

# Convert DataFrame to a list of dictionaries
mp.set_start_method('spawn', force=True)
dataset = df_subset.to_dict(orient='records')
train_data, val_data = train_test_split(dataset, test_size=0.3, random_state=42)

# Process training and validation data
train_processed_data = preprocess_data_parallel(train_data, audio_dir)
val_processed_data = preprocess_data_parallel(val_data, audio_dir)


In [None]:
# Save training and validation data
#torch.save(train_processed_data, '/content/drive/My Drive/asr-train/train_processed_data.pt')
#torch.save(val_processed_data, '/content/drive/My Drive/asr-train/val_processed_data.pt')
#print("✅ Processed dataset saved!")


### This section is to fine-tune the model 

In [None]:
# Load saved data (instead of reprocessing large amounts of data every session)
#train_processed_data = torch.load('/content/drive/My Drive/train_processed_data.pt')
#val_processed_data = torch.load('/content/drive/My Drive/val_processed_data.pt')
#print("✅ Processed dataset loaded!")

In [None]:
def data_collator(batch):
    # Pad audio sequences (input_values) in the batch
    input_values = [item['input_values'] for item in batch]
    input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0)

    # Pad text sequences (labels) in the batch
    labels = [item['labels'] for item in batch]
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=processor.tokenizer.pad_token_id)

    return {'input_values': input_values_padded, 'labels': labels_padded}


class AudioTextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Each item is a dictionary {'input_values': ..., 'labels': ...}
        return self.data[idx]

In [None]:
# Create DataLoader for batching and shuffling
train_dataset = AudioTextDataset(train_processed_data)
val_dataset = AudioTextDataset(val_processed_data)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=data_collator)

In [None]:
# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

# Learning rate scheduler
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

In [None]:
# Move model to device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

train_loss_list = []
val_loss_list = []

# Gradient checkpointing to reduce memory usage
model.gradient_checkpointing_enable()

# GradScaler for mixed precision training
scaler = GradScaler()

# Directory to save model checkpoints
checkpoint_dir = "./test_checkpoints/"
os.makedirs(checkpoint_dir, exist_ok=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        torch.cuda.empty_cache() # Free memory before processing each batch
        input_values = batch["input_values"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Mixed precision
        with autocast():
            # Forward pass
            outputs = model(input_values, labels=labels)
            loss = outputs.loss

        # Backward pass
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradient by clipping
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        running_train_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    # Validation loop
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_values = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            with autocast():
                outputs = model(input_values, labels=labels)
                running_val_loss += outputs.loss.item()

    # Calculate average training loss for the epoch
    avg_train_loss = running_train_loss / len(train_loader)
    train_loss_list.append(avg_train_loss)

    # Calculate average validation loss for the epoch
    avg_val_loss = running_val_loss / len(val_loader)
    val_loss_list.append(avg_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # **Save model checkpoint**
    checkpoint_path = f"{checkpoint_dir}/model_epoch_{epoch+1}.pth"
    torch.save({
        'epoch': epoch+1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss
    }, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")


Using device: cuda


  scaler = GradScaler()
  with autocast():
Epoch 1/5: 100%|██████████| 1750/1750 [27:41<00:00,  1.05it/s, loss=113]
  with autocast():


Epoch 1/5
Training Loss: -115.7268
Validation Loss: -133.2053
Checkpoint saved: checkpoints/model_epoch_1.pth


Epoch 2/5: 100%|██████████| 1750/1750 [13:44<00:00,  2.12it/s, loss=-370]


Epoch 2/5
Training Loss: inf
Validation Loss: -133.4147
Checkpoint saved: checkpoints/model_epoch_2.pth


Epoch 3/5: 100%|██████████| 1750/1750 [13:32<00:00,  2.15it/s, loss=-224]


Epoch 3/5
Training Loss: -162.2955
Validation Loss: -143.6807
Checkpoint saved: checkpoints/model_epoch_3.pth


Epoch 4/5: 100%|██████████| 1750/1750 [13:28<00:00,  2.16it/s, loss=-161]


Epoch 4/5
Training Loss: -169.4942
Validation Loss: -135.3436
Checkpoint saved: checkpoints/model_epoch_4.pth


Epoch 5/5: 100%|██████████| 1750/1750 [13:28<00:00,  2.16it/s, loss=-323]


Epoch 5/5
Training Loss: -176.0803
Validation Loss: -138.2690
Checkpoint saved: checkpoints/model_epoch_5.pth


In [None]:
# Save the fine-tuned model
model.save_pretrained("./wav2vec2-large-960h-cv")
print("✅ Fine-tuned model saved!")

In [None]:
# Function to transcribe audio for test dataset using fine-tuned model
def transcribe_audio(file_path):
    audio_path = os.path.join(audio_dir, file_path)
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if necessary
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    waveform = waveform.squeeze()
    waveform = waveform.to("cuda" if torch.cuda.is_available() else "cpu")

    # Convert to tensor using processor
    input_values = processor(waveform.to("cpu"), return_tensors="pt", sampling_rate=16000).input_values

    # Perform inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Get predicted transcription
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    print(f"Audio for {file_path} transcribed!")

    return transcription

# Function to transcribe a single file
def transcribe_file(index, row):
    file_path = row["filename"]
    transcription = transcribe_audio(file_path)  # I/O operation
    return index, transcription.lower()

### This section is to evaluate model on cv-valid-test

In [4]:
# Load test data
test_df = pd.read_csv('../datasets/cv-valid-test.csv')

In [None]:
# Use ThreadPoolExecutor for parallel processing (best for I/O-bound tasks)
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust based on CPU
    results = list(executor.map(lambda x: transcribe_file(*x), test_df.iterrows()))

# Update DataFrame with transcriptions
for index, transcription in results:
    test_df.at[index, 'predicted_transcription'] = transcription

# Save results
test_df.to_csv('../datasets/cv-valid-test.csv', index=False)


In [None]:
from jiwer import wer, cer

# Calculate WER and CER
wer_value = wer(test_df["text"].tolist(), test_df["predicted_transcription"].tolist())
cer_value = cer(test_df["text"].tolist(), test_df["predicted_transcription"].tolist())
print("cv-valid-test:")
print(f"Word Error Rate (WER): {wer_value * 100:.2f}%")
print(f"Character Error Rate (CER): {cer_value * 100:.2f}%")


cv-valid-test:

Word Error Rate (WER): 7.96%
Character Error Rate (CER): 3.49%


Evaluating the model on cv-valid-test:

Word Error Rate (WER): 7.96% <br>
Character Error Rate (CER): 3.49%

### For Question  4


In [None]:
# Use model to predict transcriptions on cv-valid-dev
dev_df = pd.read_csv('../datasets/cv-valid-dev.csv')

In [None]:
# Use ThreadPoolExecutor for parallel processing (best for I/O-bound tasks)
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust based on CPU
    results = list(executor.map(lambda x: transcribe_file(*x), dev_df.iterrows()))

# Update DataFrame with transcriptions
for index, transcription in results:
    dev_df.at[index, 'predicted_transcription'] = transcription

# Save results
dev_df.to_csv('../datasets/cv-valid-dev.csv', index=False)

In [38]:
# Calculate WER and CER of generated text from Task 2 and this task

# Remove any empty rows
dev_df = dev_df.dropna(subset=['generated_text','predicted_transcription'])

# Generated text from pretrained model
pretrained_wer_value = wer(dev_df["text"].tolist(), dev_df["generated_text"].tolist())
pretrained_cer_value = cer(dev_df["text"].tolist(), dev_df["generated_text"].tolist())
print("Results from Task 2a for the cv-valid-dev (pretrained model) :")
print(f"Word Error Rate (WER): {pretrained_wer_value * 100:.2f}%")
print(f"Character Error Rate (CER): {pretrained_cer_value * 100:.2f}%")

# Predicted transcription from pretrained model
finetuned_wer_value = wer(dev_df["text"].tolist(), dev_df["predicted_transcription"].tolist())
finetuned_cer_value = cer(dev_df["text"].tolist(), dev_df["predicted_transcription"].tolist())
print("\nResults from Task 4 for the cv-valid-dev (finetuned model) :")
print(f"Word Error Rate (WER): {finetuned_wer_value * 100:.2f}%")
print(f"Character Error Rate (CER): {finetuned_cer_value * 100:.2f}%")

Results from Task 2a for the cv-valid-dev (pretrained model) :
Word Error Rate (WER): 10.81%
Character Error Rate (CER): 4.52%

Results from Task 4 for the cv-valid-dev (finetuned model) :
Word Error Rate (WER): 8.03%
Character Error Rate (CER): 3.43%


Results from Task 2a for the cv-valid-dev (pretrained model):

Word Error Rate (WER): 10.81% <br>
Character Error Rate (CER): 4.52%

<br>
Results from Task 4 for the cv-valid-dev (finetuned model):

Word Error Rate (WER): 8.03% <br>
Character Error Rate (CER): 3.43%