In [68]:
import os
import torch
import librosa
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Tokenizer, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



In [None]:
# Project root. Set the ASR_PROJ_DIR environment variable to run the notebook
# on another machine; the fallback is the original hard-coded location.
proj_dir = os.environ.get("ASR_PROJ_DIR", r"C:\Users\evann\Documents\GitHub\ASR")
csv_file = os.path.join(proj_dir, "datasets", "cv-valid-train.csv")
audio_dir = os.path.join(proj_dir, "datasets")

# Fail early with a hint instead of a bare pandas error when the path is wrong.
if not os.path.isfile(csv_file):
    raise FileNotFoundError(
        f"Metadata CSV not found: {csv_file}. "
        "Set ASR_PROJ_DIR to the project root that contains 'datasets'."
    )

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file)

# Initialize the processor and model from the same checkpoint so the feature
# extractor, tokenizer and network always match.
pretrained_checkpoint = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(pretrained_checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(pretrained_checkpoint)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preprocess Data

In [None]:
def preprocess_audio(file_path, target_sr=16000):
    """Load one audio file and convert it to model input values.

    Args:
        file_path: Path to an audio file that ``librosa.load`` can decode.
        target_sr: Sample rate in Hz used both for resampling and for the
            ``sampling_rate`` passed to the processor. Defaults to 16000.
            NOTE(review): the processor is expected to reject a rate that
            differs from the one it was configured with - confirm before
            passing another value.

    Returns:
        torch.Tensor: the processor's ``input_values`` for this clip with the
        leading batch dimension removed.
    """
    # librosa decodes the file and resamples it to ``target_sr`` in one step.
    waveform, _ = librosa.load(file_path, sr=target_sr)

    # The numpy array goes to the processor as-is; ``return_tensors="pt"``
    # already yields a tensor, so no manual tensor conversion is needed first.
    audio = processor(waveform, sampling_rate=target_sr, return_tensors="pt", padding=True)

    # Drop only the batch axis; a bare squeeze() would also collapse any other
    # axis that happens to have length 1.
    return audio.input_values.squeeze(0)

In [59]:
def preprocess_text(text):
    """Tokenize one transcript into CTC label ids.

    Args:
        text: Transcript string for a single clip.

    Returns:
        torch.Tensor: 1-D tensor of token ids produced by ``processor.tokenizer``.
    """
    # The 960h checkpoints use an upper-case character vocabulary, so lower-case
    # transcripts would be tokenized as unknown tokens and training would learn
    # nothing useful.
    # NOTE(review): confirm against processor.tokenizer.get_vocab().
    normalized = text.upper()

    # squeeze(0) removes only the batch axis, so a one-token transcript stays 1-D.
    return processor.tokenizer(normalized, return_tensors="pt").input_ids.squeeze(0)


In [60]:
def map_to_dataset(row):
    """Build one training example from a metadata row.

    Reads ``row['filename']`` (joined onto ``audio_dir``) and ``row['text']``,
    and returns a dict with the processed audio under ``'input_values'`` and
    the tokenized transcript under ``'labels'``.
    """
    clip_path = os.path.join(audio_dir, row['filename'])
    return {
        'input_values': preprocess_audio(clip_path),
        'labels': preprocess_text(row['text']),
    }

In [None]:
# Run map_to_dataset once per row; the per-row results replace `df`.
df = df.apply(func=map_to_dataset, axis="columns")


In [None]:
# Split into 70% training and 30% validation.
# train_test_split keeps the original (now shuffled) index labels on each
# split. Downstream code looks samples up by position 0..len-1, so the index
# is reset to make those positions valid labels.
train_df, val_df = (
    split.reset_index(drop=True)
    for split in train_test_split(df, test_size=0.3, random_state=42)
)

### Create PyTorch Dataset for fine-tuning

In [None]:
class SpeechDataset(Dataset):
    """Map-style dataset over already pre-processed speech samples.

    ``dataset`` is any sized collection whose items can be indexed with
    ``"input_values"`` and ``"labels"`` (for example a pandas Series of dicts
    or a plain list of dicts).
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict of tensors."""
        source = self.dataset
        # The DataLoader hands out positions 0..len-1. On pandas objects plain
        # ``[]`` is label-based, which breaks once the index has been shuffled
        # or subset (e.g. after a train/validation split), so use ``.iloc``.
        sample = source.iloc[idx] if hasattr(source, "iloc") else source[idx]
        # as_tensor reuses values that are already tensors instead of copying
        # them (torch.tensor on a tensor copies and emits a warning).
        return {
            "input_values": torch.as_tensor(sample["input_values"]),
            "labels": torch.as_tensor(sample["labels"]),
        }

def _pad_collate(batch):
    """Pad a list of samples to a common length and stack them into a batch.

    Clips and transcripts differ in length, so the default collate function
    (which stacks tensors of equal shape) cannot batch them. Audio is padded
    with 0.0; labels are padded with -100, the value Wav2Vec2ForCTC masks out
    of the CTC loss.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    # reshape(-1) guards against 0-d tensors from single-element samples.
    input_values = pad(
        [sample["input_values"].reshape(-1) for sample in batch],
        batch_first=True,
        padding_value=0.0,
    )
    labels = pad(
        [sample["labels"].reshape(-1) for sample in batch],
        batch_first=True,
        padding_value=-100,
    )
    return {"input_values": input_values, "labels": labels}


# Create DataLoader instances
train_loader = DataLoader(
    SpeechDataset(train_df), batch_size=16, shuffle=True, collate_fn=_pad_collate
)
val_loader = DataLoader(SpeechDataset(val_df), batch_size=16, collate_fn=_pad_collate)

### Define optimizer and learning-rate scheduler

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# Length of the schedule: one scheduler step per batch, for every epoch
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs

# AdamW over all model parameters with a small fine-tuning learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

# Linear schedule without warmup, spanning the whole training run
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


### Fine-tune the model and save it as "wav2vec2-large-960h-cv"

In [None]:
import torch
from tqdm import tqdm

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Per-epoch average losses, consumed later for plotting
train_loss_list = []
val_loss_list = []

for epoch in range(num_epochs):
    # ---- training pass: one optimizer and scheduler step per batch ----
    model.train()
    running_train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        input_values = batch["input_values"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the loss with its outputs
        outputs = model(input_values, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        step_loss = loss.item()
        running_train_loss += step_loss
        progress_bar.set_postfix(loss=step_loss)

    avg_train_loss = running_train_loss / len(train_loader)
    train_loss_list.append(avg_train_loss)

    # ---- validation pass: no gradients, no parameter updates ----
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_values = batch["input_values"].to(device)
            labels = batch["labels"].to(device)
            running_val_loss += model(input_values, labels=labels).loss.item()

    avg_val_loss = running_val_loss / len(val_loader)
    val_loss_list.append(avg_val_loss)

    # Epoch summary, one line per metric
    print(
        f"Epoch {epoch+1}/{num_epochs}",
        f"Training Loss: {avg_train_loss:.4f}",
        f"Validation Loss: {avg_val_loss:.4f}",
        sep="\n",
    )


In [None]:
# Save the fine-tuned weights together with the processor (feature extractor
# and tokenizer) so the output directory can be loaded on its own for inference.
output_dir = "wav2vec2-large-960h-cv"
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)


### Evaluate the model on the test dataset

In [None]:
def _word_edit_distance(reference_words, hypothesis_words):
    """Return the Levenshtein distance between two lists of words.

    Substitutions, insertions and deletions each cost 1. Only two rows of the
    dynamic-programming table are kept at a time.
    """
    previous = list(range(len(hypothesis_words) + 1))
    for i, ref_word in enumerate(reference_words, start=1):
        current = [i]
        for j, hyp_word in enumerate(hypothesis_words, start=1):
            substitution = previous[j - 1] + (ref_word != hyp_word)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


# Load the test metadata through the same CSV pipeline used for training.
# NOTE(review): assumes a held-out "cv-valid-test.csv" sits next to the
# training CSV and has the same 'filename' / 'text' columns - confirm.
test_csv_file = os.path.join(proj_dir, "datasets", "cv-valid-test.csv")
test_dataset = pd.read_csv(test_csv_file)

# Evaluate on the test set one clip at a time, so no padding is needed and
# only a single clip is held in memory.
model.eval()
total_word_errors = 0
total_reference_words = 0
for _, row in test_dataset.iterrows():
    if pd.isna(row["text"]):
        continue  # no reference transcript to score against

    input_values = preprocess_audio(os.path.join(audio_dir, row["filename"]))
    input_values = input_values.reshape(1, -1).to(device)  # add the batch axis

    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Greedy CTC decoding of the most likely token per frame.
    hypothesis = processor.batch_decode(predicted_ids)[0]

    # Compare case-insensitively so letter case alone is never counted as an error.
    reference_words = str(row["text"]).upper().split()
    hypothesis_words = hypothesis.upper().split()
    total_word_errors += _word_edit_distance(reference_words, hypothesis_words)
    total_reference_words += len(reference_words)

# Corpus-level Word Error Rate: total word edits / total reference words.
test_wer = total_word_errors / max(total_reference_words, 1)
print(f"Test WER: {test_wer:.4f}")


### Visualize metrics

In [None]:
import matplotlib.pyplot as plt

# Draw both per-epoch loss curves on one set of axes
fig, ax = plt.subplots()
for losses, curve_label in (
    (train_loss_list, "Training Loss"),
    (val_loss_list, "Validation Loss"),
):
    ax.plot(losses, label=curve_label)

ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss")
ax.legend()
plt.show()
