In [None]:
# cv-train-2a.ipynb

# 🧪 1. Setup
# %pip install datasets transformers torchaudio jiwer evaluate --quiet

In [1]:
# 📦 2. Imports
import os

import evaluate
import numpy as np
import pandas as pd
import torch
import torchaudio
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Trainer

In [2]:
# 🧹 3. Load CSV
# Locations of the training metadata and of the directory the CSV's
# relative file names are resolved against.
csv_path = "../asr/common_voice/cv-valid-train.csv"
audio_dir = "../asr/common_voice/cv-valid-train"

df = pd.read_csv(csv_path)
# Each "filename" entry is joined onto audio_dir to get a loadable path.
df["audio_path"] = [os.path.join(audio_dir, name) for name in df["filename"]]

In [3]:
# Preview the first five rows to sanity-check the parsed columns and paths.
df.head(n=5)

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,audio_path
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,,../asr/common_voice/cv-valid-train/cv-valid-tr...
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,,../asr/common_voice/cv-valid-train/cv-valid-tr...
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,,../asr/common_voice/cv-valid-train/cv-valid-tr...
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,,../asr/common_voice/cv-valid-train/cv-valid-tr...
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,,../asr/common_voice/cv-valid-train/cv-valid-tr...


In [4]:
# ✂️ 4. Train/Validation Split
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
train_df, val_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [5]:
# 📚 5. Wrap as HuggingFace Datasets
def df_to_dataset(df):
    """Build a HuggingFace ``Dataset`` from a metadata DataFrame.

    Args:
        df: DataFrame providing an ``audio_path`` and a ``text`` column.

    Returns:
        A ``Dataset`` with two columns: ``path`` (taken from ``audio_path``)
        and ``sentence`` (taken from ``text``).
    """
    # Target dataset column -> source DataFrame column.
    column_map = {"path": "audio_path", "sentence": "text"}
    payload = {target: df[source].tolist() for target, source in column_map.items()}
    return Dataset.from_dict(payload)

# Wrap both splits with the same helper so their schemas stay identical.
train_ds, val_ds = (df_to_dataset(frame) for frame in (train_df, val_df))

In [15]:
# 🔊 6. Preprocessing Functions
# The processor bundles the feature extractor (audio -> input_values) and the
# CTC tokenizer (text <-> label ids) that belong to this checkpoint.
processor = Wav2Vec2Processor.from_pretrained(
    pretrained_model_name_or_path="facebook/wav2vec2-large-960h"
)

from pydub import AudioSegment
import os

def convert_all_mp3_to_wav(folder, overwrite=False):
    """Convert every ``.mp3`` file directly inside ``folder`` to a sibling ``.wav``.

    The conversion is restartable: a ``.wav`` that already exists is left
    alone unless ``overwrite`` is true, and each file is first written to a
    temporary ``.part`` name and only renamed once the export finished. An
    interrupted run (for example "No space left on device") therefore never
    leaves a truncated ``.wav`` behind.

    Args:
        folder: Directory to scan. Sub-directories are not visited.
        overwrite: Re-create ``.wav`` files that already exist. Use this once
            if earlier runs may have left incomplete ``.wav`` files.

    Returns:
        None.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be written.
        Any exception raised by pydub while decoding an ``.mp3``.
    """
    # Sorted so progress is deterministic and an aborted run is easy to locate.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(".mp3"):
            continue

        mp3_path = os.path.join(folder, file)
        # Swap only the extension; str.replace would also rewrite ".mp3"
        # appearing anywhere else in the path.
        wav_path = os.path.splitext(mp3_path)[0] + ".wav"
        if os.path.exists(wav_path) and not overwrite:
            continue

        tmp_path = wav_path + ".part"
        try:
            audio = AudioSegment.from_mp3(mp3_path)
            # export() hands back the open output file; close it so the data
            # is flushed before the rename and no descriptor is leaked.
            audio.export(tmp_path, format="wav").close()
            os.replace(tmp_path, wav_path)
        except BaseException:
            # Do not leave partial output behind, then surface the error.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

# Writes one uncompressed .wav next to every .mp3 in the clip folder; the
# recorded run of this cell stopped with "No space left on device", so make
# sure enough free disk space is available before executing it.
convert_all_mp3_to_wav(folder="../asr/common_voice/cv-valid-train/cv-valid-train")

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


OSError: [Errno 28] No space left on device: '../asr/common_voice/cv-valid-train/cv-valid-train/sample-144461.wav'

In [None]:
def preprocess(batch):
    """Turn one dataset row into model inputs and CTC label ids.

    Args:
        batch: A single example with ``path`` (audio file) and ``sentence``
            (reference transcript).

    Returns:
        The example extended with ``input_values`` and ``labels``. If the
        clip cannot be loaded or tokenized, a dict with both set to ``None``
        is returned so the failure is visible instead of aborting ``map``;
        such rows must be dropped before training.
    """
    try:
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
            speech_array = resampler(speech_array)
        # Average over the channel axis: squeeze() only handles mono and
        # would leave a stereo clip two-dimensional.
        waveform = speech_array.mean(dim=0).numpy()
        batch["input_values"] = processor(waveform, sampling_rate=16000).input_values[0]
        # The wav2vec2 960h checkpoints use an upper-case character
        # vocabulary; lower-case transcripts would be encoded as <unk>.
        batch["labels"] = processor.tokenizer(batch["sentence"].upper()).input_ids
        return batch
    except Exception as e:
        print(f"Skipping file {batch['path']}: {e}")
        return {"input_values": None, "labels": None}

# preprocess() marks clips it could not load with labels=None. Drop those
# rows right away; left in place they would break batching during training.
# Filtering on "labels" avoids reading the much larger "input_values" column.
train_ds = train_ds.map(preprocess, num_proc=4, load_from_cache_file=False).filter(
    lambda labels: labels is not None, input_columns=["labels"]
)
val_ds = val_ds.map(preprocess, num_proc=4, load_from_cache_file=False).filter(
    lambda labels: labels is not None, input_columns=["labels"]
)

In [None]:
# 🧠 7. Load Model
# Start from the same checkpoint the processor was loaded from so the CTC
# head and the tokenizer vocabulary line up.
model = Wav2Vec2ForCTC.from_pretrained(
    pretrained_model_name_or_path="facebook/wav2vec2-large-960h"
)

In [None]:
# ⚙️ 8. Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./wav2vec2-large-960h-cv",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch; log every 10 steps.
    evaluation_strategy="epoch",
    logging_steps=10,
    # Checkpoint every 500 steps, keeping only the two most recent.
    save_steps=500,
    save_total_limit=2,
)

In [None]:
# 📏 9. Metric
# Word error rate metric, used by compute_metrics during evaluation.
wer = evaluate.load(path="wer")

def compute_metrics(pred):
    """Compute word error rate for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (per-frame logits)
            and ``label_ids`` (reference token ids).

    Returns:
        ``{"wer": <word error rate>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    # Label padding uses -100 (the value the loss ignores), which is not a
    # valid token id. Map it to the pad token so decoding drops it; np.where
    # builds a new array instead of mutating pred.label_ids.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )
    pred_str = processor.batch_decode(pred_ids)
    # References are plain text, so repeated characters must not be collapsed.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {"wer": wer.compute(predictions=pred_str, references=label_str)}

In [None]:
# 🏋️ 10. Trainer
def _ctc_collate(features):
    """Pad a list of preprocessed examples into one CTC training batch.

    Audio and transcripts have different lengths and need different padding,
    so they are padded separately: audio by the feature extractor, label ids
    by the tokenizer.
    """
    audio = [{"input_values": example["input_values"]} for example in features]
    transcripts = [{"input_ids": example["labels"]} for example in features]

    batch = processor.feature_extractor.pad(audio, padding=True, return_tensors="pt")
    padded = processor.tokenizer.pad(transcripts, padding=True, return_tensors="pt")
    # Padded label positions are set to -100 so the CTC loss ignores them.
    batch["labels"] = padded["input_ids"].masked_fill(
        padded["attention_mask"].ne(1), -100
    )
    return batch


# Without an explicit collator the Trainer pads with the feature extractor
# only, which cannot batch the variable-length "labels" lists.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=_ctc_collate,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# 💾 11. Save Fine-Tuned Model
# Store the weights and the processor files in the same directory so the
# fine-tuned model can be reloaded from a single path.
model.save_pretrained(save_directory="./wav2vec2-large-960h-cv")
processor.save_pretrained(save_directory="./wav2vec2-large-960h-cv")

In [None]:
# 🔍 12. Load and Transcribe Test Set
# Mirror the layout used for the training split: the CSV sits next to the
# clip directory under ../asr/common_voice.
# NOTE(review): the previous value pointed at ../common_voice/cv-valid-test/,
# which disagrees with the training cell - confirm against the data on disk.
test_csv = "../asr/common_voice/cv-valid-test.csv"
test_dir = "../asr/common_voice/cv-valid-test"
test_df = pd.read_csv(test_csv)

# The training CSV names its relative-path column "filename"; fall back to
# "path" only if this file really uses that name.
path_column = "filename" if "filename" in test_df.columns else "path"
test_df["audio_path"] = test_df[path_column].apply(lambda x: os.path.join(test_dir, x))

def transcribe(file_path):
    """Transcribe a single audio file with the fine-tuned model.

    The audio goes through the same steps as the training data (resample to
    16 kHz, average channels to mono) before greedy CTC decoding.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription string.
    """
    speech_array, sampling_rate = torchaudio.load(file_path)
    # The processor is told the audio is 16 kHz, so it has to actually be
    # 16 kHz; resample anything else, exactly as preprocess() does.
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    # Average over channels; squeeze() would leave stereo two-dimensional.
    waveform = speech_array.mean(dim=0).numpy()

    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    # Training may have moved the model to an accelerator; inputs must follow.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # make sure dropout is off for inference
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]
    return transcription

# Transcribe clip by clip and keep the result next to the reference columns.
test_df["generated_text"] = test_df["audio_path"].map(transcribe)

# Persist without the positional index so the CSV keeps its input schema
# plus the two added columns.
test_df.to_csv(path_or_buf="cv-valid-test-with-predictions.csv", index=False)
print("✅ Test predictions saved to cv-valid-test-with-predictions.csv")