In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
#!pip install -q huggingface_hub

In [None]:
#!pip install evaluate

In [16]:
import transformers
print(transformers.__version__)


4.52.4


In [18]:
from huggingface_hub import login

login(token=" ")


In [19]:
import os
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path

In [None]:
#!pip install -q transformers librosa datasets


In [154]:
# converting mp3 to wav


# input_dir = Path("/content/drive/MyDrive/audio-to-text/vocals")
# output_dir = Path("/content/drive/MyDrive/audio-to-text/hubert_ready_wavs")
# output_dir.mkdir(exist_ok=True)

# for mp3_file in input_dir.glob("*.mp3"):
#     y, sr = librosa.load(mp3_file, sr=16000, mono=True)
#     out_path = output_dir / (mp3_file.stem + ".wav")
#     sf.write(out_path, y, 16000)




In [63]:
import json

lyrics_dict = {}

with open("/content/drive/MyDrive/audio-to-text/lyrics.jsonl", "r") as f:
    for line in f:
        entry = json.loads(line)
        lyrics_dict.update(entry)


In [111]:
# wave_dir has wav files ready for to go

wave_dir = Path("/content/drive/MyDrive/audio-to-text/hubert_ready_wavs")


audio_paths = []
texts = []

for dali_id, lyrics in lyrics_dict.items():
    audio_file = os.path.join(wave_dir, f'vocals_{dali_id}.wav')
    if os.path.isfile(audio_file):
        audio_paths.append(audio_file)
        texts.append(lyrics)
    else:
        print(f'⚠️ Missing audio file for id: {dali_id}')


In [131]:
from datasets import Dataset, Features, Sequence, Value

# first data set construction

sampling_rate = 16000
waveforms = []

for path in audio_paths:
    wav, _ = librosa.load(path, sr=sampling_rate)
    waveforms.append(wav)

# Ensuring all waveforms are float32 NumPy arrays
waveforms = [np.array(wav, dtype=np.float32) for wav in waveforms]

# Defining the dataset schema explicitly
features = Features({
    "audio": Sequence(Value("float32")),
    "text": Value("string")
})

# Creating the dataset with schema enforcement
dataset = Dataset.from_dict({
    "audio": waveforms,
    "text": texts
}, features=features)


In [None]:
type(dataset['audio'][0])


In [156]:
# New chunking of songs

def chunk_audio_and_text(batch):
    chunked_audio = []
    chunked_texts = []

    CHUNK_SIZE = sampling_rate * 10  # 10 seconds

    for waveform, text in zip(batch["audio"], batch["text"]):
        waveform = np.array(waveform, dtype=np.float32)
        total_audio_len = len(waveform)

        # Splitting lyrics into words
        words = text.split()
        total_words = len(words)

        num_chunks = int(np.ceil(total_audio_len / CHUNK_SIZE))
        words_per_chunk = total_words // num_chunks if num_chunks > 0 else total_words

        for i, start_idx in enumerate(range(0, total_audio_len, CHUNK_SIZE)):
            end_idx = min(start_idx + CHUNK_SIZE, total_audio_len)
            chunk = waveform[start_idx:end_idx]
            max_val = np.abs(chunk).max()
            if max_val > 1e-6:
              chunk = chunk / max_val
            else:
    # handling zero or near-zero chunk (skip or set to zeros)
              chunk = chunk
            if len(chunk) < sampling_rate * 2:
                continue

            word_start = i * words_per_chunk
            # all remaining words in last chunk
            if i == num_chunks - 1:
                word_end = total_words
            else:
                word_end = (i + 1) * words_per_chunk

            text_chunk = " ".join(words[word_start:word_end]).strip()
            if len(text_chunk) < 5:  # skip very short chunks
                continue

            chunked_audio.append(chunk)
            chunked_texts.append(text_chunk)

    return {
        "audio": [np.array(a, dtype=np.float32) for a in chunked_audio],
        "text": chunked_texts
    }


In [None]:
# second dataset, to chunk the songs

dataset0 = dataset.map(
    chunk_audio_and_text,
    batched=True,
    remove_columns=["audio", "text"],
    num_proc=4
)


Map (num_proc=4):   0%|          | 0/119 [00:00<?, ? examples/s]

In [150]:
dataset0.save_to_disk("/content/drive/MyDrive/audio-to-text/chunked_dataset")


Saving the dataset (0/4 shards):   0%|          | 0/2814 [00:00<?, ? examples/s]

In [151]:
print(f"✅ Chunked dataset has {len(dataset0)} items.")
print("Type of audio:", type(dataset0[0]["audio"]))  # should be np.ndarray
print(np.min(dataset0[0]["audio"]))
print("Audio length (samples):", len(dataset0[0]["audio"]))
print("Text:", dataset0[0]["text"])

✅ Chunked dataset has 2814 items.
Type of audio: <class 'list'>
-1.0
Audio length (samples): 160000
Text: streets like a jungle so call the police following the herd down to greece on holiday love in the nineties is paranoid


In [117]:
# third dataset, passing everything through the processor

def prepare_dataset(batch):
    # Process audio (list of np.arrays)
    audio_inputs = processor(
        batch["audio"],
        sampling_rate=16000,
        return_attention_mask=False
    )

    # Processing text labels (list of strings)
    with processor.as_target_processor():
        label_inputs = processor.tokenizer(
            batch["text"],
            padding=False,
            return_tensors=None
        )

    return {
        "input_values": audio_inputs["input_values"],
        "labels": label_inputs["input_ids"]
    }



In [134]:
# Instantiating the processor

from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


dataset1 = dataset0.map(
    prepare_dataset,
    batched=True,
    remove_columns=["audio", "text"],
    num_proc=4
)


Map (num_proc=4):   0%|          | 0/2814 [00:00<?, ? examples/s]



In [119]:
from torch.utils.data import DataLoader

# This code is for debugging, data collator should be instantiated first (it's below)

# Usng your collator and dataset
dl = DataLoader(dataset1, batch_size=1, collate_fn=data_collator)

# Inspecting one batch
batch = next(iter(dl))
print(batch["input_values"].shape)


torch.Size([1, 160000])




In [32]:
dataset.save_to_disk("/content/drive/MyDrive/audio-to-text/chunked_dataset")


Saving the dataset (0/7 shards):   0%|          | 0/2777 [00:00<?, ? examples/s]

In [135]:
# More prinitng for debugging


for i in range(5):
    x = dataset1[i]
    print(f"audio len: {len(x['input_values'])}, label len: {len(x['labels'])}")


audio len: 160000, label len: 118
audio len: 160000, label len: 109
audio len: 160000, label len: 131
audio len: 160000, label len: 133
audio len: 160000, label len: 107


# Loading everything and training

In [20]:
from datasets import load_from_disk

# I didn't do it but I think each iteration of dataset instantiation should probably be saved
# and the lastone should be loaded here

dataset = load_from_disk(" ")


In [125]:
# Defining collator for padding:

from dataclasses import dataclass
from typing import List, Dict, Any
import torch

@dataclass
class DataCollatorCTCWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_features,
            padding=True,
            return_tensors="pt"
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=True,
                return_tensors="pt"
            )

        # Replacing padding with -100 to ignore loss on padded labels
        labels = labels_batch["input_ids"].masked_fill(labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100)

        batch["labels"] = labels
        return batch


In [136]:
batch = data_collator([dataset1[i] for i in range(2)])

print("Batch input_values shape:", batch["input_values"].shape)
print("Batch labels shape:", batch["labels"].shape)
print("First example labels:", batch["labels"][0])
print("Label contains only -100?:", torch.all(batch["labels"][0] == -100))


Batch input_values shape: torch.Size([2, 160000])
Batch labels shape: torch.Size([2, 118])
First example labels: tensor([3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3,
        4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3,
        3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3])
Label contains only -100?: tensor(False)




In [127]:
from transformers import TrainingArguments, Trainer, HubertForCTC, Wav2Vec2Processor
from sklearn.model_selection import KFold
import evaluate
import numpy as np
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
from transformers import Wav2Vec2ForCTC

# Hubert processor and model, which I'm not using
# processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
# model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")


data_collator = DataCollatorCTCWithPadding(processor)


# Loadinging processor and base model
base_model_path = "facebook/wav2vec2-base-960h"
model_class = Wav2Vec2ForCTC

# Loading WER metric
wer_metric = evaluate.load("wer")

# Computing metrics function
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Decoding predictions
    pred_str = processor.batch_decode(pred_ids)

    # Decoding references
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
# Number of folds
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Recording WERs per fold
wer_scores = []


In [128]:
# this code is for debugging, as you see the forward pass returns nothing
# which I have been trying to fix for hours now

model = model_class.from_pretrained(base_model_path)
model.freeze_feature_extractor()

batch = data_collator([dataset1[i] for i in range(2)])

model.train()
outputs = model(input_values=batch["input_values"], labels=batch["labels"])
print("Manual forward pass loss:", outputs.loss.item())



Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Manual forward pass loss: nan


In [137]:
# more printing for debugging

print("First example labels (after padding):", batch["labels"][0])
print("Are all labels -100?", torch.all(batch["labels"][0] == -100).item())
print("Number of non -100 tokens:", (batch["labels"][0] != -100).sum().item())


First example labels (after padding): tensor([3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3,
        4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3,
        3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3])
Are all labels -100? False
Number of non -100 tokens: 118


In [138]:
print("Audio min/max:", batch["input_values"][0].min().item(), batch["input_values"][0].max().item())


Audio min/max: -2.5152454376220703 20.475688934326172


In [89]:
print("Tokenizer vocab size:", processor.tokenizer.vocab_size)
print("Max label ID in batch:", max([token_id for seq in batch["labels"] for token_id in seq if token_id != -100]))


Tokenizer vocab size: 32
Max label ID in batch: tensor(27)


In [83]:
# Training arguments (same across folds)
training_args = TrainingArguments(
    output_dir="./hubert-finetuned-lyrics",
    per_device_train_batch_size=1,
    eval_strategy="no",  # manual evaluation
    num_train_epochs=10,
    logging_steps=10,
    save_steps=500,
    fp16=torch.cuda.is_available(),
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
    gradient_checkpointing=True,
    gradient_accumulation_steps=8,
    report_to="none",
    remove_unused_columns=False,
)

# Training in folds

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset1)):
    print(f"\n Fold {fold+1}/{k}")
    train_dataset = dataset1.select(train_idx.tolist())
    val_dataset = dataset1.select(val_idx.tolist())

    # Loading model and freezing layers
    model = model_class.from_pretrained(base_model_path)
    model.freeze_feature_extractor()

    for name, param in model.named_parameters():
        if "encoder.layers." in name:
            param.requires_grad = False

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=processor,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    eval_result = trainer.evaluate()
    wer = eval_result.get("eval_wer", None)
    print(f" Fold {fold+1} WER: {wer:.4f}" if wer is not None else "No WER computed")
    wer_scores.append(wer)

# You see how the loss is zeor because the forward pass is not working correctly


 Fold 1/5


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0


KeyboardInterrupt: 