Ideas ----
- To check for which songs are edited in the youtube downloads, download youtube transcription ctrl+f for curse words or "[ ___ ]"
- Order training set by size to minimize padding tokens?


In [5]:
import os
import pandas as pd
import pickle

# Set directory for audio chunks and load csv with lyrics
chunks_dir = "C:\\Users\\dacla\\Documents\\DALI-chunks"
df_chunks = pd.read_csv("lyrics-chunks-train.csv")

df_chunks.head()

Unnamed: 0,file,transcript,file-wav
0,ff3c695eb32e4197924e7786e8a7812f-7.mp3,all that i want is stillness of heart so i can...,ff3c695eb32e4197924e7786e8a7812f-7.wav
1,b3639d60a49e45578a201d910f36c44c-5.mp3,the first time to really feel alive the first ...,b3639d60a49e45578a201d910f36c44c-5.wav
2,81275468fa124185a222bf037764e925-1.mp3,a new day is rising the queen is laughing stil...,81275468fa124185a222bf037764e925-1.wav
3,0ea248a9588641749edeae319b6ed3ac-0.mp3,getting edgy all the time someone around me ju...,0ea248a9588641749edeae319b6ed3ac-0.wav
4,75ad1213a743497185a0b13cb11d4a37-4.mp3,therell never be a moment ill regret ive loved...,75ad1213a743497185a0b13cb11d4a37-4.wav


Create tokenizer

In [6]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Split by whitespace
tokenizer.pre_tokenizer = Whitespace()

# Byte-pair encoding
trainer = BpeTrainer(vocab_size=1000, min_frequency=5, special_tokens=["[PAD]", "[UNK]", "|"])

# Text body from the DALI lyrics database
file_path = "C:\\Users\\dacla\\Documents\\auto-censoring-local\\corpus.txt"

# Train the tokenizer
tokenizer.train([file_path], trainer)

# And save output
token_dir = "C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers"
tokenizer.save(f"{token_dir}\\tokenizer.json")
tokenizer.model.save(token_dir)

['C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers\\vocab.json',
 'C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers\\merges.txt']

In [7]:
# Test the tokenizer
encoded = tokenizer.encode("beans and legumes")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print()


print(f"ID for [PAD]: {tokenizer.token_to_id('[PAD]')}")
print(f"ID for [UNK]: {tokenizer.token_to_id('[UNK]')}")
print(f"ID for '|' (space): {tokenizer.token_to_id('|')}")

Tokens: ['be', 'an', 's', 'and', 'le', 'gu', 'me', 's']
IDs: [59, 42, 31, 53, 64, 350, 47, 31]

ID for [PAD]: 0
ID for [UNK]: 1
ID for '|' (space): 2


In [8]:
from transformers import Wav2Vec2CTCTokenizer

# Path to the files we just saved
vocab_file = ".\\tokenizers\\vocab.json"
merges_file = ".\\tokenizers\\merges.txt"

# Load the trained BPE files into the wav2vec2-specific tokenizer class
custom_tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_file,
    merges_file=merges_file,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|", # Crucial for wav2vec2
)

custom_tokenizer.save_pretrained(".\\tokenizers\\my_wav2vec2_bpe_tokenizer")

('.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\tokenizer_config.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\special_tokens_map.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\vocab.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\added_tokens.json')

In [9]:
test_sentence = 'beans and legumes'

tokens = custom_tokenizer.tokenize(test_sentence)
print("Tokens:", tokens)

encoded = custom_tokenizer(test_sentence).input_ids
print("Encoded IDs:", encoded)

Tokens: ['bea', 'n', 's', 'and', 'le', 'gu', 'me', 's']
Encoded IDs: [552, 26, 31, 53, 64, 350, 47, 31]


Create Processor using custom tokenizer

In [10]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# 1. Load your custom tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(".\\tokenizers\\my_wav2vec2_bpe_tokenizer")

# 2. Create a standard feature extractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)

# 3. Bundle them into a processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Save the processor for easy loading later
processor.save_pretrained("my_wav2vec2_processor")
print("Processor created and saved.")

Processor created and saved.


Prepare dataset

In [11]:
# metadata-wav has *only* those files which successfully converted to wav 16kHz mono
data = pd.read_csv('metadata-wav.csv')
data.head()

Unnamed: 0,transcript,file-wav
0,life is a moment in space when the dream is go...,001940b614eb43f4a0c826d49a67d66d-0.wav
1,i kiss the morning goodbye butdown inside you ...,001940b614eb43f4a0c826d49a67d66d-1.wav
2,the road is narrow and long when eyes meet eye...,001940b614eb43f4a0c826d49a67d66d-2.wav
3,i turn away from the wall i stumble and fall b...,001940b614eb43f4a0c826d49a67d66d-3.wav
4,i am a woman in love and id do anything to get...,001940b614eb43f4a0c826d49a67d66d-4.wav


In [None]:
from datasets import load_dataset, Audio, DatasetDict

dataset_path = "C:\\Users\\dacla\\Documents\\DALI-chunks-wav"

full_dataset = load_dataset("csv", data_files="metadata-wav.csv", split='train')
print("Full dataset", full_dataset)

# Make a train/test split at this point !
split_dataset = full_dataset.train_test_split(test_size=0.2, shuffle=True, seed=555)
print("\nSplit dataset", split_dataset)

Dataset({
    features: ['transcript', 'file-wav'],
    num_rows: 29656
})
DatasetDict({
    train: Dataset({
        features: ['transcript', 'file-wav'],
        num_rows: 23724
    })
    test: Dataset({
        features: ['transcript', 'file-wav'],
        num_rows: 5932
    })
})


In [206]:
import librosa # for faster comuputation?

def prepare_dataset(batch):
        audio_paths = [f"{dataset_path}\\{fname}" for fname in batch['file-wav']]
        audio_arrays = [librosa.load(path, sr=16000)[0] for path in audio_paths]

        # Needed for custom processor
        model_inputs = processor.feature_extractor(audio_arrays, sampling_rate=16000, padding="longest", return_tensors="pt")
        batch["input_values"] = model_inputs.input_values

        labels = processor.tokenizer(batch["transcript"], padding="longest").input_ids
        
        # CTC loss ignores labels with value -100
        batch["labels"] = [[label if label != processor.tokenizer.pad_token_id else -100 for label in T] for T in labels]
        return batch

prepared_dataset = split_dataset.map(prepare_dataset, 
                                     batched=True,
                                     batch_size=8, # 8 might be too big...check VRAM usage
                                     remove_columns=split_dataset['train'].column_names)

print(prepared_dataset)

Map:   0%|          | 0/23724 [00:00<?, ? examples/s]

Map:   0%|          | 0/5932 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 23724
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 5932
    })
})


In [None]:
# Save dataset to disc
prepared_dataset.save_to_disk('dataset_prepared')

Saving the dataset (0/120 shards):   0%|          | 0/23724 [00:00<?, ? examples/s]

Saving the dataset (0/30 shards):   0%|          | 0/5932 [00:00<?, ? examples/s]

Prepare for training


In [13]:
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

# --- Data Collator ---
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have to be of different lengths
        # and need different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [14]:
processed_dataset_path = "C:\\Users\\dacla\\Documents\\auto-censoring-local\\dataset_prepared"
model_checkpoint = "facebook/wav2vec2-base-960h"

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    # Decode predicted IDs to text
    pred_logits = pred.predictions
    pred_ids = torch.argmax(torch.from_numpy(pred_logits), dim=-1)
    
    # Decode true labels
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

Create model and rewrite classification layer


In [None]:
from transformers import Wav2Vec2ForCTC

new_vocab_size = processor.tokenizer.vocab_size

model_name = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name, ctc_loss_reduction='mean', pad_token_id=processor.tokenizer.pad_token_id)

print(f"Old LM Head: {model.lm_head}")
print(f"Old Vocab Size (from config): {model.config.vocab_size}")

### Manually replace the LM head
# Get the model's hidden size
hidden_size = model.config.hidden_size

# Create a new linear layer with the correct dimensions
new_lm_head = torch.nn.Linear(hidden_size, new_vocab_size)

# Replace the old lm_head with the new one
model.lm_head = new_lm_head

# Update the model's config to reflect the new vocab size
model.config.vocab_size = new_vocab_size
model.config.pad_token_id = processor.tokenizer.pad_token_id

print("-" * 20)
print(f"New LM Head: {model.lm_head}")
print(f"New Vocab Size (from config): {model.config.vocab_size}")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Old LM Head: Linear(in_features=768, out_features=32, bias=True)
Old Vocab Size (from config): 32
--------------------
New LM Head: Linear(in_features=768, out_features=1000, bias=True)
New Vocab Size (from config): 1000


In [17]:
# send to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# It's a good practice to freeze the feature extractor part of the model
model.freeze_feature_encoder()

print(f'Model {model_name} loaded on {device}')

Model facebook/wav2vec2-base-960h loaded on cuda


Downsample the dataset for testing the training loops


In [18]:
from datasets import load_from_disk

sample_percentage = 0.01 

# Load full prepared dataset
prepared_dataset_path = 'dataset_prepared'
prepared_datasets = load_from_disk(prepared_dataset_path)
print("--- Full Prepared Dataset ---")
print(prepared_datasets)

# Sample 1% from the training set
train_split = prepared_datasets["train"]
sampled_train_split = train_split.train_test_split(train_size=sample_percentage, shuffle=True, seed=555)['train'] # We only want the 'train' part of this new split

test_split = prepared_datasets["test"]
sampled_test_split = test_split.train_test_split(train_size=sample_percentage, shuffle=True, seed=555)['train'] 

# Overwrite the original splits with the sampled splits
prepared_datasets['train'] = sampled_train_split
prepared_datasets['test'] = sampled_test_split

print(f"\n--- Sampled ({sample_percentage*100}%) Dataset ---")
print(prepared_datasets)

# Now, use this smaller `prepared_datasets` object for the rest of your script
# (creating DataLoaders, etc.)

Loading dataset from disk:   0%|          | 0/120 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/30 [00:00<?, ?it/s]

--- Full Prepared Dataset ---
DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 23724
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 5932
    })
})

--- Sampled (1.0%) Dataset ---
DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 237
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 59
    })
})


Training parameters

In [19]:
from torch.utils.data import DataLoader
from transformers import get_scheduler
from torch.optim import AdamW

# Training parameters
learning_rate = .001
train_batch_size = 2
eval_batch_size = 4

# Defined train and test DLs
train_dataloader = DataLoader(
    prepared_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=train_batch_size
)

eval_dataloader = DataLoader(
    prepared_datasets["test"],
    collate_fn=data_collator,
    batch_size=eval_batch_size
)

optimizer = AdamW(model.parameters(), lr=learning_rate)
## LR scheduler...

Main training loop. Needs more automation for saving best models, etc. But I'm trying to just get it to do anything at the moment


In [None]:
from tqdm import tqdm 

num_epochs = 30 

for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train()
    for batch in tqdm(train_dataloader):
        # Move batch to the correct device
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # --- Evaluation Phase ---
    model.eval()
    all_predictions = []
    all_labels = []
    
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)
        
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predicted_ids.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())
        
    # Compute WER metric
    metric_result = compute_metrics(pred_ids=all_predictions, label_ids=all_labels)
    print(f"Epoch {epoch+1} WER: {metric_result['wer']:.4f}")



  9%|▉         | 11/119 [00:09<01:26,  1.25it/s]

-----------------------------

OLD

In [232]:
from transformers import get_scheduler
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=learning_rate)

num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1000, # A common choice
    num_training_steps=num_training_steps
)

# --- Gradient Scaler for Mixed Precision ---
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


Main training loop

In [233]:
# --- The Training Loop ---
progress_bar = tqdm(range(num_training_steps))
best_wer = float('inf')
output_dir = ".\\wav2vec2-custom-loop-best-model"

for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train()
    for batch in train_dataloader:
        # Move batch to the correct device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Use autocast for mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss
        
        # Backward pass
        scaler.scale(loss).backward()
        
        # Optimizer step
        lr_scheduler.step()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        
        progress_bar.update(1)
        progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # --- Evaluation Phase ---
    model.eval()
    all_predictions = []
    all_labels = []

    print(f"\n--- Running Evaluation for Epoch {epoch+1} ---")
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)
        
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predicted_ids.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())
        
    # Compute WER metric
    metric_result = compute_metrics(pred_ids=all_predictions, label_ids=all_labels)
    current_wer = metric_result['wer']
    print(f"Epoch {epoch+1} WER: {current_wer:.4f}")

    # --- Save the Best Model ---
    if current_wer < best_wer:
        best_wer = current_wer
        print(f"New best WER: {best_wer:.4f}. Saving model to {output_dir}")
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)

print("\n--- Training Finished ---")
print(f"Best WER achieved: {best_wer:.4f}")

Epoch 1, Loss: nan:   0%|          | 6/44490 [19:54:07<147553:46:06, 11941.23s/it]
  with torch.cuda.amp.autocast():


KeyboardInterrupt: 