In [None]:
import os
import pandas as pd
import pickle

# Set directory for audio chunks and load csv with lyrics
chunks_dir = "C:\\Users\\dacla\\Documents\\DALI-chunks"
df_chunks = pd.read_csv("lyrics-chunks.csv")

df_chunks.head()

In [84]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Split by whitespace
tokenizer.pre_tokenizer = Whitespace()

# Byte-pair encoding
trainer = BpeTrainer(vocab_size=3000, min_frequency=5, special_tokens=["[PAD]", "[UNK]", "|"])

# Text body from the DALI lyrics database
file_path = "C:\\Users\\dacla\\Documents\\auto-censoring-local\\corpus.txt"

# Train the tokenizer
tokenizer.train([file_path], trainer)

# And save output
token_dir = "C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers"
tokenizer.save(f"{token_dir}\\tokenizer.json")
tokenizer.model.save(token_dir)

['C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers\\vocab.json',
 'C:\\Users\\dacla\\Documents\\auto-censoring-local\\tokenizers\\merges.txt']

In [85]:
# Test the tokenizer
encoded = tokenizer.encode("beans and legumes")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print()


print(f"ID for [PAD]: {tokenizer.token_to_id('[PAD]')}")
print(f"ID for [UNK]: {tokenizer.token_to_id('[UNK]')}")
print(f"ID for '|' (space): {tokenizer.token_to_id('|')}")

Tokens: ['be', 'ans', 'and', 'le', 'gu', 'mes']
IDs: [59, 1177, 53, 64, 350, 1227]

ID for [PAD]: 0
ID for [UNK]: 1
ID for '|' (space): 2


In [86]:
from transformers import Wav2Vec2CTCTokenizer

# Path to the files we just saved
vocab_file = ".\\tokenizers\\vocab.json"
merges_file = ".\\tokenizers\\merges.txt"

# Load the trained BPE files into the wav2vec2-specific tokenizer class
custom_tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_file,
    merges_file=merges_file,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|", # Crucial for wav2vec2
)

custom_tokenizer.save_pretrained(".\\tokenizers\\my_wav2vec2_bpe_tokenizer")

('.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\tokenizer_config.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\special_tokens_map.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\vocab.json',
 '.\\tokenizers\\my_wav2vec2_bpe_tokenizer\\added_tokens.json')

In [87]:
test_sentence = 'beans and legumes'

tokens = custom_tokenizer.tokenize(test_sentence)
print("Tokens:", tokens)

encoded = custom_tokenizer(test_sentence).input_ids
print("Encoded IDs:", encoded)

Tokens: ['bea', 'ns', 'and', 'leg', 'um', 'es']
Encoded IDs: [552, 2787, 53, 2836, 364, 135]


In [88]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# 1. Load your custom tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(".\\tokenizers\\my_wav2vec2_bpe_tokenizer")

# 2. Create a standard feature extractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)

# 3. Bundle them into a processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Save the processor for easy loading later
processor.save_pretrained("my_wav2vec2_processor")
print("Processor created and saved.")

Processor created and saved.


In [89]:
from transformers import Wav2Vec2ForCTC

model_name = "facebook/wav2vec2-base"
model = Wav2Vec2ForCTC.from_pretrained(model_name)

print(f"Old LM Head: {model.lm_head}")
print(f"Old Vocab Size (from config): {model.config.vocab_size}")

# 3. Manually replace the LM head
# Get the model's hidden size
hidden_size = model.config.hidden_size

# Create a new linear layer with the correct dimensions
new_lm_head = torch.nn.Linear(hidden_size, new_vocab_size)

# Replace the old lm_head with the new one
model.lm_head = new_lm_head

# 4. VERY IMPORTANT: Update the model's config to reflect the new vocab size
model.config.vocab_size = new_vocab_size
model.config.pad_token_id = processor.tokenizer.pad_token_id

print("-" * 20)
print(f"New LM Head: {model.lm_head}")
print(f"New Vocab Size (from config): {model.config.vocab_size}")

# It's a good practice to freeze the feature extractor part of the model
# during the initial phase of fine-tuning.
model.freeze_feature_encoder()

print("Model loaded and output layer resized for the new vocabulary.")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Old LM Head: Linear(in_features=768, out_features=32, bias=True)
Old Vocab Size (from config): 32
--------------------
New LM Head: Linear(in_features=768, out_features=3000, bias=True)
New Vocab Size (from config): 3000
Model loaded and output layer resized for the new vocabulary.
