In [11]:
import chardet
import os
import pandas as pd
import torch
import torchaudio.transforms as T

from datasets import load_dataset, Dataset, Audio
from dataclasses import dataclass
from torch.utils.data import DataLoader
from transformers import TrainerCallback, TrainingArguments, Trainer, WhisperForConditionalGeneration, WhisperProcessor
from typing import Any, Dict, List, Union

# Use CUDA when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint identifier used for both the processor (here) and the model (in finetune_model).
MODEL_BRAND = "openai/whisper-tiny"
MODEL_PROCESSOR = WhisperProcessor.from_pretrained(MODEL_BRAND)

# All locations are resolved against the working directory at execution time.
ROOT_PATH = os.getcwd()
_DATASET_DIR = os.path.join(ROOT_PATH, "dataset")
_CHANNEL_DIR = os.path.join(_DATASET_DIR, "Channel0")

MODELS_PATH = os.path.join(ROOT_PATH, "models")
LEXICON_PATH = os.path.join(_DATASET_DIR, "lexicon.txt")
AUDIO_PATH = os.path.join(_CHANNEL_DIR, "audio")
SCRIPTS_PATH = os.path.join(_CHANNEL_DIR, "scripts")
TRANSCRIPTIONS_PATH = os.path.join(_CHANNEL_DIR, "transcriptions.csv")
INPUTS_PATH = os.path.join(ROOT_PATH, "inputs")

def add_new_tokens():
	"""Extend the tokenizer of ``MODEL_PROCESSOR`` with words from the lexicon.

	Reads ``LEXICON_PATH`` as UTF-8 text, one entry per line with tab-separated
	fields, and collects the first field of every line that has at least two
	fields. Words that are not already keys of the tokenizer vocabulary are
	added as new tokens; the processor is then saved to ``MODELS_PATH`` and
	reloaded from there, rebinding the module-level ``MODEL_PROCESSOR``.

	Prints how many tokens were added, or that there was nothing to add.
	"""
	global MODEL_PROCESSOR

	lexicon_words = set()
	with open(LEXICON_PATH, "r", encoding="utf-8") as f:
		for lexicon_line in f:
			fields = lexicon_line.strip().split("\t")
			if len(fields) < 2:
				# Lines without a second tab-separated field are ignored.
				continue
			lexicon_words.add(fields[0])

	# Iterating the vocabulary dict yields its keys (the token strings).
	known_tokens = MODEL_PROCESSOR.tokenizer.get_vocab()
	# Set iteration order is not stable between interpreter runs, so the words
	# are sorted to hand them to add_tokens in the same order every time.
	novel_tokens_list = sorted(lexicon_words.difference(known_tokens))

	if novel_tokens_list:
		print(f"Adding {len(novel_tokens_list)} new tokens")
		MODEL_PROCESSOR.tokenizer.add_tokens(novel_tokens_list)
		MODEL_PROCESSOR.save_pretrained(MODELS_PATH)
		MODEL_PROCESSOR = WhisperProcessor.from_pretrained(MODELS_PATH)
	else:
		print("No new tokens to add")

# Runs as soon as this cell executes; it may rebind MODEL_PROCESSOR, which the
# preprocessing and training code further down reads.
add_new_tokens()

def detect_encoding(path):
	"""Guess the text encoding of the file at *path* with ``chardet``.

	Only the first 100000 bytes of the file are inspected. Returns the
	``"encoding"`` entry of the ``chardet.detect`` result; the caller in this
	file treats a falsy value as "could not be detected".
	"""
	with open(path, "rb") as raw_file:
		sample = raw_file.read(100000)
	detection = chardet.detect(sample)
	return detection["encoding"]

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
	"""Collate per-example audio features and label ids into one padded batch."""

	# Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
	processor: Any

	def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
		"""Pad the ``input_features`` and ``labels`` of *features* separately.

		Each element of *features* must provide ``"input_features"`` and
		``"labels"``. The returned batch is the padded audio batch with an
		extra ``"labels"`` entry in which padded positions are set to -100.
		"""
		audio_inputs = []
		label_inputs = []
		for example in features:
			audio_inputs.append({"input_features": example["input_features"]})
			label_inputs.append({"input_ids": example["labels"]})

		batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
		padded_labels = self.processor.tokenizer.pad(label_inputs, return_tensors="pt", padding=True)

		# Positions whose attention mask is not 1 were added by padding; they are
		# overwritten with -100 (presumably the value the loss ignores - confirm).
		is_padding = padded_labels.attention_mask.ne(1)
		batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
		return batch

def write_transcriptions():
	"""Build the transcription CSV from the script files in ``SCRIPTS_PATH``.

	Every regular file in ``SCRIPTS_PATH`` is decoded with the encoding
	reported by ``detect_encoding`` and read line by line. A line is used only
	if it consists of exactly two tab-separated fields, ``identifier`` and
	``transcript``, and ``<AUDIO_PATH>/<identifier>.WAV`` exists. The matching
	pairs are written to ``TRANSCRIPTIONS_PATH`` with the columns
	``audio_path`` and ``transcript``.

	Files whose encoding cannot be detected, or that cannot be opened or
	decoded, are skipped; the latter are reported on stdout.
	"""
	transcriptions_list = []

	# os.listdir returns entries in arbitrary order; sorting keeps the row
	# order of the CSV the same from run to run.
	for script_name in sorted(os.listdir(SCRIPTS_PATH)):
		script_path = os.path.join(SCRIPTS_PATH, script_name)
		if not os.path.isfile(script_path):
			# Sub-directories and other non-files cannot be read as scripts.
			continue
		try:
			script_encoding = detect_encoding(script_path)
			if not script_encoding:
				continue
			with open(script_path, "r", encoding=script_encoding) as f:
				script_lines = f.readlines()
		except (OSError, UnicodeError, LookupError) as e:
			# OSError: unreadable file; UnicodeError: wrong encoding guess;
			# LookupError: encoding name unknown to Python. Best effort: skip.
			print(f"Skipping {script_path}: {e}")
			continue

		for script_line in script_lines:
			fields = script_line.strip().split("\t")
			if len(fields) != 2:
				continue
			identifier, transcript = fields
			audio_path = os.path.join(AUDIO_PATH, f"{identifier}.WAV")
			if os.path.exists(audio_path):
				transcriptions_list.append({"audio_path": audio_path, "transcript": transcript})

	# Explicit columns so the CSV carries its header even when no pair was found.
	df = pd.DataFrame(transcriptions_list, columns=["audio_path", "transcript"])
	df.to_csv(TRANSCRIPTIONS_PATH, index=False)

def load_dataset():
	"""Load the transcription CSV and return ``(train, test)`` datasets.

	Reads ``TRANSCRIPTIONS_PATH``, decodes the ``audio_path`` column as audio
	resampled to 16000 Hz, splits the rows 80/20 and turns every example into
	``input_features`` (from ``MODEL_PROCESSOR``) and ``labels`` (token ids of
	``transcript``, truncated to 448 ids). The ``audio_path`` column is
	removed from both results.

	Time and frequency masking are applied to the train split only, so the
	evaluation split is measured on unmodified features.

	NOTE: this definition shadows the ``load_dataset`` imported from
	``datasets`` at the top of the file; the name is kept because it is called
	later in the file.
	"""
	df = pd.read_csv(TRANSCRIPTIONS_PATH)
	df = df.head(2)  # Remove only after team debugging and testing
	dataset = Dataset.from_pandas(df)
	dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))

	# Split before preprocessing so augmentation can be limited to the train part.
	splits = dataset.train_test_split(test_size=0.2)

	time_masking = T.TimeMasking(time_mask_param=80)
	freq_masking = T.FrequencyMasking(freq_mask_param=30)

	def preprocess_batch(batch, augment):
		# Despite the name this receives a single example (map is not batched).
		audio = batch["audio_path"]
		features = MODEL_PROCESSOR(audio["array"], sampling_rate=16000, return_tensors="pt").input_features[0]
		if augment:
			features = time_masking(features)  # Apply time masking
			features = freq_masking(features)  # Apply frequency masking
		batch["input_features"] = features
		tokenized = MODEL_PROCESSOR.tokenizer(batch["transcript"], truncation=True, max_length=448)
		batch["labels"] = tokenized.input_ids
		return batch

	train_split = splits["train"].map(
		preprocess_batch,
		fn_kwargs={"augment": True},
		remove_columns=["audio_path"],
	)
	test_split = splits["test"].map(
		preprocess_batch,
		fn_kwargs={"augment": False},
		remove_columns=["audio_path"],
	)
	return train_split, test_split

class LossLoggerCallback(TrainerCallback):
	"""Trainer callback that appends training/validation losses to a text file."""

	def __init__(self, log_file):
		"""Remember *log_file*, the path every log line is appended to."""
		self.log_file = log_file

	def on_log(self, args, state, control, logs=None, **kwargs):
		"""Append one line for *logs* when it holds ``loss`` and/or ``eval_loss``.

		The line starts with ``Step <state.global_step>:`` followed by the
		losses that are present. Nothing is written when *logs* is empty or
		contains neither key.
		"""
		if not logs:
			return

		entries = []
		if "loss" in logs:
			entries.append(f"Training Loss = {logs['loss']}")
		if "eval_loss" in logs:
			entries.append(f"Validation Loss = {logs['eval_loss']}")
		if not entries:
			return

		line = " ".join([f"Step {state.global_step}:"] + entries)
		with open(self.log_file, "a") as f:
			f.write(line + "\n")

def finetune_model(X_train, X_test):
	"""Fine-tune the ``MODEL_BRAND`` checkpoint and save it to ``MODELS_PATH``.

	*X_train* is used as the training dataset and *X_test* as the evaluation
	dataset of a ``Trainer``. The model's token embeddings are resized to the
	current tokenizer length before training. Losses reported by the trainer
	are appended to ``training_loss.log`` inside ``MODELS_PATH``.
	"""
	model = WhisperForConditionalGeneration.from_pretrained(MODEL_BRAND)
	model = model.to(DEVICE)
	# Match the embedding table to the tokenizer, which may have gained tokens.
	model.resize_token_embeddings(len(MODEL_PROCESSOR.tokenizer))
	model.config.use_cache = False

	training_args = TrainingArguments(
		output_dir=MODELS_PATH,
		logging_dir=MODELS_PATH,
		report_to="none",
		per_device_train_batch_size=2,
		max_steps=2,  # Replace as needed
		eval_strategy="epoch",
		save_strategy="epoch",
		logging_strategy="epoch",
	)

	trainer = Trainer(
		model=model,
		args=training_args,
		train_dataset=X_train,
		eval_dataset=X_test,
		data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor=MODEL_PROCESSOR),
	)

	loss_log_path = os.path.join(MODELS_PATH, "training_loss.log")
	trainer.add_callback(LossLoggerCallback(loss_log_path))

	trainer.train()
	trainer.save_model(MODELS_PATH)

# Pipeline executed when the cell runs, in dependency order:
# 1. write the transcription CSV, 2. load it and split it, 3. fine-tune on the result.
write_transcriptions()
# load_dataset here is the function defined above in this file (it reads TRANSCRIPTIONS_PATH).
X_train, X_test = load_dataset()
finetune_model(X_train, X_test)

def transcribe_audio(checkpoint_name="checkpoint-2"):
	"""Transcribe every audio file in ``INPUTS_PATH`` and print the results.

	Args:
		checkpoint_name: Sub-directory of ``MODELS_PATH`` the model is loaded
			from. Defaults to ``"checkpoint-2"``.

	Files are selected by extension (``.wav``, ``.mp3``, ``.flac``,
	case-insensitive), decoded at 16000 Hz, converted to input features with
	``MODEL_PROCESSOR`` and passed to ``model.generate``. One line of the form
	``<path>: <transcription>`` is printed per file.
	"""
	checkpoint_path = os.path.join(MODELS_PATH, checkpoint_name)
	model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path).to(DEVICE)

	# The decoder and the extension filter do not change between files.
	audio_decoder = Audio(sampling_rate=16000)
	audio_extensions = (".wav", ".mp3", ".flac")

	for file_name in os.listdir(INPUTS_PATH):
		if not file_name.lower().endswith(audio_extensions):
			continue
		input_audio_path = os.path.join(INPUTS_PATH, file_name)
		# decode_example reads both the "path" and the "bytes" key of its
		# argument; "bytes" is None because the audio lives on disk.
		input_audio = audio_decoder.decode_example({"path": input_audio_path, "bytes": None})
		input_audio_features = MODEL_PROCESSOR(
			input_audio["array"],
			sampling_rate=16000,
			return_tensors="pt"
		).input_features.to(DEVICE)
		with torch.no_grad():
			predicted_token_ids_tensor = model.generate(input_audio_features)
		transcription = MODEL_PROCESSOR.tokenizer.batch_decode(predicted_token_ids_tensor, skip_special_tokens=True)[0]
		print(f"{input_audio_path}: {transcription}")

Adding 54863 new tokens


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,10.8223,7.266441
2,6.8327,7.136598




In [None]:
def in_vocabulary(word):
	"""Return True when *word* is encoded as exactly one id found in the vocabulary.

	The word is tokenized with ``MODEL_PROCESSOR.tokenizer`` without special
	tokens; any result that is not a single id yields False.
	"""
	tokenizer = MODEL_PROCESSOR.tokenizer
	token_ids = tokenizer(word, add_special_tokens=False).input_ids
	if len(token_ids) != 1:
		return False
	return token_ids[0] in tokenizer.get_vocab().values()

# Quick check: prints whether "lah" is encoded as a single vocabulary token.
print(in_vocabulary("lah"))