In [1]:
from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain,s

In [2]:
from datasets import load_dataset, DatasetDict

In [24]:
import pandas as pd

# Load the dataset
data = pd.read_csv("new_natural_math_form.csv", sep=";")

# Combine the English columns into a single list
# text = [sentence for trio in zip(data["English 1"], data["English 2"], data["English 3"]) for sentence in trio]
text = data["Latex"]
text = [item for item in text for _ in range(3)]

audio_dirs = [f"audios_new/v{i}_f{j}.mp3" for j in range(2, 824) for i in range(1, 4)]

In [29]:
from datasets import Dataset, DatasetDict, Audio

# Assuming english_sentences and english_ids are already defined
data_dict = {"sentence": text, "audio": audio_dirs}
dataset = Dataset.from_dict(data_dict)
dataset = dataset.cast_column("audio", Audio())


In [30]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small"
    # "openai/whisper-small", language="english", task="transcribe"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
from datasets import Audio

sampling_rate = processor.feature_extractor.sampling_rate
# common_voice = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [32]:
def prepare_dataset(example):
    audio = example["audio"]

    example = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=example["sentence"],
    )

    # compute input length of audio sample in seconds
    example["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    return example

In [35]:
import librosa

def downsample_audio(example):
    audio_array, _ = librosa.load(example["audio"]["path"], sr=16000)
    example["audio"]["array"] = audio_array
    example["audio"]["sampling_rate"] = 16000
    return example

dataset = dataset.map(downsample_audio, num_proc=1)

Map:   0%|          | 0/2466 [00:00<?, ? examples/s]

In [36]:
dataset = dataset.map(
    prepare_dataset, num_proc=1
)

Map:   0%|          | 0/2466 [00:00<?, ? examples/s]

In [37]:
dataset = dataset.train_test_split(test_size=0.3)

In [38]:
dataset.save_to_disk("dataset_new")

Saving the dataset (0/4 shards):   0%|          | 0/1726 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/740 [00:00<?, ? examples/s]

In [39]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"][0]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [40]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [41]:
import evaluate

metric = evaluate.load("wer")

In [42]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer()


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # compute orthographic wer
    wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)

    # compute normalised WER
    pred_str_norm = [normalizer(pred) for pred in pred_str]
    label_str_norm = [normalizer(label) for label in label_str]
    # filtering step to only evaluate the samples that correspond to non-zero references:
    pred_str_norm = [
        pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0
    ]
    label_str_norm = [
        label_str_norm[i]
        for i in range(len(label_str_norm))
        if len(label_str_norm[i]) > 0
    ]

    wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}

In [43]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [44]:
from functools import partial

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
model.generate = partial(
    model.generate, language="sinhalese", task="transcribe", use_cache=True
)

In [45]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-latex",  # name on the HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=5e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=500,  # increase to 4000 if you have your own GPU or a Colab paid plan
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [46]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [47]:
trainer.train()

  0%|          | 0/500 [00:00<?, ?it/s]



{'loss': 3.8606, 'grad_norm': 16.635406494140625, 'learning_rate': 2.5e-05, 'epoch': 0.23}
{'loss': 1.1522, 'grad_norm': 13.704885482788086, 'learning_rate': 5e-05, 'epoch': 0.46}
{'loss': 0.7662, 'grad_norm': 7.085634708404541, 'learning_rate': 5e-05, 'epoch': 0.69}
{'loss': 0.2652, 'grad_norm': 5.5286455154418945, 'learning_rate': 5e-05, 'epoch': 0.93}
{'loss': 0.1778, 'grad_norm': 5.558685779571533, 'learning_rate': 5e-05, 'epoch': 1.16}
{'loss': 0.1538, 'grad_norm': 4.917520999908447, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.1682, 'grad_norm': 4.406467437744141, 'learning_rate': 5e-05, 'epoch': 1.62}
{'loss': 0.1803, 'grad_norm': 2.1724302768707275, 'learning_rate': 5e-05, 'epoch': 1.85}
{'loss': 0.1121, 'grad_norm': 3.55582857131958, 'learning_rate': 5e-05, 'epoch': 2.08}
{'loss': 0.0836, 'grad_norm': 3.0719375610351562, 'learning_rate': 5e-05, 'epoch': 2.31}
{'loss': 0.1013, 'grad_norm': 4.1854329109191895, 'learning_rate': 5e-05, 'epoch': 2.55}
{'loss': 0.0823, 'grad_no

  0%|          | 0/47 [00:00<?, ?it/s]

Checkpoint destination directory ./whisper-small-latex/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.06396019458770752, 'eval_wer_ortho': 350.8474576271186, 'eval_wer': 493.36235038084874, 'eval_runtime': 895.4581, 'eval_samples_per_second': 0.826, 'eval_steps_per_second': 0.052, 'epoch': 4.63}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 5503.0096, 'train_samples_per_second': 1.454, 'train_steps_per_second': 0.091, 'train_loss': 0.37931860423088076, 'epoch': 4.63}


TrainOutput(global_step=500, training_loss=0.37931860423088076, metrics={'train_runtime': 5503.0096, 'train_samples_per_second': 1.454, 'train_steps_per_second': 0.091, 'train_loss': 0.37931860423088076, 'epoch': 4.63})

In [48]:
trainer.save_model("./saved_model_new")

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618

events.out.tfevents.1710558540.DN0a249034.SUNet.45656.0:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [None]:
from datasets import load_metric

wer_metric = load_metric("wer")

def evaluate_datapoint(trainer, input_data):
    model = trainer.model
    processor = trainer.tokenizer
    model.eval()
    inputs = processor(input_data, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs)
    decoded_prediction = processor.batch_decode(outputs, skip_special_tokens=True)
    return decoded_prediction

# Example usage
input_data = "Your input data here"
print(evaluate_datapoint(trainer, input_data))


In [25]:
kwargs = {
    # "dataset_tags": "mozilla-foundation/common_voice_13_0",
    "dataset": "Latex",  # a 'pretty' name for the training dataset
    "language": "en",
    "model_name": "Whisper Small Latex - Lichu",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-small",
    "tasks": "text-to-speech",
}

trainer.push_to_hub(**kwargs)

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


CommitInfo(commit_url='https://huggingface.co/lichuacu/whisper-small-latex/commit/c6b3495d039e4c666362ab680b3d788f23225630', commit_message='End of training', commit_description='', oid='c6b3495d039e4c666362ab680b3d788f23225630', pr_url=None, pr_revision=None, pr_num=None)

In [49]:
from transformers import pipeline

pipe = pipeline(model=trainer) 
# pipe = pipeline(model="lichuacu/whisper-small-latex") 

RuntimeError: Inferring the task automatically requires to check the hub with a model_id defined as a `str`. <transformers.trainer_seq2seq.Seq2SeqTrainer object at 0x284bda610> is not a valid model_id.

In [16]:
from datasets import load_from_disk

dataset_test = load_from_disk("dataset/test")

In [44]:
for idx in range(len(dataset_test)):
    text = pipe(inputs=dataset_test[idx]["audio"])
    print(f"True: {dataset_test[idx]['sentence']}")
    print(f"Predicted: {text['text']}")


True:  \epsilon \otimes y = y
Predicted:  {\epsilon {\otimes y)] d y ˈy
True: \frac{d}{d \epsilon} g(x) = {g\_prime}(b)
Predicted: {\partial g of x ~~{\epsilon ~~{\partial g of x ~~{\partial b ~~{\partial g of x ~~{\partial g of x ~~{\partial b ~~{\partial g of x ~~{\partial g of x ~~{\partial b ~~{\partial g of x ~~{\partial g of x ~~{\partial b ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\partial g of x ~~{\p

KeyboardInterrupt: 