In [225]:
import pandas as pd
from datasets import Dataset, load_dataset, DatasetDict
import os
import librosa
import numpy as np
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from datasets import Audio
from collections import OrderedDict
import evaluate
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

In [208]:
dataset = load_dataset("audiofolder", data_dir="./audio_folder/", split="train")
train_test = dataset.train_test_split(test_size=0.22)
common_voice = DatasetDict(train_test)
print(common_voice)

Found cached dataset audiofolder (C:/Users/Bilal/.cache/huggingface/datasets/audiofolder/default-218721f8d98bdfb8/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 11
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4
    })
})


In [209]:
for i in range(len(common_voice["train"])):
    print("Audio: ",common_voice["train"][i]['audio']['path'])
    print("Sentence: ",common_voice["train"][i]['sentence'])
print("\n\n")
for i in range(len(common_voice["test"])):
     print("Audio: ",common_voice["test"][i]['audio']['path'])
     print("Sentence: ",common_voice["test"][i]['sentence'])

Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\the.mp3
Sentence:  the
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\the2.mp3
Sentence:  the
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\nent.mp3
Sentence:  nent
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\an.mp3
Sentence:  an
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\e.mp3
Sentence:  e
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\ti.mp3
Sentence:  ti
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\adult.mp3
Sentence:  adult
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\ant.mp3
Sentence:  ant
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\wa.mp3
Sentence:  wa
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\tica.mp3
Sentence:  tica
Audio:  C:\Users\Bilal\Desktop\whisper_fine_tuning\audio_folder\antarctica.mp3
Sentence:  antarctica



Audio:  C:\Users\Bilal\Desk

In [210]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="english", task="transcribe")
tokenizer.save_pretrained("tokenizer/")
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="english", task="transcribe")

In [211]:
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out s>pecial: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 the
Decoded w/ special:    <|startoftranscript|><|en|><|transcribe|><|notimestamps|>the<|endoftext|>
Decoded w/out s>pecial: the
Are equal:             True


In [212]:
print(common_voice["train"][0]['audio'])

{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\the.mp3', 'array': array([ 0.00000000e+00,  8.60226387e-13, -9.18156610e-13, ...,
       -2.04577600e-06,  1.10641349e-06,  5.81748964e-07]), 'sampling_rate': 24000}


In [213]:
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16_000))

In [214]:
print(common_voice["train"][0])

{'audio': {'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\the.mp3', 'array': array([-1.56319402e-13,  2.84217094e-13,  1.13686838e-13, ...,
       -7.70985935e-07, -1.47377091e-06,  1.06200332e-06]), 'sampling_rate': 16000}, 'sentence': 'the'}


In [215]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]
    print(audio)
    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [216]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"])

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\the.mp3', 'array': array([-1.56319402e-13,  2.84217094e-13,  1.13686838e-13, ...,
       -7.70985935e-07, -1.47377091e-06,  1.06200332e-06]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\the2.mp3', 'array': array([ 0.00000000e+00, -1.09139364e-11, -9.54969437e-12, ...,
        2.51437086e-05, -6.18215199e-06, -7.33240631e-06]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\nent.mp3', 'array': array([-9.09494702e-12,  1.00044417e-11,  2.91038305e-11, ...,
       -1.11434929e-05, -2.98958621e-05, -3.25626497e-05]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\an.mp3', 'array': array([-3.63797881e-12,  4.54747351e-13,  2.95585778e-12, ...,
        2.35709595e-05,  3.83985825e-05,  1.86654361e-05]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fi

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\con1.mp3', 'array': array([ 1.88720151e-11, -1.90993887e-11,  7.27595761e-12, ...,
       -8.10189249e-10, -3.25185567e-09, -1.59863589e-09]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\re.mp3', 'array': array([ 2.72848411e-12, -1.90993887e-11, -3.27418093e-11, ...,
        3.24988036e-10,  3.03941761e-09,  7.12759629e-10]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\a.mp3', 'array': array([ 3.63797881e-12,  1.04591891e-11, -5.45696821e-12, ...,
        9.17507350e-08,  2.17551133e-08,  2.34365416e-08]), 'sampling_rate': 16000}
{'path': 'C:\\Users\\Bilal\\Desktop\\whisper_fine_tuning\\audio_folder\\tar.mp3', 'array': array([-2.27373675e-12, -4.54747351e-13,  6.82121026e-12, ...,
       -1.60617856e-05, -1.02037557e-05, -1.72864111e-05]), 'sampling_rate': 16000}


In [217]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [218]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [219]:
metric = evaluate.load("wer")

In [220]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [221]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [222]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [223]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-base-pron",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=100,
    fp16=False,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)


In [226]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [227]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=100, training_loss=2.3305477619171144, metrics={'train_runtime': 7280.8357, 'train_samples_per_second': 0.22, 'train_steps_per_second': 0.014, 'total_flos': 7.1345995776e+16, 'train_loss': 2.3305477619171144, 'epoch': 100.0})

In [228]:
trainer.save_model()

In [243]:
from transformers import pipeline
import gradio as gr

tokenizer = WhisperTokenizer.from_pretrained('./tokenizer/', language="english", task="transcribe")
# model = WhisperForConditionalGeneration.from_pretrained("./whisper-base-pron")

pipe = pipeline("automatic-speech-recognition", model="./whisper-base-pron", tokenizer=tokenizer)

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

print(pipe)

<transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline object at 0x000001C458C69850>


In [244]:
iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text",
    title="Whisper Small Pronunciation",
    description="Realtime demo for English speech recognition using a fine-tuned Whisper base model.",
)

iface.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




