In [1]:
from datasets import load_dataset,DatasetDict,Dataset,Audio
from transformers import pipeline
import torch

In [4]:
# Pick the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)
device

'cuda:0'

In [5]:
# Build the dataset with the 'audiofolder' loader, reading from the ../dataset directory.
dataset = load_dataset(
    'audiofolder',
    data_dir=r'../dataset',
)

Resolving data files:   0%|          | 0/112 [00:00<?, ?it/s]

In [6]:
# Work with the 'train' split of the loaded dataset.
train_data = dataset['train']

In [7]:
# Re-declare the "audio" column as an Audio feature with a 16000 Hz sampling rate.
# NOTE(review): presumably the rate the ASR model expects — confirm.
train_data = train_data.cast_column("audio", Audio(sampling_rate=16000))

In [8]:
# Speech-recognition pipeline around the local fine-tuned checkpoint.
# `return_language` is a boolean switch (it was previously given the truthy
# string "english"); the transcription language itself is selected through
# `generate_kwargs` when the pipeline is called.
asr = pipeline(
    "automatic-speech-recognition",
    # Alternative checkpoint kept for comparison runs:
    # model="openai/whisper-medium",
    model="../models/whisper-base-finetunian/checkpoint-150",
    return_language=True,
    device=device,
)

Device set to use cuda:0


In [9]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

In [10]:
# Stream the training audio through the ASR pipeline two samples at a time
# and keep only the transcribed text of each result, in dataset order.
all_predictions = [
    output["text"]
    for output in tqdm(
        asr(
            KeyDataset(train_data, "audio"),
            max_new_tokens=128,
            generate_kwargs={"task": "transcribe", "language": "english"},
            batch_size=2,
        ),
        total=len(train_data),
    )
]

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
100%|██████████| 108/108 [00:33<00:00,  3.24it/s]


In [11]:
# Preview only the first few transcriptions instead of dumping the whole list.
all_predictions[:10]

['Please call Stella.Ask her to bring these things with her from the store. Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.We also need a small plastic snake and a big toy frog for the kids.She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.',
 'The revised procedure was acclaimed as a long-overdue reform.',
 'The revised procedure was acclaimed as a long-overdue reform.',
 'The courtyard is magnificently decorated.',
 'The courtyard is magnificently decorated.',
 'The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain.',
 'Their work mirrors the mentality of the psychopath, rootless and irresponsible.',
 'The Norsemen considered the rainbow as a bridge over which the gods passed from earth to their home in the sky.',
 'The Norsemen considered the rainbow as a bridge over which the gods passed from earth to their home in the sky.',
 'Others have 

In [12]:
from evaluate import load
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


In [13]:
def get_metrics(train_data, all_predictions, label_key='transcription'):
    """Compute word error rate (WER) and word accuracy for a set of transcriptions.

    Two variants are reported: the orthographic WER on the raw strings, and
    the normalised WER after references and predictions have both been passed
    through ``BasicTextNormalizer``. Samples whose normalised reference is
    empty are excluded from the normalised score only.

    Args:
        train_data: Container indexable by ``label_key`` that yields the
            reference transcriptions, in the same order as ``all_predictions``.
        all_predictions: Predicted transcriptions, one per reference.
        label_key: Name of the column/key holding the reference text.

    Returns:
        Tuple ``(wer, 100 - wer, wer_ortho, 100 - wer_ortho)``; every value is
        a percentage.

    Raises:
        ValueError: If the number of predictions differs from the number of
            references.
    """
    wer_metric = load("wer")

    # Read the label column once and materialise both sides so they can be
    # length-checked and paired up below.
    references = list(train_data[label_key])
    predictions = list(all_predictions)
    if len(predictions) != len(references):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(references)} references; "
            "they must be aligned one-to-one."
        )

    # Orthographic WER: raw strings, punctuation and casing included.
    wer_ortho = 100 * wer_metric.compute(
        references=references, predictions=predictions
    )

    normalizer = BasicTextNormalizer()

    # Normalise each (reference, prediction) pair together so the two lists
    # cannot drift out of alignment, then drop pairs whose reference is empty
    # after normalisation.
    normalized_pairs = [
        (normalizer(reference), normalizer(prediction))
        for reference, prediction in zip(references, predictions)
    ]
    kept_pairs = [pair for pair in normalized_pairs if len(pair[0]) > 0]

    wer = 100 * wer_metric.compute(
        references=[reference for reference, _ in kept_pairs],
        predictions=[prediction for _, prediction in kept_pairs],
    )

    return wer, (100 - wer), wer_ortho, (100 - wer_ortho)

In [12]:
## ORIGINAL
# Run label kept from the author; which checkpoint it names is not recorded in this cell.
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(train_data, all_predictions)

(2.932551319648094, 97.0674486803519, 3.869047619047619, 96.13095238095238)

In [14]:
## Finetune2
# Training-set scores for the run labelled "Finetune2".
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(train_data, all_predictions)

(2.4926686217008798, 97.50733137829911, 3.7202380952380953, 96.2797619047619)

In [12]:
## Whisper-medium
# Training-set scores for the run labelled "Whisper-medium".
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(train_data, all_predictions)

(16.78082191780822, 83.21917808219177, 24.783362218370883, 75.21663778162912)

In [None]:
##FINETUNIAN
# Training-set scores for the run labelled "FINETUNIAN".
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(train_data, all_predictions)

(4.4520547945205475, 95.54794520547945, 6.5857885615251295, 93.41421143847487)

In [1]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face Hub login widget inside the notebook.
# NOTE(review): presumably needed to access the Common Voice dataset loaded further down — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Stream the English "validation" split of Common Voice 13.0 and store it
# under the "test" key, since it serves as the evaluation set here.
common_voice = DatasetDict()
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "en",
    split="validation",
    trust_remote_code=True,
    streaming=True,
)

In [16]:
# Number of streamed samples kept for evaluation. The previous hand-rolled
# counter started at 1 and stopped at 350, i.e. it kept 349 samples; the same
# count is kept here so results stay comparable.
N_TEST_SAMPLES = 349

# Materialise only the first N_TEST_SAMPLES items of the stream into a list.
subset_common_voice = {
    'test': list(common_voice['test'].take(N_TEST_SAMPLES)),
}

Reading metadata...: 16372it [00:00, 23376.37it/s]


In [17]:
# Evaluation samples: the list of Common Voice items collected from the stream.
test_data = subset_common_voice['test']

In [18]:
# Stream the evaluation audio through the ASR pipeline two samples at a time
# and keep only the transcribed text of each result, in sample order.
all_predictions = [
    output["text"]
    for output in tqdm(
        asr(
            KeyDataset(test_data, "audio"),
            max_new_tokens=128,
            generate_kwargs={"task": "transcribe", "language": "english"},
            batch_size=2,
        ),
        total=len(test_data),
    )
]

100%|██████████| 349/349 [01:43<00:00,  3.36it/s]


In [20]:
# Wrap the list of samples in a Dataset so a column can be read by name
# (get_metrics looks up test_data['sentence']).
test_data = Dataset.from_list(test_data)

In [None]:
##ORIGINAL
# Common Voice subset scores for the run labelled "ORIGINAL"; references come from the 'sentence' column.
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(test_data, all_predictions, label_key='sentence')

(11.061285500747383, 88.93871449925261, 16.296670030272452, 83.70332996972755)

In [23]:
##FINETUNIAN
# Common Voice subset scores for the run labelled "FINETUNIAN"; references come from the 'sentence' column.
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(test_data, all_predictions, label_key='sentence')

(16.243148978574986, 83.756851021425, 22.04843592330979, 77.9515640766902)

In [21]:
##FINETUNIAN2
# Common Voice subset scores for the run labelled "FINETUNIAN2"; references come from the 'sentence' column.
# Result order: (normalised WER, 100 - normalised WER, orthographic WER, 100 - orthographic WER), in percent.
get_metrics(test_data, all_predictions, label_key='sentence')

(10.36075036075036, 89.63924963924964, 15.481786133960046, 84.51821386603996)