# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [2]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-hp83nz9u
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-hp83nz9u
  Resolved https://github.com/openai/whisper.git to commit 0a60fcaa9b86748389a656aa013c416030287d47
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting triton==2.0.0 (from openai-whisper==20230918)
  Downloading triton-2.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch (from openai-whisper==20230918)
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

# Loading the LibriSpeech dataset

The following will load the test-clean split of the LibriSpeech corpus using torchaudio.

In [5]:
! pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-2.0.2-cp311-cp311-manylinux1_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: torchaudio
Successfully installed torchaudio-2.0.2


In [56]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
from ..whisper.audio import pad_or_trim, log_mel_spectrogram
from ..whisper.__init__ import load_model
from ..whisper.decoding import DecodingOptions
import torchaudio

from tqdm.notebook import tqdm
from pydub import AudioSegment

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

ImportError: attempted relative import with no known parent package

In [48]:
sounds = [
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro01_F000.wav'),
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro01_F001.wav'),
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro01_M011.wav'),
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro02_F000.wav'),
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro03_F006.wav'),
    AudioSegment.from_wav('/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/Ses01F_impro03_F010.wav'),
]

audio = sounds[0]
for i in range(1,len(sounds)):
    audio = audio.append(sounds[i])

audio.export("/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/concat.wav", format="wav")
waveform, sample_rate = torchaudio.load("/home/evan/Documents/dataset/IEMOCAP/Session1/whisper_concat_test/concat.wav")

audio = pad_or_trim(waveform.flatten()).to(DEVICE)
mel = log_mel_spectrogram(audio)
model = load_model("base.en")
options = DecodingOptions(language="en", without_timestamps=True, prompt="You are given 6 speech, please identify their emotions. Answer: neutral, neutral, angry, sad, happy, and")
result = model.decode(mel, options)
print(result.text)

excuse me? Yeah. I don't understand why this is so complicated for people when they get here. It's just a simple form. I just need an ID. Did you get the mail? Did you send my letter? You weren't begging and he realized that the only thing that would make it better was me and says, Bride. Oh, I hadn't even thought about that.


In [5]:
class IEMOCAP(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.
    """
    def __init__(self, split="test-clean", device=DEVICE):
        self.dataset = torchaudio.datasets.IEMOCAP(
            root='/home/evan/Documents/dataset/IEMOCAP',
            sessions = (1,2,3,4,5),
        )
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        audio, sample_rate, _, label, _ = self.dataset[item]
        assert sample_rate == 16000
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)
        
        return (mel, label)

In [24]:
dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

# Running inference on the dataset using a base Whisper model

The following will take a few minutes to transcribe all utterances in the dataset.

In [25]:
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is English-only and has 71,825,408 parameters.


In [26]:
# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="en", without_timestamps=True)

In [30]:
hypotheses = []
references = []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    print(results)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)
    break

  0%|          | 0/164 [00:00<?, ?it/s]

[DecodingResult(audio_features=tensor([[-1.0576,  0.3557, -1.0889,  ...,  0.4553, -0.7871,  0.0673],
        [-0.2568,  0.0029, -0.1195,  ...,  0.1710, -0.3689,  0.3582],
        [-0.2654,  0.0019,  0.0253,  ...,  0.1702,  0.0403, -0.0301],
        ...,
        [ 0.2607, -0.3406, -0.7217,  ...,  0.8687,  0.0341,  0.5186],
        [ 0.4514, -0.1913, -0.5493,  ...,  0.8955, -0.1149,  0.2500],
        [ 0.5615, -0.6660,  0.3049,  ...,  0.6152, -0.3379,  0.1537]],
       device='cuda:0', dtype=torch.float16), language='en', language_probs=None, tokens=[679, 10719, 612, 561, 307, 20798, 329, 8073, 11, 1210, 2419, 290, 34397, 290, 44379, 18821, 290, 3735, 4517, 1122, 5207, 284, 307, 9717, 992, 503, 287, 6546, 11, 49038, 1068, 15061, 12, 17359, 268, 10746, 13], text='He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flower-faten sauce.', avg_logprob=-0.17884910734076248, no_speech_prob=0.0406485833227634

In [29]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
5,"The music came nearer, and he recalled the wor...",THE MUSIC CAME NEARER AND HE RECALLED THE WORD...
6,The dull light fell more faintly upon the page...,THE DULL LIGHT FELL MORE FAINTLY UPON THE PAGE...
7,"A cold, lucid indifference reigned in his soul.",A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL
8,The chaos in which his order extinguished itse...,THE CHAOS IN WHICH HIS ARDOUR EXTINGUISHED ITS...
9,"At most, by an alms given to a beggar whose bl...",AT MOST BY AN ALMS GIVEN TO A BEGGAR WHOSE BLE...


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [9]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [10]:
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [11]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

WER: 4.26 %
