# Imports

In [1]:
from pathlib import Path

In [2]:
import librosa

In [30]:
import torch
import torch.nn as nn

In [3]:
from torchaudio.datasets import librispeech

In [39]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration

# Data

In [6]:
audio_path = Path("../data/hello.wav")
audio_path.exists() and audio_path.is_file()

True

In [20]:
waveform, sr = librosa.load(audio_path, sr=16000)

# Load the model, begin training on audio file (to overfit)

In [43]:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

In [44]:
model.train()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [45]:
input_features = processor(waveform, return_tensors="pt", sampling_rate=sr).input_features
input_features

tensor([[[-0.1529, -0.1795, -0.2277,  ..., -0.9609, -0.9609, -0.9609],
         [-0.3502,  0.0674,  0.1487,  ..., -0.9609, -0.9609, -0.9609],
         [-0.0949,  0.0898,  0.1487,  ..., -0.9609, -0.9609, -0.9609],
         ...,
         [-0.7625, -0.8396, -0.7656,  ..., -0.9609, -0.9609, -0.9609],
         [-0.9039, -0.8158, -0.7739,  ..., -0.9609, -0.9609, -0.9609],
         [-0.8149, -0.9609, -0.9609,  ..., -0.9609, -0.9609, -0.9609]]])

In [46]:
ground_truth_text = "Hello, my name is Izaak."
labels = processor.tokenizer(ground_truth_text, return_tensors="pt").input_ids
labels

tensor([[50257, 50362, 15496,    11,   616,  1438,   318,   314,  4496,   461,
            13, 50256]])

In [47]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [48]:
num_steps = 15

In [50]:
for step in range(num_steps):
    optimizer.zero_grad()

    outputs = model(input_features=input_features, labels=labels)

    loss = outputs.loss

    loss.backward()
    optimizer.step()

    print(f"Step {step+1}/{num_steps}, Loss: {loss.item():.4f}")

Step 1/8, Loss: 0.4363
Step 2/8, Loss: 0.3064
Step 3/8, Loss: 0.1927
Step 4/8, Loss: 0.1101
Step 5/8, Loss: 0.0752
Step 6/8, Loss: 0.0598
Step 7/8, Loss: 0.0422
Step 8/8, Loss: 0.0270


# Eval model outputs

In [53]:
model.eval();

In [61]:
predicted_ids = model.generate(input_features)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [66]:
output = processor.batch_decode(predicted_ids, skip_special_tokens=True)
output

['Hello, my name is Izaak.']