# Whisper-tiny on IPU

This notebook demonstrates inference with Whisper-tiny on IPU using FP16.    
The present version of the IPU Whisper implementation runs the encoder and the decoder on IPU.

In [14]:
%load_ext autoreload
%autoreload 2
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig, WhisperTokenizer

from dataclasses import dataclass
from typing import List

@dataclass
class IPUWhisperConf:
    """IPU-related settings for one Whisper model size.

    One instance is stored per model size in the `ipu_whisper` dictionary and
    the selected one is read by the rest of the notebook.
    """
    # Checkpoint identifier handed to `from_pretrained`, e.g. 'openai/whisper-tiny.en'
    model_spec: str
    # Forwarded as-is to `IPUConfig(layers_per_ipu=...)`
    layers_per_ipu: List[int]
    # Fallback pod type, used when the GRAPHCORE_POD_TYPE environment variable is not set
    pod_type: str

# Registry of the supported model sizes, keyed by size name.
ipu_whisper = dict(
    tiny=IPUWhisperConf(
        model_spec="openai/whisper-tiny.en",
        layers_per_ipu=[8],
        pod_type="pod4",
    ),
    # Larger sizes will become available in due course
)

# Pick the configuration that the rest of the notebook uses.
model_size = "tiny"
iwc = ipu_whisper[model_size]


Max output sequence length 
- The default is 448, but with that value I couldn't fit a batch size larger than 1.
- 384 tokens in 30 seconds should still be enough for English, which is ~170 words / minute for a fast speaker (about 85 words per 30-second window), and it lets us fit batch size 2.

In [16]:
# max_length = 448  # default value; did not fit with a batch size above 1
max_length = 384  # reduced maximum output length so that batch size 2 fits

In [17]:
import os
from pathlib import Path
# The GRAPHCORE_POD_TYPE environment variable, when set, overrides the per-model default.
pod_type = os.environ.get("GRAPHCORE_POD_TYPE", iwc.pod_type)
# Join the path components instead of concatenating strings, so the result is
# well-formed whether or not POPLAR_EXECUTABLE_CACHE_DIR ends with a separator
# (plain concatenation turned "/my/cache" into "/my/cachewhisper_inference").
executable_cache_dir = os.path.join(
    os.getenv("POPLAR_EXECUTABLE_CACHE_DIR", "/tmp/whisper_exe_cache/"),
    "whisper_inference",
)

In [18]:
# Optional profiling/debug settings, left disabled. Uncommenting the lines below would set the
# PVTI_OPTIONS and POPLAR_ENGINE_OPTIONS environment variables for this kernel.
# os.environ["PVTI_OPTIONS"]=r'{"enable":"true", "directory":"/localdata/paolot/profiles/minimal"}'
# os.environ["POPLAR_ENGINE_OPTIONS"] = f'{{"autoReport.all":"true", "debug.allowOutOfMemory": "true", "autoReport.directory":"profiles"}}'

Source audio file (download it, then convert it to a 16 kHz mono WAV file):

wget https://upload.wikimedia.org/wikipedia/commons/3/3d/Barack_Obama_inauguration_speech_2009.ogg
ffmpeg -y -i Barack_Obama_inauguration_speech_2009.ogg -ar 16000 -ac 1 BarackObama.wav



In [19]:
# Instantiate processor and model
from optimum.graphcore import IPUConfig
from optimum.graphcore.modeling_utils import to_pipelined

# Load the processor and the model weights for the selected checkpoint.
checkpoint = iwc.model_spec
processor = WhisperProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

# IPU execution settings: executable cache location, layer placement, matmul memory proportion.
ipu_config = IPUConfig(
    executable_cache_dir=executable_cache_dir,
    layers_per_ipu=iwc.layers_per_ipu,
    matmul_proportion=0.1,
)

# Convert to the pipelined model, then parallelize it and cast it to half precision.
pipelined_model = to_pipelined(model, ipu_config).parallelize().half()

In [20]:
import soundfile as sf
# Read the whole recording, then keep the samples as a float32 array.
samples, sampling_rate = sf.read('BarackObama.wav')
wav_array = np.asarray(samples).astype(np.float32)

In [21]:
# Duration of the original (unpadded) recording, in seconds.
full_length_s = len(wav_array) / sampling_rate

# Number of samples in one 30-second window.
num_samples_30_sec = int(30.0 * sampling_rate)

# Zero-pad the tail so that the signal is a whole number of windows long.
_, leftover_samples = divmod(len(wav_array), num_samples_30_sec)
num_samples_to_pad = (num_samples_30_sec - leftover_samples) if leftover_samples else 0
padded_wav_array = np.pad(wav_array, (0, num_samples_to_pad))

# One 30-second window per row.
reshaped_wav_array = padded_wav_array.reshape(-1, num_samples_30_sec)


In [22]:
# Report how many windows were produced and how long the padded signal is.
num_windows, window_samples = reshaped_wav_array.shape
print(f"{num_windows} transcriptions of {window_samples/sampling_rate} seconds, total duration {len(padded_wav_array)/sampling_rate} seconds")


38 transcriptions of 30.0 seconds, total duration 1140.0 seconds


## Batch size

Still experimental. The batching is done manually via ugly for() loops.

In [23]:
batch_size = 2  # number of 30-second windows processed together in each batch

# Separate pre-/post- processing

### Preprocessing


In [24]:
%%time
# Run the feature extractor over the audio windows, `batch_size` windows at a time.
all_input_features = []
for k in range(0, reshaped_wav_array.shape[0], batch_size):
    # Slicing (unlike indexing row k+i) cannot run past the last window when the
    # number of windows is not a multiple of batch_size.
    wav_batch = reshaped_wav_array[k:k + batch_size]
    num_missing = batch_size - wav_batch.shape[0]
    if num_missing > 0:
        # Final, partial batch: append all-zero (silent) windows so every batch has exactly
        # batch_size rows. NOTE(review): assumes the compiled IPU model needs a fixed batch
        # size - confirm. The outputs for these extra rows do not correspond to real audio.
        wav_batch = np.pad(wav_batch, ((0, num_missing), (0, 0)))
    # Pass the sampling rate read from the file rather than a hard-coded 16000, so a
    # mismatch with the rate the feature extractor expects is not silently ignored.
    features = processor(list(wav_batch), return_tensors='pt', sampling_rate=sampling_rate).input_features.half()
    all_input_features.append(features)


CPU times: user 1min 4s, sys: 2min 29s, total: 3min 33s
Wall time: 3.05 s


### Inference

In [31]:
%%time
# One generate() call per pre-computed batch of input features.
all_sample_outputs = [
    pipelined_model.generate(features, max_length=max_length, min_length=3)
    for features in all_input_features
]


CPU times: user 8min 3s, sys: 0 ns, total: 8min 3s
Wall time: 8.3 s


### Post-processing

In [32]:
%%time
# Decode every batch of generated token ids to text, keeping the special tokens.
all_transcriptions = [
    processor.batch_decode(token_ids, skip_special_tokens=False)
    for token_ids in all_sample_outputs
]

CPU times: user 15.7 ms, sys: 0 ns, total: 15.7 ms
Wall time: 14.8 ms


In [33]:
# Display the decoded text of every batch (special tokens are kept in the strings)
all_transcriptions

[['<|startoftranscript|><|notimestamps|> Thank you. Thank you. My fellow citizens. I stand here today<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
  '<|startoftranscript|><|notimestamps|> Humboldt by the task of force. Grateful for the trust you bestowed, mindful of the sacrifices born by our ancestors. I thank President Bush for his service to our nation. As well as the generosity and cooperation he has shown throughout this transition.<|endoftext|>'],
 ['<|startoftranscript|><|notimestamps|> 44 Americans have now taken the presidential oath.

# Batching

In [34]:
%%time
# Pre-process, run inference and decode in a single pass, `batch_size` windows at a time.
all_transcriptions = []
for k in range(0, reshaped_wav_array.shape[0], batch_size):
    # Slicing (unlike indexing row k+i) cannot run past the last window when the
    # number of windows is not a multiple of batch_size.
    wav_batch = reshaped_wav_array[k:k + batch_size]
    num_valid = wav_batch.shape[0]
    if num_valid < batch_size:
        # Final, partial batch: append all-zero (silent) windows so every batch has exactly
        # batch_size rows. NOTE(review): assumes the compiled IPU model needs a fixed batch
        # size - confirm.
        wav_batch = np.pad(wav_batch, ((0, batch_size - num_valid), (0, 0)))
    # Pass the sampling rate read from the file rather than a hard-coded 16000, so a
    # mismatch with the rate the feature extractor expects is not silently ignored.
    input_features = processor(list(wav_batch), return_tensors='pt', sampling_rate=sampling_rate).input_features.half()
    sample_output = pipelined_model.generate(input_features, max_length=max_length, min_length=3)
    transcription = processor.batch_decode(sample_output, skip_special_tokens=False)
    # Keep only the rows that correspond to real audio windows.
    all_transcriptions.append(transcription[:num_valid])

CPU times: user 9min 43s, sys: 2min 39s, total: 12min 22s
Wall time: 12 s


In [35]:
# (number of transcriptions in the first batch, number of batches)
len(all_transcriptions[0]), len(all_transcriptions)

(2, 19)

# Pipelines

Trying HF inference pipelines, in particular because they implement a 30-second overlapping window for long-form transcription (https://huggingface.co/openai/whisper-medium.en#long-form-transcription).

Currently doesn't work.

In [None]:
from optimum.graphcore import pipeline
# NOTE: you'll need the ffmpeg executable installed on the system (apt install ffmpeg).
# `pip install ffmpeg` does not provide that executable, so look for it on PATH instead.
import shutil

ffmpeg_path = shutil.which("ffmpeg")
if ffmpeg_path is None:
    print("ffmpeg executable not found on PATH; install it with: apt install ffmpeg")

In [None]:
processor.sampling_rate = 16000
# Reach the feature extractor through the processor: a bare `feature_extractor` name is not
# defined by any earlier cell shown here, so referencing it raised NameError.
processor.feature_extractor._processor_class

In [None]:
# Use the checkpoint of the selected IPU configuration rather than a second hard-coded model name.
tokenizer = WhisperTokenizer.from_pretrained(iwc.model_spec, language='english')
# NOTE(review): `tokenizer` is not passed to pipeline() below, and `ipu_config` is passed under
# the `config` keyword - confirm both against the optimum.graphcore pipeline API.
pp = pipeline("automatic-speech-recognition",
              model=pipelined_model,
              config=ipu_config)

In [None]:
%%time
# Transcribe the original recording in 30-second chunks with a 6-second left stride.
audio_path = 'Barack_Obama_inauguration_speech_2009.ogg'
pp(audio_path, chunk_length_s=30, stride_length_s=[6, 0], batch_size=32)

In [None]:
# Import the class before referencing it; the bare name is otherwise undefined at this point.
from optimum.graphcore.models.whisper import PipelinedWhisperForConditionalGeneration
PipelinedWhisperForConditionalGeneration

In [None]:
from optimum.graphcore.models import whisper


In [None]:
# Look up the pipelined Whisper class on the imported `whisper` module
whisper.PipelinedWhisperForConditionalGeneration