In [12]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from datasets import Dataset, Audio

## Load the Whisper model and its processor
We load a pretrained Whisper transformer model together with its processor. The processor has two functions:
- Pre-process the audio inputs (converting them to log-Mel spectrograms for the model)
- Post-process the model outputs (converting them from tokens to text)

In [13]:
# Single checkpoint identifier shared by the processor and the model, so the two
# cannot drift apart when the checkpoint is changed.
MODEL_ID = "openai/whisper-base"

# The processor handles audio pre-processing and token post-processing;
# the model is the speech-to-text sequence-to-sequence network itself.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)

## Creation of an audio dataset from local files in the `recordings` folder
Although audio is commonly recorded at higher sampling rates (e.g. 44.1 kHz), the model accepts only a sampling rate of 16 kHz, so the `audio` column is cast to 16 kHz.

In [14]:
# Local recordings that make up the dataset.
recording_paths = ["recordings/Prova.wav", "recordings/prova2.wav"]

# Build a one-column dataset from the file paths, then cast that column to the
# Audio feature so every sample is exposed at a 16 kHz sampling rate.
audio_dataset = (
    Dataset.from_dict({"audio": recording_paths})
    .cast_column("audio", Audio(sampling_rate=16000))
)

# Select the second recording (index 1) as the sample used in the cells below.
my_audio = audio_dataset[1]["audio"]

In [15]:
# Inspect the selected sample: its file path, waveform array and sampling rate.
my_audio

{'path': 'recordings/prova2.wav',
 'array': array([-0.00340922, -0.00526658, -0.00676686, ..., -0.00095332,
        -0.00131988,  0.        ], dtype=float32),
 'sampling_rate': 16000}

## Creation of the input features from the audio sample
The input features are passed to the model, which generates token ids from them.
We tell the model that the audio is in Italian and force the `translate` task, so the generated text is an English translation of the recording.
The processor then decodes the token ids to text.

In [16]:
# `processor` is an object, not a function: calling it as processor(...) invokes
# its __call__ method, which pre-processes the raw waveform for the model.
# return_tensors="pt" asks for PyTorch tensors.
processor_output = processor(
    my_audio["array"],
    sampling_rate=my_audio["sampling_rate"],
    return_tensors="pt",
)
input_features = processor_output.input_features

In [17]:
# Build the decoder prompt ids: `language` names the language spoken in the
# audio, and the "translate" task asks for translated (English) output rather
# than a same-language transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="italian",
    task="translate",
)

In [18]:
# Generate output token ids from the audio features, forcing the decoder to
# start from the prompt held in forced_decoder_ids.
predicted_ids = model.generate(
    input_features,
    forced_decoder_ids=forced_decoder_ids,
)

In [19]:
# Decode the generated ids to text, keeping the special tokens so the
# language/task prompt and end-of-text markers stay visible in the output.
translated_transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=False,
)
translated_transcription

['<|startoftranscript|><|it|><|translate|><|notimestamps|> The laws are the conditions with which the independent and isolated women unite in society, to live in a continuous state of war and to govern a free and useless freedom of the uncertainty of conserving it. And thus, to sacrifice a part for govern the remaining security and tranquility.<|endoftext|>']

In [20]:
# Decode the same ids again, this time dropping the special tokens so that only
# the plain translated text remains.
translated_transcription_no_special_tokens = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)
translated_transcription_no_special_tokens

[' The laws are the conditions with which the independent and isolated women unite in society, to live in a continuous state of war and to govern a free and useless freedom of the uncertainty of conserving it. And thus, to sacrifice a part for govern the remaining security and tranquility.']