In [1]:
import os

import tensorflow as tf

from datasets import load_dataset
from datasets import Audio

from transformers import WhisperProcessor, WhisperFeatureExtractor, TFWhisperForConditionalGeneration, WhisperTokenizer




In [2]:
# Hugging Face access token.
# Create an account on huggingface.co, generate an access token, and export it
# as the HF_TOKEN environment variable before starting the kernel. Do not paste
# the token into the notebook: it would be saved and shared with the file.
# An unset or empty HF_TOKEN yields None here.
# NOTE(review): with token=None the Hugging Face libraries are expected to fall
# back to credentials stored by `huggingface-cli login` - confirm for the
# installed versions.
use_auth_token = os.environ.get("HF_TOKEN") or None

In [3]:
# Whisper configuration.
# Checkpoint to load; choose the size suffix from: tiny, base, small, medium, or large.
model_name = "openai/whisper-base"
# Language passed to the tokenizer and to the decoder prompt.
lang = "portuguese"

In [4]:
# Load the feature extractor published with the selected checkpoint,
# authenticating with the configured access token.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    model_name,
    token=use_auth_token,
)

In [5]:
# Load the tokenizer for the selected checkpoint, configured for the
# "transcribe" task in `lang`. The access token is forwarded here exactly as it
# is for the feature extractor, so every from_pretrained call authenticates the
# same way (matters when model_name points to a private or gated checkpoint).
tokenizer = WhisperTokenizer.from_pretrained(
    model_name,
    language=lang,
    task="transcribe",
    token=use_auth_token,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Bundle the feature extractor and the tokenizer into one processor object.
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [7]:
# Load the TensorFlow Whisper model for the selected checkpoint. The access
# token is forwarded here exactly as it is for the feature extractor, so every
# from_pretrained call authenticates the same way (matters when model_name
# points to a private or gated checkpoint).
model = TFWhisperForConditionalGeneration.from_pretrained(
    model_name,
    token=use_auth_token,
)




All PyTorch model weights were used when initializing TFWhisperForConditionalGeneration.

All the weights of TFWhisperForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWhisperForConditionalGeneration for predictions without further training.


In [8]:
# Decoder prompt ids for the "transcribe" task in `lang`; they are handed to
# model.generate() when transcribing.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    task="transcribe",
    language=lang,
)

In [9]:
# Load the Portuguese ("pt") validation split of Common Voice 11.0:
# https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0
# The access token is forwarded so the download is authenticated.
# NOTE(review): this dataset appears to require accepting its terms on the Hub
# with the same account that owns the token - confirm on the dataset page.
ds = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "pt",
    split="validation",
    trust_remote_code=True,
    token=use_auth_token,
)
# The original audio is sampled at 48000 Hz and must be converted to 16000 Hz.
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

In [10]:
# Transcribe dataset rows 10..19 and print one transcription per row.
for i in range(10, 20):
    # Read the audio entry once per row: the original looked up ds[i] twice
    # (for the array and for the sampling rate), repeating the row fetch.
    audio = ds[i]["audio"]
    inputs = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="tf"
    )
    input_features = inputs.input_features

    # Generate token ids, with the decoder prompt given by forced_decoder_ids.
    generated_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    # Decode the first generated sequence, omitting special tokens from the text.
    transcription = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(transcription)

 Viva então, sei para ir buscar o caminhão, que estava no interior da casa.
 Onde é o casa?
 Em septembro, que em tempo trico, que ela se lembrava.
 mais se acorda como lei.
 Este cavalo tem mais de um cavalo de potência.
 Seu amor à vida é o amor de viva.
 Boa vista do Ingram.
 Essa dasche está sempre fazendo.
 Milévia, até a estação de três interessedadores.
 Capitamos intuitivamente os símbolos linguísticos.


In [11]:
# Export the model to the directory that the TFLite conversion step reads from.
saved_model_dir = 'content/tf_whisper_saved'
model.save(saved_model_dir)


INFO:tensorflow:Assets written to: content/tf_whisper_saved\assets


INFO:tensorflow:Assets written to: content/tf_whisper_saved\assets


In [12]:
# Output file for the converted model and the saved-model directory to read.
tflite_model_path = 'whisper-base-pt.tflite'
saved_model_dir = 'content/tf_whisper_saved'

# Build a TFLite converter from the saved-model directory.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)

# Use the converter's default optimization setting.
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Permit both op sets during conversion.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
    tf.lite.OpsSet.SELECT_TF_OPS,  # enable TensorFlow ops.
]

# Run the conversion; the result is written to disk in the next cell.
tflite_model = converter.convert()

In [13]:
# Write the converted model to tflite_model_path, opening the file in binary mode.
with open(tflite_model_path, mode='wb') as tflite_file:
    tflite_file.write(tflite_model)