In [6]:
import torch
from transformers import pipeline
from datasets import load_dataset
from pydub import AudioSegment

import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
  "automatic-speech-recognition",
  model="openai/whisper-small",
  chunk_length_s=30,
  device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
import json

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# sample = ds[0]["audio"]
# print(sample)

audio = AudioSegment.from_file("media/audio/24/CAMERA.wav")

# Ensure the audio is in the correct format (16kHz, mono)
audio = audio.set_frame_rate(16000).set_channels(1)

# Convert to numpy array
audio_array = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0

# prediction = pipe(audio_array.copy(), batch_size=16)["text"]
# print(prediction)

# we can also return timestamps for the predictions
prediction_with_time = pipe(audio_array.copy(), batch_size=16, return_timestamps=True)["chunks"]
print(prediction_with_time)

with open("temp_results.json", "w") as writer:
    json.dump({"data": prediction_with_time}, writer, indent=4)

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


[{'timestamp': (0.0, 25.28), 'text': " Hello, good morning. My name is Nisala Chintan Perera and my matrilation number is 92124836. Passport. It's here."}, {'timestamp': (33.44, 38.44), 'text': " Right, so let's start."}, {'timestamp': (39.76, 43.08), 'text': ' Okay, good morning everyone.'}, {'timestamp': (43.08, 50.44), 'text': ' And thank you first of all all I want to thank you all to give me'}, {'timestamp': (50.44, 51.44), 'text': ' this time.'}, {'timestamp': (51.44, 58.84), 'text': ' I know you are busy person but thank you for giving me this time.'}, {'timestamp': (58.84, 60.72), 'text': ' I am Nisal Perra as you know.'}, {'timestamp': (60.72, 65.0), 'text': ' I am the co-founder and CTO of the company,'}, {'timestamp': (65.04, 66.88), 'text': ' Jalat Marketplace.'}, {'timestamp': (66.88, 71.88), 'text': ' So we are actually here to discuss about the transition'}, {'timestamp': (73.24, 78.24), 'text': ' from our current system to a digitalized system.'}, {'timestamp': (81.8, 9

In [8]:
txt_generator = pipeline(
    "text2text-generation",
    model="hafidikhsan/happy-transformer-t5-base-grammar-correction-ep-v1",
    device=device
)

# sentences = prediction.split(".")

new_sentence = ""
new_sentences = []
for sentence in prediction_with_time:
    if len(new_sentence) + len(sentence["text"]) < 512:
        new_sentence += sentence["text"]
    else:
        new_sentences.append(new_sentence)
        new_sentence = ""    

corrected_txt = []
for new_txt in new_sentences:
    txt = txt_generator(new_txt, max_new_tokens=512)[0]["generated_text"]

    corrected_txt.append(txt)

In [9]:
print(corrected_txt, sep="\n")

["Hello, good morning. My name is Nisala Chintan Perera and my matrilation number is 92124836. Passport. It's here. Right, so let's start. Okay, good morning everyone. And first of all, I want to thank you all for giving me this time. I know you are busy person but thank you for giving me this time. I am Nisal Perra as you know. I am the co-founder and CTO of the company, Jalat Marketplace. So we are actually here to discuss about the transition from our current system to a digitalized system.", 'Okay so current situation so Jalat marketplace is one of the largest market chains in Sri Lanka so we provide services almost half of the population in Sri Lanka.', "So we have a traditional on-premise shopping where you go to the supermarket and you buy your stuff and we have online shopping where we have an online portal and you can order them and we deliver them to your doorstep and take away shopping. That's actually a unique method of our own. So what we do, we analyze past customers, roy