In [1]:
from faster_whisper import WhisperModel

model_size = "large-v2"

# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")

# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

segments, info = model.transcribe("iemocap.wav", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

  from .autonotebook import tqdm as notebook_tqdm
Downloading model.bin: 100%|██████████| 3.09G/3.09G [10:14<00:00, 5.03MB/s]


ValueError: This CTranslate2 package was not compiled with CUDA support

In [None]:
import numpy as np
import onnxruntime
from onnxruntime_extensions import get_library_path

audio_file = "audio.mp3"
model = "whisper-tiny-en-all-int8.onnx"
with open(audio_file, "rb") as f:
    audio = np.asarray(list(f.read()), dtype=np.uint8)

inputs = {
    "audio_stream": np.array([audio]),
    "max_length": np.array([30], dtype=np.int32),
    "min_length": np.array([1], dtype=np.int32),
    "num_beams": np.array([5], dtype=np.int32),
    "num_return_sequences": np.array([1], dtype=np.int32),
    "length_penalty": np.array([1.0], dtype=np.float32),
    "repetition_penalty": np.array([1.0], dtype=np.float32),
    "attention_mask": np.zeros((1, 80, 3000), dtype=np.int32),
}

options = onnxruntime.SessionOptions()
options.register_custom_ops_library(get_library_path())
session = onnxruntime.InferenceSession(model, options, providers=["CPUExecutionProvider"])
outputs = session.run(None, inputs)[0]


# Faster Whisper


In [6]:
from faster_whisper import WhisperModel
import time
model_size = "medium.en"

# Run on GPU with FP16
model = WhisperModel(model_size, device="cpu", compute_type="float32")



In [None]:
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

start = time.time()
segments, info = model.transcribe("iemocap.wav", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
print("durée : ", time.time() - start, "s")

In [3]:
from faster_whisper import WhisperModel
import time
model_size = "medium.en"

model = WhisperModel(model_size, device="cpu", compute_type="int8")

start = time.time()
segments, info = model.transcribe("iemocap.wav", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
print("durée : ", time.time() - start, "s")

Detected language 'en' with probability 1.000000


KeyboardInterrupt: 

# With the custom library


In [1]:
from custom_faster_whisper import WhisperModel
from time import time 

model_size = "medium.en"

model = WhisperModel(model_size, device="cpu", compute_type="int8")

model_vad = WhisperModel(model_size, vad_activation=True, device="cpu", compute_type="int8")

  from .autonotebook import tqdm as notebook_tqdm


Loading the model ...
tokenizer path :  /Users/hugo/Desktop/Projects/MSERT/STT/models/medium.en/tokenizer.json
Loading the model ...
tokenizer path :  /Users/hugo/Desktop/Projects/MSERT/STT/models/medium.en/tokenizer.json


In [2]:
segments = model.transcribe("iemocap.wav", beam_size=5)

segments_vad = model_vad.transcribe("iemocap.wav", beam_size=5)

None


In [3]:
start = time()
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
print("durée : ", time() - start, "s")

0.6
0.4422985017299652 > 0.6
not skipping
[0.00s -> 5.00s]  Excuse me?
[5.00s -> 10.00s]  Do you have your forms?
[10.00s -> 11.00s]  Yeah.
[11.00s -> 12.00s]  Let me see them.
[12.00s -> 18.00s]  Is there a problem?
[18.00s -> 19.00s]  Who told you to get in this line?
[19.00s -> 20.00s]  You did.
[20.00s -> 21.00s]  No.
[21.00s -> 23.00s]  You were standing at the beginning.
[23.00s -> 24.00s]  You directed me.
[24.00s -> 25.00s]  Okay, but I didn't tell you to get in this line
[25.00s -> 27.00s]  if you're filling out this particular form.
0.6
0.5684831142425537 > 0.6
not skipping
[27.00s -> 29.00s]  Well, what's the problem?
[29.00s -> 30.00s]  This form is a ZX4.
[30.00s -> 31.00s]  Let me change it.
[31.00s -> 33.00s]  You can't...
[33.00s -> 35.00s]  This is not the line for the ZX4.
[35.00s -> 37.00s]  If you're going to fill out the ZX4,
[37.00s -> 39.00s]  you need to have a different form of ID.
[39.00s -> 40.00s]  I'm getting an ID.
[40.00s -> 41.00s]  This is why I'm here.

In [4]:
start = time()
for segment in segments_vad:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
print("durée : ", time() - start, "s")

0.2508130669593811 > 0.6
[6.58s -> 8.58s]  Excuse me.
[8.74s -> 20.07s]  Do you have your forms? Yeah. Let me see them. Is there a problem? Who told you to get in this line? You did.
[22.11s -> 27.21s]  You were standing at the beginning, you directed me. Okay, but I didn't tell you to get in this line if you're filling out this particular form.
[27.69s -> 35.33s]  Well, what's the problem? What's the problem? Let me change it. This is not the line for the ZX4.
[35.33s -> 38.73s]  If you're gonna fill out the ZX4, you need to have a different form of ID.
0.009213976562023163 > 0.6
[39.05s -> 46.61s]  I'm getting an ID. This is why I'm here. No, I need another set of ID to prove that this is actually you.
[46.61s -> 48.71s]  How am I supposed to get an ID without an ID?
[49.33s -> 51.37s]  How does the person get an ID in the first place?
[51.37s -> 55.23s]  I don't know, but I need an ID to pass this form along.
[55.23s -> 57.85s]  I can't just send it along without an ID. I'm here to 