# Whisper Classic

In [7]:
from time import time 

In [10]:
from fast_whisper.utils import decode_audio

In [13]:
import whisper

# Decode every benchmark clip up front so that audio decoding is not part of
# the timed region below.
audio_3 = decode_audio("audio/3sec.wav")
audio_6 = decode_audio("audio/6sec.wav")
audio_10 = decode_audio("audio/10sec.wav")
audio_2min11 = decode_audio("audio/2min11.wav")

# Load the medium.en model from the `whisper` package.
model = whisper.load_model("medium.en")

# Sanity check: show the decoded samples of the 3-second clip.
print(audio_3)


def timed_transcribe(audio, label):
    """Transcribe ``audio`` with ``model`` and print the text and elapsed time.

    The timer is stopped as soon as ``model.transcribe`` returns, so the
    reported duration covers the transcription call only and not the time
    spent printing the transcript.

    Args:
        audio: Decoded audio, as returned by ``decode_audio``.
        label: Clip name inserted into the "Time taken for ..." message.

    Returns:
        The value returned by ``model.transcribe``; its ``'text'`` entry is
        what gets printed.
    """
    start = time()
    transcript = model.transcribe(audio, language="English")
    elapsed = time() - start
    print(transcript['text'])
    print(f"Time taken for {label} : {elapsed}")
    return transcript


# NOTE(review): the first call may also pay one-time warm-up costs, which would
# inflate the 3-second figure relative to the others - TODO confirm.
transcript_3 = timed_transcribe(audio_3, "3 sec")
transcript_6 = timed_transcribe(audio_6, "6 sec")
transcript_10 = timed_transcribe(audio_10, "10 sec")
transcript_2min11 = timed_transcribe(audio_2min11, "2min11 sec")

[-0.07757568 -0.07736206 -0.07260132 ... -0.00976562 -0.0178833
 -0.009552  ]




 I appreciate that. Yeah!
Time taken for 3 sec : 2.9791951179504395
 Yeah, you want to see my supervisor? Huh? Yeah, you want to see my supervisor? Fine! I'll be right back!
Time taken for 6 sec : 4.602953910827637
 Yes, but my wallet was stolen. I don't have anything. I don't have any credit cards. I don't have my ID. Don't you have things on file here?
Time taken for 10 sec : 5.680836200714111
 So what's up? What's new? Well, Vegas was awesome. Yeah, I heard. And I got married. Shut up. In Vegas? Yeah, in the old town part. Who'd you marry? Jack! Did he propose to you? Yes, it was very romantic. It was at the slot machines. Oh, real fortune slots? Uh huh, he went big and he realized that the only thing that would make it better was me as his bride. He turned to you and was like, hey, let's get married. And I said, okay. It's really romantic. Yeah, well, you know, because he's leaving the next day. Yeah. But we're going to have a honeymoon cruise. Does that mean you're going to get ci

# With the custom library - Faster-whisper


In [3]:
from fast_whisper.fast_whisper import WhisperModel


# Both faster-whisper models use the same checkpoint and the same
# device / precision settings; only the VAD flag differs.
model_size = "medium.en"
gpu_int8_options = {"device": "cuda", "compute_type": "int8"}

# Plain model (VAD option not passed).
model = WhisperModel(model_size, **gpu_int8_options)

# Same configuration with voice-activity detection switched on.
model_vad = WhisperModel(model_size, vad_activation=True, **gpu_int8_options)

Downloading the model ...


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)350ce/tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]
Downloading (…)350ce/vocabulary.txt: 100%|██████████| 422k/422k [00:00<00:00, 5.29MB/s]

Downloading (…)1d2350ce/config.json: 100%|██████████| 2.64k/2.64k [00:00<00:00, 14.3MB/s]
Downloading (…)350ce/tokenizer.json: 100%|██████████| 2.13M/2.13M [00:01<00:00, 1.16MB/s]
Downloading model.bin: 100%|██████████| 1.53G/1.53G [00:52<00:00, 29.1MB/s]


model path :  /home/hugo/Desktop/Project/inference_whisper_benchmark/fast_whisper/models/medium.en
tokenizer path :  /home/hugo/Desktop/Project/inference_whisper_benchmark/fast_whisper/models/medium.en/tokenizer.json
Loading the model ...
tokenizer path :  /home/hugo/Desktop/Project/inference_whisper_benchmark/fast_whisper/models/medium.en/tokenizer.json


### without VAD

In [13]:
# Snapshot the installed packages by redirecting `pip freeze` into requirements.txt.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may not
# be the environment this kernel runs in - confirm before relying on the file.
! pip freeze > requirements.txt

In [4]:
# Run the non-VAD model over the four benchmark clips, all with beam_size=5.
# The results are iterated (and timed) in a later cell.
clip_paths = [
    "audio/3sec.wav",
    "audio/6sec.wav",
    "audio/10sec.wav",
    "audio/2min11.wav",
]
segments_3, segments_6, segments_10, segments_2min11 = [
    model.transcribe(clip_path, beam_size=5) for clip_path in clip_paths
]

### with VAD 

In [9]:
def _transcribe_with_vad(clip_path):
    """Transcribe one clip with the VAD-enabled model using beam_size=5."""
    return model_vad.transcribe(clip_path, beam_size=5)


# One result per benchmark clip; they are iterated (and timed) in a later cell.
segments_3_vad = _transcribe_with_vad("audio/3sec.wav")
segments_6_vad = _transcribe_with_vad("audio/6sec.wav")
segments_10_vad = _transcribe_with_vad("audio/10sec.wav")
segments_2min11_vad = _transcribe_with_vad("audio/2min11.wav")

In [10]:
# Pair each non-VAD result with the label printed next to its duration.
# NOTE(review): if these results are single-use iterators, re-running this cell
# without re-running the transcribe cell would print nothing - TODO confirm.
timed_runs = [
    ("durée 3 sec: ", segments_3),
    ("durée 6 sec: ", segments_6),
    ("durée 10 sec: ", segments_10),
    ("durée 2min11: ", segments_2min11),
]

for duration_label, clip_segments in timed_runs:
    # The timed region spans the whole iteration, including the prints.
    start = time()
    for segment in clip_segments:
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
    print(duration_label, time() - start, "s")

: 

In [20]:
def _print_timed_segments(duration_label, clip_segments):
    """Print every segment as "[start -> end] text", then the elapsed time.

    The timer wraps the whole iteration over ``clip_segments``, so the
    reported duration includes the time spent printing each segment.
    """
    began = time()
    for seg in clip_segments:
        print("[%.2fs -> %.2fs] %s" % (seg.start, seg.end, seg.text))
    print(duration_label, time() - began, "s")


# Same report for each clip transcribed with the VAD-enabled model.
_print_timed_segments("durée 3 sec: ", segments_3_vad)
_print_timed_segments("durée 6 sec: ", segments_6_vad)
_print_timed_segments("durée 10 sec: ", segments_10_vad)
_print_timed_segments("durée 2min11: ", segments_2min11_vad)


0.07921522110700607 > 0.6
not skipping
[0.00s -> 2.10s]  No, I appreciate that yeah
durée 3 sec:  4.019869089126587 s
0.08820515125989914 > 0.6
not skipping
[0.00s -> 3.60s]  something yeah you want to see my supervisor huh yeah you want to see my
[3.60s -> 6.96s]  supervisor fine I'll be right back
durée 6 sec:  5.423288106918335 s
0.010537789203226566 > 0.6
not skipping
[0.00s -> 5.32s]  Yes, but I my wallet was stolen. I don't have anything. I don't have any credit cards
[5.32s -> 9.04s]  I don't have I don't have my ID. Don't you have things that on file here?
durée 10 sec:  6.141629934310913 s
0.4093220829963684 > 0.6
not skipping
[3.50s -> 5.50s]  So what's up? What's new?
[5.50s -> 7.50s]  Well, Vegas was awesome.
[7.50s -> 9.50s]  Yeah, I heard.
[9.50s -> 11.50s]  And I got married.
[11.50s -> 13.50s]  Shut up. In Vegas?
[13.50s -> 15.50s]  Yeah, in the old town part.
[15.50s -> 17.50s]  Who'd you marry?
[17.50s -> 19.50s]  Jack!
[19.50s -> 21.50s]  Did he propose to you?
[21.5

In [3]:
# Print each segment with its time range and report how long the loop took.
# NOTE(review): no earlier cell visible in this notebook assigns `segments`;
# this cell appears to rely on leftover kernel state - confirm or remove.
start = time()
for segment in segments:
    formatted = "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
    print(formatted)
elapsed = time() - start
print("durée : ", elapsed, "s")

3000
(80, 3000)
30.0
0.6
0.4422985017299652 > 0.6
not skipping
[0.00s -> 5.00s]  Excuse me?
[5.00s -> 10.00s]  Do you have your forms?
[10.00s -> 11.00s]  Yeah.
[11.00s -> 12.00s]  Let me see them.
[12.00s -> 18.00s]  Is there a problem?
[18.00s -> 19.00s]  Who told you to get in this line?
[19.00s -> 20.00s]  You did.
[20.00s -> 21.00s]  No.
[21.00s -> 23.00s]  You were standing at the beginning.
[23.00s -> 24.00s]  You directed me.
[24.00s -> 25.00s]  Okay, but I didn't tell you to get in this line
[25.00s -> 27.00s]  if you're filling out this particular form.
3000
(80, 3000)
30.0
0.6
0.5684831142425537 > 0.6
not skipping
[27.00s -> 29.00s]  Well, what's the problem?
[29.00s -> 30.00s]  This form is a ZX4.
[30.00s -> 31.00s]  Let me change it.
[31.00s -> 33.00s]  You can't...
[33.00s -> 35.00s]  This is not the line for the ZX4.
[35.00s -> 37.00s]  If you're going to fill out the ZX4,
[37.00s -> 39.00s]  you need to have a different form of ID.
[39.00s -> 40.00s]  I'm getting an ID.

In [4]:
# Print each VAD segment with its time range and report how long the loop took.
# NOTE(review): no cell visible in this notebook assigns `segments_vad`;
# this cell appears to rely on leftover kernel state - confirm or remove.
start = time()
for segment in segments_vad:
    formatted = "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
    print(formatted)
elapsed = time() - start
print("durée : ", elapsed, "s")

0.2508130669593811 > 0.6
[6.58s -> 8.58s]  Excuse me.
[8.74s -> 20.07s]  Do you have your forms? Yeah. Let me see them. Is there a problem? Who told you to get in this line? You did.
[22.11s -> 27.21s]  You were standing at the beginning, you directed me. Okay, but I didn't tell you to get in this line if you're filling out this particular form.
[27.69s -> 35.33s]  Well, what's the problem? What's the problem? Let me change it. This is not the line for the ZX4.
[35.33s -> 38.73s]  If you're gonna fill out the ZX4, you need to have a different form of ID.
0.009213976562023163 > 0.6
[39.05s -> 46.61s]  I'm getting an ID. This is why I'm here. No, I need another set of ID to prove that this is actually you.
[46.61s -> 48.71s]  How am I supposed to get an ID without an ID?
[49.33s -> 51.37s]  How does the person get an ID in the first place?
[51.37s -> 55.23s]  I don't know, but I need an ID to pass this form along.
[55.23s -> 57.85s]  I can't just send it along without an ID. I'm here to 

Time : 
Long file -> 
Small file -> 

# With Whisper.cpp Wrapper

In [19]:
! pip install pywhispercpp

Collecting pywhispercpp
  Obtaining dependency information for pywhispercpp from https://files.pythonhosted.org/packages/43/4b/5b776a79d557392d1fb5e15ffa9af32cc214d3b396482c4b8d686727228b/pywhispercpp-1.1.3-cp39-cp39-macosx_10_9_universal2.whl.metadata
  Downloading pywhispercpp-1.1.3-cp39-cp39-macosx_10_9_universal2.whl.metadata (14 kB)
Collecting pydub (from pywhispercpp)
  Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading pywhispercpp-1.1.3-cp39-cp39-macosx_10_9_universal2.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pydub, pywhispercpp
Successfully installed pydub-0.25.1 pywhispercpp-1.1.3


In [24]:
from pywhispercpp.model import Model

# NOTE(review): this rebinds `model`, which earlier cells use for other
# Whisper back-ends - re-running those cells after this one would pick up
# this object instead.
model = Model('medium.en', n_threads=6)

# (clip path, label printed next to the measured duration)
benchmark_clips = (
    ('audio/3sec.wav', "durée 3 sec: "),
    ('audio/6sec.wav', "durée 6 sec: "),
    ('audio/10sec.wav', "durée 10 sec: "),
    ('audio/2min11.wav', "durée 2min11: "),
)

for clip_path, duration_label in benchmark_clips:
    # The timed region covers the transcribe call and printing its segments.
    start = time()
    segments = model.transcribe(clip_path, speed_up=True)
    for segment in segments:
        print(segment.text)
    print(duration_label, time() - start, "s")

[2023-10-03 15:57:46,136] {utils.py:38} INFO - No download directory was provided, models will be downloaded to /Users/hugo/Library/Application Support/pywhispercpp/models
[2023-10-03 15:57:46,137] {utils.py:46} INFO - Model medium.en already exists in /Users/hugo/Library/Application Support/pywhispercpp/models
[2023-10-03 15:57:46,138] {model.py:221} INFO - Initializing the model ...
[2023-10-03 15:57:47,001] {model.py:130} INFO - Transcribing ...


whisper_init_from_file_no_state: loading model from '/Users/hugo/Library/Application Support/pywhispercpp/models/ggml-medium.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51864
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 1024
whisper_model_load: n_audio_head  = 16
whisper_model_load: n_audio_layer = 24
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 1024
whisper_model_load: n_text_head   = 16
whisper_model_load: n_text_layer  = 24
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: type          = 4
whisper_model_load: mem required  = 1899.00 MB (+   43.00 MB per decoder)
whisper_model_load: adding 1607 extra tokens
whisper_model_load: model ctx     = 1462.35 MB
whisper_model_load: model size    = 1462.12 MB
whisper_init_state: kv self size  =   42.00 MB
whisper_init_state: kv cross size =  140.62 MB
whisper_full_with_state: progress =   5%
whisper_f

[2023-10-03 15:57:54,455] {model.py:133} INFO - Inference time: 7.454 s
No, I'd appreciate that, yeah.
durée 3 sec:  7.4557740688323975 s
[2023-10-03 15:57:54,457] {model.py:130} INFO - Transcribing ...
[2023-10-03 15:58:01,440] {model.py:133} INFO - Inference time: 6.983 s
Yeah, you wanna see my supervisor?
Huh? Yeah, you wanna see my supervisor? Fine, I'll be right back!
durée 6 sec:  6.985399007797241 s
[2023-10-03 15:58:01,443] {model.py:130} INFO - Transcribing ...
[2023-10-03 15:58:08,735] {model.py:133} INFO - Inference time: 7.292 s


whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_with_state: progress =  30%
whisper_full_with_state: progress =  35%
whisper_full_with_state: progress =  40%
whisper_full_with_state: progress =  45%
whisper_full_with_state: progress =  50%
whisper_full_with_state: progress =  55%
whisper_full_with_state: progress =  60%
whisper_full_with_state: progress =  65%
whisper_full_with_state: progress =  70%
whisper_full_with_state: progress =  75%
whisper_full_with_state: progress =  80%
whisper_full_with_state: progress =  85%
whisper_full_with_state: progress =  90%
whisper_full_with_state: progress =  95%
whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_wit

Yes, but my wallet was stolen. I don't have anything. I don't have any credit cards. I don't have my ID.
Don't you have things on file here?
durée 10 sec:  7.294188737869263 s
[2023-10-03 15:58:08,747] {model.py:130} INFO - Transcribing ...
[2023-10-03 15:58:37,385] {model.py:133} INFO - Inference time: 28.638 s
So what's up? What's new?
Well, Vegas was awesome.
Yeah, I heard.
And I got married.
Shut up. In Vegas?
Yeah, in the old town part.
Who'd you marry?
Jack!
Did he propose to you?
Yes, it was very romantic.
It was at the slot machines.
Oh, real fortune slot?
He went big and he realized that the only thing that would make it better was me as his bride.
He turned to you and was like...
Hey, let's get married.
That's really romantic.
Yeah, well, you know, cause he's leaving the next day.
But we're gonna have a honeymoon cruise.
Does that mean we're gonna get citizenship too in England or whatever?
Oh, I hadn't even thought about that.
Yeah, think about that.
I'm not gonna be a citiz

whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_with_state: progress =  30%
whisper_full_with_state: progress =  35%
whisper_full_with_state: progress =  40%
whisper_full_with_state: progress =  45%
whisper_full_with_state: progress =  50%
whisper_full_with_state: progress =  55%
whisper_full_with_state: progress =  60%
whisper_full_with_state: progress =  65%
whisper_full_with_state: progress =  70%
whisper_full_with_state: progress =  75%
whisper_full_with_state: progress =  80%
whisper_full_with_state: progress =  85%
whisper_full_with_state: progress =  90%
whisper_full_with_state: progress =  95%


## With Whisper Jax

Source: [whisper-jax on GitHub](https://github.com/sanchit-gandhi/whisper-jax)

In [None]:
from whisper_jax import FlaxWhisperPipline

# Build the Flax pipeline for the large-v2 checkpoint.
checkpoint = "openai/whisper-large-v2"
pipeline = FlaxWhisperPipline(checkpoint)

# NOTE(review): the other cells read clips from audio/*.wav - confirm that
# "audio.mp3" exists before running this cell.
audio_file = "audio.mp3"

# First call: JIT-compiles the forward pass, so it is slow, but this happens only once.
text = pipeline(audio_file)

# Second call: reuses the cached compiled function and should be much faster.
text = pipeline(audio_file)

In [None]:
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp

# Build the large-v2 pipeline again, this time passing bfloat16 as the dtype.
pipeline = FlaxWhisperPipline(
    "openai/whisper-large-v2",
    dtype=jnp.bfloat16,
)

Time : 
Long file -> 
Small file -> 