In [1]:
# NOTE(review): `!` runs the command in a separate shell process, so anything
# this `source` sets presumably does not reach the kernel's environment —
# confirm whether this cell is still needed.
!source ~/.bashrc

In [2]:
import io
from dotenv import load_dotenv
import replicate
import os
from pydub import AudioSegment
import time
import torch
from faster_whisper import WhisperModel

In [3]:
# Run on the GPU whenever torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The int8 compute type is used on either device. The float16 (GPU) /
# float32 (CPU) alternative is deliberately left disabled.
compute_type = "int8"

# Instantiate the distil-large-v3 Whisper model with the settings chosen above.
model = WhisperModel(
    "distil-large-v3",
    device=device,
    compute_type=compute_type,
)

In [4]:
# Display the device string that was passed to the model.
device

'cuda'

In [5]:
def measure_transcription_speed(mp3_file):
    """Transcribe one MP3 with the notebook-level ``model`` and print its speed.

    Prints the clip length, the elapsed transcription time, and the ratio of
    clip seconds to elapsed seconds (higher means faster than real time).

    Args:
        mp3_file: Path to the MP3 file to benchmark.

    Returns:
        None. The measurement is reported through ``print`` only.
    """
    # len() of the decoded clip is divided by 1000 to express it in seconds.
    # Decoding happens before the timer starts so it is not counted.
    audio = AudioSegment.from_mp3(mp3_file)
    seconds = len(audio) / 1000

    # perf_counter is the clock intended for measuring intervals; time.time()
    # follows the wall clock and can jump if the system time is adjusted.
    start_time = time.perf_counter()
    segments, _ = model.transcribe(mp3_file, beam_size=1)
    # faster-whisper yields segments lazily, so the generator must be fully
    # consumed inside the timed region for the measurement to mean anything.
    # The text itself is not used afterwards.
    final_transcription = "".join(segment.text for segment in segments)
    execution_time = time.perf_counter() - start_time

    print(f"Time to transcribe an audio clip of length {seconds} s is: {execution_time} seconds. Ration of {seconds / execution_time}")


In [6]:
# Sample clips to benchmark, listed in the order they will be transcribed.
files = [
    f"audio/files/episode_{episode}_sample.mp3"
    for episode in (0, 1, 2, 3, 5, 8, 4, 6, 7, 9)
]

In [1]:
# Benchmark every listed clip in turn, in list order.
for mp3_file in files:
    measure_transcription_speed(mp3_file)

Time to transcribe an audio clip of length 480.0 s is: 10.311173677444458 seconds. Ration of 46.551441670504786
Time to transcribe an audio clip of length 69.094 s is: 1.3887224197387695 seconds. Ration of 49.75364336164254
Time to transcribe an audio clip of length 480.0 s is: 8.750830173492432 seconds. Ration of 54.851938671372174
Time to transcribe an audio clip of length 300.0 s is: 5.608936786651611 seconds. Ration of 53.48607256796919
Time to transcribe an audio clip of length 180.0 s is: 3.3599979877471924 seconds. Ration of 53.57146065456015
Time to transcribe an audio clip of length 120.0 s is: 2.1051430702209473 seconds. Ration of 57.00325155924214
Time to transcribe an audio clip of length 240.0 s is: 4.228495359420776 seconds. Ration of 56.75777779093399
Time to transcribe an audio clip of length 360.0 s is: 6.47335958480835 seconds. Ration of 55.61254481287373
Time to transcribe an audio clip of length 420.0 s is: 7.163331985473633 seconds. Ration of 58.6319328563452
Time 