In [3]:
import io
from dotenv import load_dotenv
import replicate
import os
from pydub import AudioSegment
import time
import torch
from faster_whisper import WhisperModel

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The "int8" compute type is used regardless of the selected device.
# (Alternative previously tried here: "float16" with CUDA, "float32" without.)
compute_type = "int8"

# Load the distil-large-v3 Whisper model on the selected device.
model = WhisperModel(
    "distil-large-v3",
    device=device,
    compute_type=compute_type,
)

In [5]:
# Sample clips to transcribe. The list order is the processing order used by
# the benchmark loop, so it is kept exactly as written (it is not sorted).
files = [
    "audio/files/episode_0_sample.mp3",
    "audio/files/episode_1_sample.mp3",
    "audio/files/episode_2_sample.mp3",
    "audio/files/episode_3_sample.mp3",
    "audio/files/episode_5_sample.mp3",
    "audio/files/episode_8_sample.mp3",
    "audio/files/episode_4_sample.mp3",
    "audio/files/episode_6_sample.mp3",
    "audio/files/episode_7_sample.mp3",
    "audio/files/episode_9_sample.mp3",
]



In [7]:
# Benchmark: transcribe every clip in `files` and report how its length
# compares with the wall-clock time the transcription took
# (ratio = clip length / transcription time, so > 1 is faster than real time).
for mp3_file in files:
    # Clip length. len() of the decoded AudioSegment is divided by 1000 to
    # obtain seconds (pydub reports lengths in milliseconds).
    audio = AudioSegment.from_mp3(mp3_file)
    audio_length = len(audio)
    seconds = audio_length / 1000

    # perf_counter() is the clock intended for measuring elapsed time;
    # time.time() follows the system clock and can jump if it is adjusted.
    start_time = time.perf_counter()

    # The model reads the file from its path, so no in-memory copy is needed.
    segments, info = model.transcribe(mp3_file, beam_size=1)

    # `segments` is consumed here, inside the timed region, so the time spent
    # iterating over it is part of the measurement (faster-whisper documents
    # `segments` as a generator that transcribes while being iterated).
    final_transcription = "".join(segment.text for segment in segments)

    end_time = time.perf_counter()
    execution_time = end_time - start_time
    print(f"Time to transcribe an audio clip of length {seconds} s is: {execution_time} seconds. Ratio of {seconds / execution_time}")


Time to transcribe an audio clip of length 480.0 s is: 230.79495215415955 seconds. Ration of 2.0797681904211833
Time to transcribe an audio clip of length 69.094 s is: 38.38592600822449 seconds. Ration of 1.7999826286643719
Time to transcribe an audio clip of length 480.0 s is: 259.86561822891235 seconds. Ration of 1.8471085296754188
Time to transcribe an audio clip of length 300.0 s is: 186.14458298683167 seconds. Ration of 1.6116504449727809
Time to transcribe an audio clip of length 180.0 s is: 103.46362376213074 seconds. Ration of 1.739741886615446
Time to transcribe an audio clip of length 120.0 s is: 71.15221548080444 seconds. Ration of 1.6865251375394459
Time to transcribe an audio clip of length 240.0 s is: 117.26437067985535 seconds. Ration of 2.0466574681514
Time to transcribe an audio clip of length 360.0 s is: 182.50044226646423 seconds. Ration of 1.9725979593757543
Time to transcribe an audio clip of length 420.0 s is: 210.15456867218018 seconds. Ration of 1.99852900012446