In [12]:
# Install necessary libraries
%pip install git+https://github.com/m-bain/whisperx.git


Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /private/var/folders/8r/2hn86n416n58v77nhrr2_mhw0000gn/T/pip-req-build-dayj0auq
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /private/var/folders/8r/2hn86n416n58v77nhrr2_mhw0000gn/T/pip-req-build-dayj0auq
  Resolved https://github.com/m-bain/whisperx.git to commit 58f00339af7dcc9705ef49d97a1f40764b7cf555
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [13]:
# What version of Python do you have?
import sys
import platform
import torch
import os
import whisperx

# Probe which accelerators PyTorch can actually use in this session.
has_gpu = torch.cuda.is_available()
# Use is_available() rather than is_built(): a PyTorch build can have MPS
# compiled in and still run on a machine/OS without a usable Metal device,
# in which case selecting "mps" would only fail later.
has_mps = torch.backends.mps.is_available()
# Preference order: Apple Metal first, then CUDA, then plain CPU.
device = "mps" if has_mps else "cuda" if has_gpu else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")

# Release cached CUDA memory left over from earlier runs in this kernel;
# there is nothing to release when no CUDA device is present.
if has_gpu:
    torch.cuda.empty_cache()

Python Platform: macOS-14.5-arm64-arm-64bit
PyTorch Version: 2.4.0

Python 3.11.9 (main, Apr 19 2024, 11:43:47) [Clang 14.0.6 ]
NVIDIA/CUDA GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


In [14]:
# Transcription settings read by the model-loading and transcription cells.
compute_type = "int8"  # 8-bit compute: low memory footprint, may reduce accuracy
batch_size = 16  # lower this if GPU memory runs out
model_size = "small"  # "large-v3" recommended for production
language = "en"  # language code passed to load_model() and transcribe()

In [15]:
# Load the WhisperX ASR model.
# The ASR backend (faster-whisper / CTranslate2) runs on "cpu" or "cuda" only,
# so "mps" cannot be passed here. Use CUDA when the notebook selected it and
# fall back to CPU otherwise, instead of always forcing CPU.
asr_device = "cuda" if device == "cuda" else "cpu"
model = whisperx.load_model(
    model_size, asr_device, language=language, compute_type=compute_type
)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.0. Bad things might happen unless you revert torch to 1.x.


In [16]:
audio_path = "./mp3_audio_files/audio.mp3"
# Decode the file once; the same waveform is reused for transcription here
# and for alignment in a later cell.
audio = whisperx.load_audio(audio_path)
# Pass the decoded waveform rather than the path: given a path, transcribe()
# would decode the file a second time.
result = model.transcribe(audio, batch_size=batch_size, language=language)

In [17]:
# Preview the transcription: print at most `max_segments` segments, taken from
# the start and the end of the transcript.
max_segments = 30  # max number of segments to print
segments = result["segments"]
if len(segments) <= max_segments:
    # Short transcript: print everything once. Slicing head and tail
    # separately would overlap here and print segments twice.
    preview = segments
else:
    head_count = max_segments // 2
    tail_count = max_segments - head_count
    # Index from len(...) rather than with a negative slice so that a
    # tail_count of 0 yields an empty tail instead of the whole list.
    preview = segments[:head_count] + segments[len(segments) - tail_count :]

for segment in preview:
    print(
        f"{segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\""
    )

   2.16,  21.37, "Retrieval augmented generation, or RAG, has become a key method for getting LMS to answer questions over a user's own data. But to actually build and productionize a high-quality RAG system, it helps a lot to have effective retrieval techniques to give the LMS highly relevant context to generate its answer."
  21.37,  43.83, "and also to have an effective evaluation framework to help you efficiently iterate and improve your RAG system both during initial development and during post deployment maintenance. This course covers two advanced retrieval methods, sentence window retrieval and auto-emerging retrieval that deliver a significantly better context to BLM than simpler methods."
  43.83,  63.22, "It also covers how to evaluate your LLM question-answering system with three evaluation metrics, contact relevance, droughtedness, and answer relevance. I'm excited to introduce Jerry Liu, co-founder and CO of Larmor and Text, and Andrew Plumdata, co-founder and chief scien

In [18]:
# Refine segment timings by aligning the transcript against the audio.
# The detected language is saved first because `result` is replaced by the
# alignment output and the language is written back onto it afterwards.
align_language = result["language"]
segments_to_align = result["segments"]

model_a, metadata = whisperx.load_align_model(
    language_code=align_language,
    device=device,
)

result = whisperx.align(
    segments_to_align,
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False,
)
result["language"] = align_language

In [19]:
# Preview the aligned transcript: print at most `max_segments` segments, taken
# from the start and the end of the result.
max_segments = 30  # max number of segments to print
segments = result["segments"]
if len(segments) <= max_segments:
    # Short transcript: print everything once. Slicing head and tail
    # separately would overlap here and print segments twice.
    preview = segments
else:
    head_count = max_segments // 2
    tail_count = max_segments - head_count
    # Index from len(...) rather than with a negative slice so that a
    # tail_count of 0 yields an empty tail instead of the whole list.
    preview = segments[:head_count] + segments[len(segments) - tail_count :]

for segment in preview:
    print(
        f"{segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\""
    )

   2.22,   9.56, "Retrieval augmented generation, or RAG, has become a key method for getting LMS to answer questions over a user's own data."
  10.30,  20.81, "But to actually build and productionize a high-quality RAG system, it helps a lot to have effective retrieval techniques to give the LMS highly relevant context to generate its answer."
  21.51,  32.30, "and also to have an effective evaluation framework to help you efficiently iterate and improve your RAG system both during initial development and during post deployment maintenance."
  32.64,  43.27, "This course covers two advanced retrieval methods, sentence window retrieval and auto-emerging retrieval that deliver a significantly better context to BLM than simpler methods."
  43.97,  53.23, "It also covers how to evaluate your LLM question-answering system with three evaluation metrics, contact relevance, droughtedness, and answer relevance."
  54.27,  62.42, "I'm excited to introduce Jerry Liu, co-founder and CO of Larmor 