# Vast Notebook for Diarization


In [1]:
import time
# Wall-clock timestamp captured when the notebook starts; a later cell
# subtracts it from a second timestamp to report the setup/import duration.
load_ts = time.time()

In [None]:
# install ffmpeg
# !conda install -y -q ffmpeg

In [None]:
# optional: confirm that ffmpeg is installed
# ! ffmpeg

In [None]:
# !pip install --user jiwer
# !pip install --user openai-whisper

In [None]:
# %pip install git+https://github.com/m-bain/whisperx.git

In [None]:
# !pip install pyannote.audio==3.0.1
# !pip uninstall onnxruntime
# !pip install --force-reinstall onnxruntime-gpu

In [24]:
import os  # imported here because this cell precedes the notebook's main import cell

# Hugging Face access token handed to the diarization pipeline.
# Read it from the environment so the secret is never stored in the notebook;
# when the variable is unset this falls back to the previous default of "".
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")

In [5]:
import gc
import json
import os
import whisperx
import gc 
from whisperx import load_align_model, align
from whisperx.diarize import DiarizationPipeline, assign_word_speakers

In [8]:
# Report how long everything since `load_ts` took, in minutes (2 decimals).
load_te = time.time()
load_duration = float(format((load_te - load_ts) / 60.0, ".2f"))
print(f"load duration: {load_duration} min")

load duration: 10.21 min


In [61]:
# Directory layout: podcasts are read from <root>/podcasts and transcripts
# are written to <root>/transcripts.
root_dir = "/workspace"
podcast_dir = os.path.join(root_dir, "podcasts")
out_dir = os.path.join(root_dir, "transcripts")

# Sanity-check that the expected directories are present.
print(f"root dir exists:\t{os.path.exists(root_dir)}")
print(f"podcast dir exists:\t{os.path.exists(podcast_dir)}")
print(f"out dir exists:\t{os.path.exists(out_dir)}")

# Full paths of every .mp3 file found directly inside the podcast directory.
mp3_files = [
    os.path.join(podcast_dir, file_name)
    for file_name in os.listdir(podcast_dir)
    if file_name.endswith(".mp3")
]
print(f"mp3_files: {mp3_files}")

root dir exists:	True
podcast dir exists:	True
out dir exists:	True
mp3_files: ['/workspace/podcasts/2023_09_08.mp3', '/workspace/podcasts/2023_09_15.mp3', '/workspace/podcasts/2023_09_22.mp3']


In [47]:
def timeit(method):
    """Decorator that prints how long each call to ``method`` takes, in minutes.

    The wrapped function's return value is passed through unchanged. The
    elapsed wall-clock time is rounded to two decimals and printed once the
    call has returned; nothing is printed if ``method`` raises.

    Args:
        method: The callable to time.

    Returns:
        A wrapper with the same call signature, name and docstring as
        ``method``.
    """
    import functools  # local import keeps this cell self-contained

    # functools.wraps keeps __name__/__doc__ of the decorated function, so it
    # no longer shows up as "timed" in tracebacks and introspection.
    @functools.wraps(method)
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        duration = (te - ts) / 60.0
        duration = float(f"{duration:.2f}")
        print(f"{method.__name__} running time: {duration} min ")
        return result
    return timed

In [62]:
%%time
device = "cuda" 
audio_file = mp3_files[0]
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)


CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.3 µs


In [63]:
# Show which audio file was selected for the step-by-step cells that follow.
audio_file

'/workspace/podcasts/2023_09_08.mp3'

In [64]:
# STEP 1 ------------------------------
# 1. Transcribe with original whisper (batched)
# Load the "medium" model on `device`, pinned to English.
model = whisperx.load_model("medium", device, compute_type=compute_type, language="en")

# Load the audio once; the same `audio` object is reused by the alignment
# and diarization steps.
audio = whisperx.load_audio(audio_file)

# Transcribe exactly once. This cell used to call transcribe() twice in a row
# and throw away the first result, which doubled its run time.
result = model.transcribe(audio, batch_size=batch_size, language="en")
print(result["segments"]) # before alignment

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
[{'text': ' Jeep Adventure Days is going on now. Hurry in for great deals on a great selection of Jeep brand vehicles. And right now, get 10% below MSRP for an average of $6,960 under MSRP on the purchase of a 2023 Jeep Wrangler 4xe. Not compatible with lease offers or with any other consumer incentive offers, $6,960 average based on 10% below average MSRP from all 2023 Wrangler 4xe models in dealer stock. Residency restrictions apply. Take retail delivery from dealer stock by 1031. Jeep is a registered trademark.', 'start': 0.503, 'end': 30.282}, {'text': " You clap better than anyone. I know, but I once did... No one normally gets to hear your clap, but we always start each recording with a clap, so that's Mark doing a clap, and my clap is slightl

In [21]:
# STEP 2 ------------------------------
# 2. Align whisper output
# The alignment model is chosen from the language recorded in the
# transcription result.
model_a, metadata = whisperx.load_align_model(
    language_code=result["language"],
    device=device,
)

# NOTE(review): `result` is overwritten with the aligned output here; if that
# output has no "language" key, re-running this cell without re-running
# Step 1 would fail -- confirm before relying on re-execution.
result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False,
)

print(result["segments"]) # after alignment

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:06<00:00, 54.2MB/s] 


[{'start': 0.543, 'end': 1.984, 'text': ' Jeep Adventure Days is going on now.', 'words': [{'word': 'Jeep', 'start': 0.543, 'end': 0.723, 'score': 0.576}, {'word': 'Adventure', 'start': 0.763, 'end': 1.083, 'score': 0.937}, {'word': 'Days', 'start': 1.103, 'end': 1.283, 'score': 0.65}, {'word': 'is', 'start': 1.344, 'end': 1.384, 'score': 0.996}, {'word': 'going', 'start': 1.424, 'end': 1.624, 'score': 0.886}, {'word': 'on', 'start': 1.684, 'end': 1.764, 'score': 0.8}, {'word': 'now.', 'start': 1.824, 'end': 1.984, 'score': 0.926}]}, {'start': 2.104, 'end': 4.866, 'text': 'Hurry in for great deals on a great selection of Jeep brand vehicles.', 'words': [{'word': 'Hurry', 'start': 2.104, 'end': 2.304, 'score': 0.971}, {'word': 'in', 'start': 2.324, 'end': 2.384, 'score': 0.887}, {'word': 'for', 'start': 2.424, 'end': 2.524, 'score': 0.943}, {'word': 'great', 'start': 2.564, 'end': 2.744, 'score': 0.752}, {'word': 'deals', 'start': 2.784, 'end': 3.005, 'score': 0.868}, {'word': 'on', 'st

In [27]:
# STEP 3 ------------------------------
# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=HUGGINGFACE_TOKEN, device=device)

# add min/max number of speakers if known, e.g.
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
diarize_segments = diarize_model(audio, min_speakers=2)

# Attach speaker labels from the diarization output to the aligned transcript.
result2 = whisperx.assign_word_speakers(diarize_segments, result)

# Summarise the distinct speakers. Segments without a "speaker" key are
# skipped instead of raising KeyError (presumably a segment that overlaps no
# diarization turn is left unlabeled -- confirm against the whisperx version
# in use). Sorting makes the printed list deterministic.
print(f'Speakers: {sorted({seg["speaker"] for seg in result2["segments"] if "speaker" in seg})}')
print(result2["segments"]) # segments are now assigned speaker IDs


Speakers: ['SPEAKER_15', 'SPEAKER_10', 'SPEAKER_18', 'SPEAKER_17', 'SPEAKER_05', 'SPEAKER_04', 'SPEAKER_20', 'SPEAKER_13', 'SPEAKER_09', 'SPEAKER_06', 'SPEAKER_01', 'SPEAKER_16', 'SPEAKER_14', 'SPEAKER_11', 'SPEAKER_03', 'SPEAKER_02', 'SPEAKER_00', 'SPEAKER_08', 'SPEAKER_12', 'SPEAKER_07', 'SPEAKER_19']
[{'start': 0.543, 'end': 1.984, 'text': ' Jeep Adventure Days is going on now.', 'words': [{'word': 'Jeep', 'start': 0.543, 'end': 0.723, 'score': 0.576, 'speaker': 'SPEAKER_15'}, {'word': 'Adventure', 'start': 0.763, 'end': 1.083, 'score': 0.937, 'speaker': 'SPEAKER_15'}, {'word': 'Days', 'start': 1.103, 'end': 1.283, 'score': 0.65, 'speaker': 'SPEAKER_15'}, {'word': 'is', 'start': 1.344, 'end': 1.384, 'score': 0.996, 'speaker': 'SPEAKER_15'}, {'word': 'going', 'start': 1.424, 'end': 1.624, 'score': 0.886, 'speaker': 'SPEAKER_15'}, {'word': 'on', 'start': 1.684, 'end': 1.764, 'score': 0.8, 'speaker': 'SPEAKER_15'}, {'word': 'now.', 'start': 1.824, 'end': 1.984, 'score': 0.926, 'speaker

In [46]:
import json


In [65]:

@timeit
def process_audio_file(audio_file):
    """Transcribe, align and diarize a single audio file with WhisperX.

    Relies on the notebook-level settings ``device``, ``compute_type``,
    ``batch_size`` and ``HUGGINGFACE_TOKEN``. All three models are loaded
    inside the function, i.e. on every call.

    Args:
        audio_file: Path of the audio file to process.

    Returns:
        The value returned by ``whisperx.assign_word_speakers``: the aligned
        transcription with speaker labels attached.
    """
    print(f"processing audio file: {audio_file}")
    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("medium", device, compute_type=compute_type, language="en")
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size, language="en")
    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    # 3. Assign speaker labels
    # Pass the token variable. The earlier code passed the literal string
    # "HUGGINGFACE_TOKEN", so the real token was never used.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=HUGGINGFACE_TOKEN, device=device)
    # add max_speakers=... here as well if the speaker count is known
    diarize_segments = diarize_model(audio, min_speakers=2)

    # Segments (and their words) come back with speaker IDs attached.
    return whisperx.assign_word_speakers(diarize_segments, result)


In [49]:
def list_to_text(result):
    """Serialize the ``"segments"`` entry of ``result`` to a JSON string."""
    return json.dumps(result["segments"])
    
def save_transcript(path, data):
    """Write the string ``data`` to ``path``, replacing any existing content."""
    with open(path, "w") as transcript_file:
        transcript_file.write(data)


In [68]:
# mp3_files = [x for x in os.listdir(podcast_dir) if x.endswith(".mp3")]
# mp3_file = mp3_files[0]
# mp3_file_path = os.path.join(podcast_dir, mp3_file) 
# # print(mp3_file_path)
# result = process_audio_file(mp3_file_path)
# transcript = list_to_text(result)
# name = mp3_file.replace(".mp3", "")
# path = os.path.join(out_dir, name)
# print(f"saving transcript of {name}...")
# save_transcript(path, transcript)


processing audio file: /workspace/podcasts/2023_09_08.mp3


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
process_audio_file running time: 3.62 min 
saving transcript of 2023_09_08...


In [None]:
# Batch run: transcribe + diarize every podcast episode and save one
# transcript file per episode in `out_dir`.

# Create the output directory up front so a long run cannot fail at save time.
os.makedirs(out_dir, exist_ok=True)

# Bare file names (not full paths); sorted so the processing order is stable.
mp3_files = sorted(x for x in os.listdir(podcast_dir) if x.endswith(".mp3"))

for mp3_file in mp3_files:
    print(f"processing mp3_file: {mp3_file} ...")
    mp3_file_path = os.path.join(podcast_dir, mp3_file)
    # Pass the full path: the bare file name only resolves when the kernel's
    # working directory happens to be `podcast_dir`.
    result = process_audio_file(mp3_file_path)
    transcript = list_to_text(result)
    # Swap only the trailing extension; str.replace would rewrite every
    # ".mp3" occurrence inside the name.
    # NOTE(review): list_to_text emits a single JSON array, so the file holds
    # plain JSON rather than JSON Lines -- confirm ".jsonl" is what consumers expect.
    name = os.path.splitext(mp3_file)[0] + ".jsonl"
    path = os.path.join(out_dir, name)
    print(f"saving transcript of {name}...")
    save_transcript(path, transcript)
