In [39]:
from dotenv import load_dotenv
import os

# Load environment variables through python-dotenv before reading the token.
load_dotenv()

# --- Tunable settings -------------------------------------------------------
# Base name of the recording; reused for the input .wav and the output .txt.
target = "audio_session_3"
# Folder that receives the per-speaker audio chunks.
latest_ds_path = "../recordings/latest_ds"
# Language code handed to the openai-whisper transcribe() call.
language = "es"
# Backend switch: True -> Hugging Face pipeline, False -> openai-whisper.
source_hf = False

# --- Derived values ---------------------------------------------------------
# Recording to diarize and transcribe.
audio_file = f"../recordings/{target}.wav"
# Value of the `hf_token` environment variable (None when it is not set);
# passed to Pipeline.from_pretrained() when loading the diarization model.
access_token = os.environ.get("hf_token")

In [24]:
from pyannote.audio import Pipeline
import torch
import torchaudio
from pyannote.audio.pipelines.utils.hook import ProgressHook

In [None]:
# Pick the best available accelerator instead of assuming an Apple GPU:
# Metal Performance Shaders (Apple M series) first, then CUDA, then plain CPU.
# See: https://developer.apple.com/metal/pytorch/
if torch.backends.mps.is_available():
    diarization_device = torch.device("mps")
elif torch.cuda.is_available():
    diarization_device = torch.device("cuda")
else:
    diarization_device = torch.device("cpu")

# Load the pretrained speaker-diarization pipeline, authenticating with the
# token read in the configuration cell.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token = access_token
)
pipeline.to(diarization_device)

# The recording is loaded into memory and passed as a waveform dict; the
# path-based call `pipeline(audio_file)` is the alternative entry point.
waveform, sample_rate = torchaudio.load(audio_file)

# ProgressHook reports progress while the pipeline runs.
with ProgressHook() as hook:
    diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)


In [11]:
# Walk over every labelled track of the diarization result and report the
# speaker label together with the start/end time of the segment (in seconds).
labelled_tracks = diarization.itertracks(yield_label=True)
for segment, _, speaker in labelled_tracks:
    print(f"Speaker {speaker} speaks from {segment.start:.2f}s to {segment.end:.2f}s")

Speaker SPEAKER_05 speaks from 1.38s to 2.43s
Speaker SPEAKER_01 speaks from 2.92s to 8.67s
Speaker SPEAKER_01 speaks from 8.99s to 15.86s
Speaker SPEAKER_01 speaks from 16.26s to 33.80s
Speaker SPEAKER_01 speaks from 34.34s to 36.55s
Speaker SPEAKER_01 speaks from 38.34s to 39.59s
Speaker SPEAKER_01 speaks from 40.21s to 54.23s
Speaker SPEAKER_01 speaks from 55.06s to 62.52s
Speaker SPEAKER_01 speaks from 63.11s to 79.44s
Speaker SPEAKER_01 speaks from 80.02s to 83.71s
Speaker SPEAKER_01 speaks from 84.14s to 93.18s
Speaker SPEAKER_01 speaks from 93.59s to 94.60s
Speaker SPEAKER_01 speaks from 95.19s to 103.69s
Speaker SPEAKER_01 speaks from 104.05s to 109.26s
Speaker SPEAKER_01 speaks from 109.67s to 110.97s
Speaker SPEAKER_01 speaks from 111.52s to 113.35s
Speaker SPEAKER_01 speaks from 113.79s to 116.30s
Speaker SPEAKER_01 speaks from 116.84s to 123.78s
Speaker SPEAKER_01 speaks from 124.05s to 130.17s
Speaker SPEAKER_01 speaks from 130.74s to 132.35s
Speaker SPEAKER_01 speaks from

In [12]:
from pydub import AudioSegment

# pydub relies on the `audioop` module, which is no longer part of the
# standard library as of Python 3.13. On such interpreters, install the
# backport before importing pydub:
#     pip3 install audioop-lts
# See: https://github.com/jiaaro/pydub/issues/725#issuecomment-2439291764

# Load the full recording; it is sliced by millisecond offsets when the
# per-speaker chunks are exported.
audio = AudioSegment.from_file(audio_file)


In [13]:
# Make sure the chunk folder exists before writing into it; otherwise the
# first export fails on a fresh checkout.
os.makedirs(latest_ds_path, exist_ok=True)

# Write one WAV file per diarized segment. The file name carries the segment
# index `i` (used later to restore the order) and the speaker label (parsed
# back out of the name when building the transcript).
for i, (segment, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
    start_ms = int(segment.start * 1000)  # pydub slices are expressed in milliseconds
    end_ms = int(segment.end * 1000)
    speaker_audio = audio[start_ms:end_ms]
    speaker_audio.export(f"{latest_ds_path}/chunk_{i}_{speaker}.wav", format="wav")

In [None]:
# Path to the ffmpeg binary, installed with Homebrew on macOS.
# In zsh terminals Homebrew has to be added to the PATH manually, so the
# directory is appended here for the running kernel.
ffmpeg_path = "/opt/homebrew/bin/ffmpeg"


def add_to_path(directory):
    """Append `directory` to the PATH environment variable if it is missing.

    The call is idempotent: re-running the cell does not add the same entry
    again. The platform's own separator (`os.pathsep`) is used, and an unset
    or empty PATH is handled by setting it to `directory` alone.

    Parameters
    ----------
    directory : str
        Directory to make discoverable through PATH.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    os.environ["PATH"] = f"{current}{os.pathsep}{directory}" if current else directory


add_to_path(os.path.dirname(ffmpeg_path))

In [40]:
# openai-whisper backend: only loaded when the Hugging Face backend is not
# selected (source_hf is False). The model is stored in `openai_model`.
if not source_hf:
    import whisper
    openai_model  = whisper.load_model("turbo")   # MPS acceleration not supported

# Troubleshooting: if the following error appears
#   urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED]
#   certificate verify failed: unable to get local issuer certificate>
# do the following:
# 1. Open the folder /Applications/Python 3.x (x is the version you are running).
# 2. Double-click "Install Certificates.command". It opens a terminal and installs the certificates.

In [27]:
# Hugging Face backend: only built when source_hf is True. The resulting
# pipeline is stored in `hf_pipe`.
if source_hf:
    import torch
    from transformers import WhisperForConditionalGeneration
    from transformers import WhisperFeatureExtractor
    from transformers import WhisperTokenizer
    from transformers import pipeline as tpipe

    # Checkpoint and decoding settings, defined once instead of being
    # repeated as string literals in every call below.
    hf_model_id = "openai/whisper-large-v3-turbo"
    hf_language = "spanish"
    hf_task     = "transcribe"

    # Use the Apple GPU when available, otherwise CUDA, otherwise the CPU,
    # so the cell does not fail on machines without MPS.
    if torch.backends.mps.is_available():
        hf_device = "mps"
    elif torch.cuda.is_available():
        hf_device = "cuda"
    else:
        hf_device = "cpu"

    feature_extractor = WhisperFeatureExtractor.from_pretrained(hf_model_id)
    tokenizer = WhisperTokenizer.from_pretrained(
        hf_model_id,
        language = hf_language,
        task     = hf_task
    )

    hf_model = WhisperForConditionalGeneration.from_pretrained(hf_model_id)

    # NOTE(review): these prompt ids are computed but never handed to the
    # model or the pipeline in this notebook, so they do not influence
    # decoding. Kept only so the name stays defined for any external use.
    forced_decoder_ids = tokenizer.get_decoder_prompt_ids(
        language = hf_language,
        task     = hf_task
    )

    hf_pipe = tpipe(
        "automatic-speech-recognition",
        model = hf_model,
        feature_extractor = feature_extractor,
        tokenizer = tokenizer,
        device    = hf_device
    )

Device set to use mps


In [41]:
import re

def get_transcript(audio_chunk, source_hf):
    """Transcribe one exported chunk and prefix the text with its speaker label.

    Parameters
    ----------
    audio_chunk : str
        File name (not a full path) of a chunk inside `latest_ds_path`. It
        must contain a ``SPEAKER_<id>`` label directly before the ``.wav``
        extension, e.g. ``chunk_3_SPEAKER_01.wav``.
    source_hf : bool
        True  -> transcribe with the Hugging Face pipeline `hf_pipe`.
        False -> transcribe with the openai-whisper model `openai_model`.

    Returns
    -------
    str
        ``"<speaker label>: <text>"`` where the text is the ``"text"`` entry
        of the backend's result, unmodified.

    Raises
    ------
    ValueError
        If no speaker label can be parsed from `audio_chunk`.
    """
    # The label is everything from "SPEAKER_" up to the ".wav" extension.
    label_match = re.search(r"SPEAKER_.+(?=\.wav)", audio_chunk)
    if label_match is None:
        raise ValueError(
            f"Cannot find a SPEAKER_<id> label in chunk file name: {audio_chunk!r}"
        )
    speaker = label_match.group(0)

    chunk_path = f"{latest_ds_path}/{audio_chunk}"

    if source_hf:
        # Pass the configured language explicitly so this backend does not
        # fall back to per-chunk language detection, matching the
        # openai-whisper branch below.
        result = hf_pipe(
            chunk_path,
            generate_kwargs = {"language": language, "task": "transcribe"}
        )
    else:
        result = openai_model.transcribe(
            chunk_path,
            language = language,
            fp16     = False,
            verbose  = True
        )

    # Single quotes inside the f-string keep this line valid on Python
    # versions older than 3.12, which reject reusing the outer quote type.
    return f"{speaker}: {result['text']}"

In [43]:
# Only pick up files produced by the chunk-export cell, i.e. names of the form
# chunk_<index>_SPEAKER_<id>.wav. A plain substring test on ".wav" would also
# accept unrelated files (e.g. "x.wav.bak" or a stray recording), which would
# then crash the numeric sort key or the speaker-label parsing.
chunk_name_pattern = re.compile(r"chunk_\d+_SPEAKER_.+\.wav")
audio_chunks_list = [x for x in os.listdir(latest_ds_path) if chunk_name_pattern.fullmatch(x)]

# os.listdir() returns names in arbitrary order; sort numerically on the chunk
# index (second "_"-separated field) so that chunk_10 comes after chunk_9.
sorted_chunks     = sorted(audio_chunks_list, key=lambda s: int(s.split('_')[1]))

# One "<speaker>: <text>" line per chunk, in chunk-index order.
transcripts_list  = [get_transcript(x, source_hf=source_hf) for x in sorted_chunks]

In [44]:
# Combine the per-chunk transcript lines (kept in chunk-index order) into a
# single string, one line per chunk.
full_transcript = "\n".join(transcripts_list)

In [37]:
# Persist the transcript next to the recordings, named after the target.
# The output folder is created on demand, and the encoding is fixed to UTF-8
# so accented characters are written the same way on every platform instead
# of depending on the locale's default encoding.
transcripts_dir = "../transcripts"
os.makedirs(transcripts_dir, exist_ok=True)
with open(f"{transcripts_dir}/{target}.txt", "w", encoding="utf-8") as f:
    f.write(full_transcript)

In [38]:
# Remove the temporary chunk files that were just transcribed. A chunk that is
# already gone (e.g. because this cell was run twice) is skipped on purpose so
# the cleanup can be re-run safely; any other OS error still propagates.
for chunk_name in sorted_chunks:
    try:
        os.remove(f"{latest_ds_path}/{chunk_name}")
    except FileNotFoundError:
        continue