In [2]:
from pyannote.audio import Pipeline
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a .env file (if one is found) into the process environment,
# so the os.getenv("hf_token") lookup below can pick up the access token.
load_dotenv()
 
# Run configuration. `target` is the base name (without extension) of the
# recording to process; it is also used to name the transcript file.
target         = "audio_session_3"
# Must be an f-string: without the prefix the path literally contains "{target}".
audio_file     = f"../recordings/{target}.wav"
# Access token read from the environment; None when "hf_token" is not set.
access_token   = os.getenv("hf_token")
# Directory that receives the per-turn audio chunks.
latest_ds_path = "../recordings/latest_ds"
# Language code handed to the transcription model.
language       = "en"

In [None]:
# Build the pretrained speaker-diarization pipeline, authenticating with the
# token read from the environment, then run it on the configured recording.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", token=access_token)
diarization = pipeline(audio_file)

  std = sequences.std(dim=-1, correction=1)


In [19]:
# Report every diarized turn: speaker label plus start/end time (2 decimals).
for track in diarization.itertracks(yield_label=True):
    segment, _, speaker = track
    print(f"Speaker {speaker} speaks from {segment.start:.2f}s to {segment.end:.2f}s")

Speaker SPEAKER_00 speaks from 1.80s to 2.65s
Speaker SPEAKER_01 speaks from 2.76s to 3.64s
Speaker SPEAKER_00 speaks from 3.95s to 6.12s
Speaker SPEAKER_01 speaks from 6.44s to 10.27s
Speaker SPEAKER_00 speaks from 10.27s to 12.27s
Speaker SPEAKER_01 speaks from 12.55s to 16.28s
Speaker SPEAKER_00 speaks from 16.43s to 19.99s
Speaker SPEAKER_00 speaks from 21.16s to 21.24s
Speaker SPEAKER_01 speaks from 21.24s to 22.22s
Speaker SPEAKER_01 speaks from 22.80s to 25.87s
Speaker SPEAKER_00 speaks from 25.38s to 25.51s


In [None]:
from pydub import AudioSegment

# Environment note: to get pydub working I needed to run
#   pip3 install audioop-lts
# See: https://github.com/jiaaro/pydub/issues/725#issuecomment-2439291764

# Load the recording so it can be sliced per speaker turn in the next cell.
audio = AudioSegment.from_file(audio_file)


In [85]:
# Cut the recording into one WAV file per diarized turn.
# Ensure the output directory exists first; otherwise export() fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(latest_ds_path, exist_ok=True)

for i, (segment, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
    # The audio is sliced by millisecond offsets; segment times are in seconds.
    start_ms = int(segment.start * 1000)
    end_ms = int(segment.end * 1000)
    speaker_audio = audio[start_ms:end_ms]
    # File name encodes turn order and speaker label: chunk_<i>_<speaker>.wav
    # (the transcription cells parse both parts back out of the name).
    speaker_audio.export(f"{latest_ds_path}/chunk_{i}_{speaker}.wav", format="wav")

In [None]:
import whisper

# Path to the ffmpeg binary (installed with Homebrew on macOS).
# In zsh terminals Homebrew may not be on PATH, so the directory that holds
# ffmpeg is appended manually.
ffmpeg_path = "/opt/homebrew/bin/ffmpeg"
ffmpeg_dir = os.path.dirname(ffmpeg_path)

# Append only when missing so re-running this cell does not keep growing PATH.
# os.pathsep is used instead of a hard-coded ":", and a missing PATH variable
# is tolerated.
current_path = os.environ.get("PATH", "")
if ffmpeg_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{ffmpeg_dir}" if current_path else ffmpeg_dir
    )

In [12]:
# Load the Whisper "turbo" model used for transcription below.
model  = whisper.load_model("turbo")

# If the following error appears:
#   urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED]
#   certificate verify failed: unable to get local issuer certificate>
# do the following:
# 1. Open the folder /Applications/Python 3.x (x is the version you are running).
# 2. Double-click "Install Certificates.command". It will open a terminal and install the certificate.

In [70]:
import re

def get_transcript(audio_chunk):
    """Transcribe one exported chunk and prefix the text with its speaker label.

    Args:
        audio_chunk: File name (not a full path) of a chunk inside
            ``latest_ds_path``, e.g. ``chunk_3_SPEAKER_01.wav``.

    Returns:
        A string of the form ``"SPEAKER_<id>: <text>"`` where ``<text>`` is the
        ``"text"`` entry of the transcription result, unmodified.

    Raises:
        ValueError: If the file name has no ``SPEAKER_<id>.wav`` part.
    """
    # Same pattern as before, written as a raw string to avoid double escaping.
    match = re.search(r"(?<=SPEAKER_).+(?=\.wav)", audio_chunk)
    if match is None:
        # Fail with a message that names the offending file instead of an
        # AttributeError on None.
        raise ValueError(
            f"Cannot extract speaker id from chunk name: {audio_chunk!r}"
        )
    speaker = f"SPEAKER_{match.group(0)}"

    result = model.transcribe(
        f"{latest_ds_path}/{audio_chunk}",
        language=language,
        fp16=False,
        verbose=True,
    )

    # Single quotes inside the f-string: reusing the enclosing quote character
    # is a SyntaxError on Python versions before 3.12.
    return f"{speaker}: {result['text']}"

In [None]:
# Collect the exported chunks and transcribe them in turn order.
# Only files named chunk_<index>_<label>.wav are considered: a plain
# '".wav" in name' test also matches names such as "x.wav.bak", and any stray
# .wav file without a numeric index would crash the sort key.
chunk_name_pattern = re.compile(r"chunk_(\d+)_.+\.wav")

audio_chunks_list = [
    name for name in os.listdir(latest_ds_path)
    if chunk_name_pattern.fullmatch(name)
]
# Sort numerically by chunk index (a plain string sort would put 10 before 2).
sorted_chunks     = sorted(
    audio_chunks_list,
    key=lambda name: int(chunk_name_pattern.fullmatch(name).group(1)),
)

transcripts_list  = [get_transcript(name) for name in sorted_chunks]

[00:00.000 --> 00:00.840]  Good morning.
[00:00.000 --> 00:00.660]  Hi, good morning.
[00:00.000 --> 00:02.340]  Can you believe how much snow is outside?
[00:00.000 --> 00:03.480]  It looks like a lot. I can't believe they were accurate with their predictions.
[00:00.000 --> 00:01.720]  I know. Have you been outside?
[00:00.000 --> 00:03.560]  I have. Your cat was hiding underneath the stairs out back.
[00:00.000 --> 00:01.240]  Oh my goodness.
[00:02.360 --> 00:03.560]  I think we should go sledding.
[00:00.000 --> 00:29.980]  Thank you.
[00:00.000 --> 00:00.500]  Hmm.
[00:00.000 --> 00:01.480]  I don't feel like slitting.
[00:02.480 --> 00:02.900]  Thank you.
[00:00.000 --> 00:03.020]  this


In [74]:
# Combine the per-chunk transcript strings into one text, one chunk per line.
full_transcript = "\n".join(transcripts_list)

In [80]:
# Save the combined transcript, named after the processed recording.
transcript_path = f"../transcripts/{target}.txt"
# Create the output directory if it is missing so open() does not fail.
os.makedirs(os.path.dirname(transcript_path), exist_ok=True)
# Pin the encoding: the platform default may not be able to encode every
# character in the transcript.
with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(full_transcript)

In [86]:
# Clean-up: delete every chunk file that was listed for transcription.
chunk_paths = [f"{latest_ds_path}/{chunk_name}" for chunk_name in sorted_chunks]
for chunk_path in chunk_paths:
    os.remove(chunk_path)