<a href="https://colab.research.google.com/github/danielmlow/tutorials/blob/main/speech/diarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diarization

Here is an alternative tutorial: https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/applying_a_pipeline.ipynb



In [None]:
# install pyannote package
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip


In [None]:
import pyannote.audio
print(pyannote.audio.__version__) # Use Python 3.10.12 and pyannote-audio==2.1.1

!python --version

# Load packages and mount your Google Drive as the working directory on Colab, or use local paths

In [None]:
'''
Authors: Daniel M. Low
License: See license in github repository
'''

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from pyannote.audio import Pipeline
from IPython import display
import ipywidgets as widgets


# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# pd.options.display.width = 0


# Set to False to read from / write to the local ./data folders instead of Google Drive.
on_colab = True

if not on_colab:
  # Local run: paths are relative to the notebook's working directory.
  input_dir = './data/blake_16khz_remove_ra/'
  output_dir = './data/'
else:
  # Colab run: mount Google Drive and point at the project folders inside it.
  from google.colab import drive
  project_name = 'blake'
  drive.mount('/content/drive')
  input_dir = f'/content/drive/MyDrive/datum/{project_name}/data/input/samples_freespeech/'
  output_dir = f'/content/drive/MyDrive/datum/{project_name}/data/output/'

# Create the output folder if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)


In [None]:
# 1. visit hf.co/pyannote/speaker-diarization and accept user conditions
# 2. visit hf.co/pyannote/segmentation and accept user conditions
# 3. visit hf.co/settings/tokens to create an access token
# 4. instantiate pretrained speaker diarization pipeline
pyannote_token = '' # Add access token here as a string


# os.listdir returns entries in arbitrary order; sort them so that the subset
# processed later (files[:2]) is the same on every run.
files = sorted(os.listdir(input_dir))
files


In [None]:
# Download the pretrained speaker-diarization pipeline, authenticating with
# the Hugging Face access token stored in pyannote_token.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization",
                                    use_auth_token=pyannote_token)

# from_pretrained may print a hint and return None instead of raising when the
# download is refused (e.g. empty/invalid token or user conditions not accepted).
# Fail here with a clear message rather than later with
# "'NoneType' object is not callable".
if pipeline is None:
  raise RuntimeError(
      "Could not load 'pyannote/speaker-diarization'. Set pyannote_token to a valid "
      "Hugging Face access token and accept the user conditions of the model."
  )



In [None]:
%%time
#Takes about 5 minutes per minute of recording using the colab T4 GPU.

for file in files[:2]: #just do with 2 examples
  # apply pretrained pipeline
  print(file)
  display.display(display.Audio(input_dir+file, autoplay=False), widgets.Textarea('')) # Display speech
  diarization = pipeline(input_dir+file, min_speakers =1, max_speakers = 2) # or just use num_speakers if known (or speakers if that doesnt work) and remove min_speakers and max_speakers

  # print the result (you can write to a txt, append to list, etc.)
  for turn, _, speaker in diarization.itertracks(yield_label=True):
      print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

  with open(output_dir+"audio.rttm", "w") as rttm:
      diarization.write_rttm(rttm)