# openai whisper + pyannote

* https://github.com/openai/whisper
* https://github.com/pyannote/pyannote-audio

In [1]:
import datetime, glob, os
import subprocess
# send pipeline to GPU (when available)
import torch
import whisper
import pyannote.audio
from sklearn.cluster import AgglomerativeClustering
from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## whisper speech to text

In [2]:
def run_whisper(run=False):
    """Build (and optionally run) one whisper CLI command per mp4 file.

    Every ``*.mp4`` in the current directory is considered and its
    extension-less name is printed. A file is skipped when a path with that
    name already exists, because that name is the output location handed to
    whisper through ``-o``.

    Args:
        run: When True, execute the commands one after another and stop at
            the first failure. Defaults to False, in which case nothing is
            executed and the commands are only returned.

    Returns:
        A list of argument lists, one per file that still has to be
        transcribed.

    Raises:
        subprocess.CalledProcessError: If ``run`` is True and a command
            exits with a non-zero status.
    """
    commands = []
    for f in glob.glob('*.mp4'):
        name = os.path.splitext(f)[0]
        print(name)
        if os.path.exists(name):
            continue
        # Argument lists instead of one shell string: file names containing
        # spaces or shell metacharacters reach whisper unchanged.
        commands.append(['whisper', f, '--device', 'cuda', '--model', 'medium',
                         '--language', 'en', '-o', name])
    if run:
        for cmd in commands:
            subprocess.run(cmd, check=True)
    return commands

## basic diarization with pyannote

In [None]:
from pyannote.audio import Pipeline
# Load the pretrained diarization pipeline. The Hugging Face access token is
# read from the HF_TOKEN environment variable instead of being stored in the
# notebook; when it is unset, None is passed.
# NOTE(review): a token was previously committed here in plain text — it
# should be revoked on huggingface.co.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=os.environ.get("HF_TOKEN"))
# send pipeline to GPU (when available)
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# apply pretrained pipeline; the speaker count is an option of the call,
# not of from_pretrained
diarization = pipeline("audio.wav", num_speakers=2)

# print the result
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

In [11]:
# Save the diarization result to audio.rttm through its write_rttm method;
# the file is opened for writing, so an existing audio.rttm is overwritten.
with open("audio.rttm", "w") as rttm:
    diarization.write_rttm(rttm)

## extract speakers with whisper/pyannote

https://colab.research.google.com/drive/1V-Bt5Hm2kjaDb4P1RyMSswsDKyrzc2-3?usp=sharing#scrollTo=buGt4moR5Mac

In [2]:
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model used by extract_speakers to turn each transcript
# segment into a vector. Runs on the GPU when one is available and falls
# back to the CPU otherwise, so the notebook also works on CPU-only hosts.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [3]:
language = 'English' #@param ['any', 'English']
model_size = 'medium' #@param ['tiny', 'base', 'small', 'medium', 'large']
# Derive the checkpoint name from the two parameters above instead of
# hard-coding it: the English-only '<size>.en' checkpoints are selected for
# English input, except for 'large', which has no '.en' variant.
model_name = model_size
if language == 'English' and model_size != 'large':
    model_name += '.en'
model = whisper.load_model(model_name)

In [4]:
def extract_speakers(model, path, num_speakers=2):
    """Transcribe an audio file and tag each segment with a speaker label.

    The input is first down-mixed to a single channel with ffmpeg (written
    to ``mono.wav`` in the current directory, overwriting any existing
    file), then transcribed. Each transcript segment is embedded with the
    module-level ``embedding_model`` and the embeddings are clustered into
    speakers.

    Args:
        model: Object whose ``transcribe(path)`` returns a dict with a
            ``"segments"`` list; each segment is a dict with ``"start"``
            and ``"end"`` times.
        path: Path of the audio file to process.
        num_speakers: Number of clusters (speakers) to form. It is capped
            at the number of segments.

    Returns:
        The list of segments, each extended in place with a ``"speaker"``
        entry of the form ``'SPEAKER <n>'`` (numbering starts at 1). An
        empty transcript is returned unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    mono = 'mono.wav'
    # Argument list, no shell: paths containing spaces or shell
    # metacharacters are passed to ffmpeg unchanged.
    subprocess.check_output(['ffmpeg', '-i', path, '-y', '-ac', '1', mono])
    segments = model.transcribe(mono)["segments"]
    if not segments:
        # Nothing to embed or cluster.
        return segments

    with contextlib.closing(wave.open(mono, 'r')) as f:
        duration = f.getnframes() / float(f.getframerate())

    audio = Audio()

    def segment_embedding(segment):
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment
        end = min(duration, segment["end"])
        waveform, _ = audio.crop(mono, Segment(start, end))
        # [None] adds a leading batch axis before calling the model
        return embedding_model(waveform[None])

    # Stack one row per segment; the width comes from the model output
    # rather than a hard-coded constant. NaNs are replaced with zeros
    # before clustering.
    embeddings = np.vstack([segment_embedding(s) for s in segments])
    embeddings = np.nan_to_num(embeddings.astype(np.float64))

    # Clustering cannot form more clusters than there are samples and needs
    # at least two samples; with a single cluster everyone is speaker 1.
    n_clusters = min(num_speakers, len(segments))
    if n_clusters < 2:
        labels = [0] * len(segments)
    else:
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)
    return segments

def write_segments(segments, outfile):
    """Write speaker-labelled segments to a text file.

    Consecutive segments by the same speaker are joined on one line. Each
    time the speaker changes (and for the first segment) a header line with
    the speaker label and the segment start time, rounded to whole seconds
    and formatted as ``H:MM:SS``, is written first.

    Args:
        segments: Sequence of dicts with ``"speaker"``, ``"start"``
            (seconds) and ``"text"`` entries.
        outfile: Path of the file to write; it is overwritten if it exists.
    """

    def timestamp(secs):
        return datetime.timedelta(seconds=round(secs))

    # Sentinel that never equals a real label, so the first segment always
    # gets a header.
    previous_speaker = object()
    # `with` closes the file even if a segment is malformed; UTF-8 keeps
    # non-ASCII transcript text writable regardless of the locale.
    with open(outfile, "w", encoding="utf-8") as f:
        for segment in segments:
            speaker = segment["speaker"]
            if speaker != previous_speaker:
                f.write("\n" + speaker + ' '
                        + str(timestamp(segment["start"])) + '\n')
            previous_speaker = speaker
            text = segment["text"]
            # Drop the single leading space when present, without eating
            # the first character of text that has none.
            if text.startswith(' '):
                text = text[1:]
            f.write(text + ' ')


In [None]:
# Single-file run: label the segments of vidal.wav with speakers (default of
# two speakers) and write the result to transcript.txt.
seg = extract_speakers(model, 'vidal.wav')
write_segments(seg, 'transcript.txt')

## transcribe wav files with speaker labels

In [None]:
# Batch run: write <name>.txt for every wav file in the current directory
# that does not have one yet.
# 'mono.wav' is excluded: it is the scratch file extract_speakers writes
# into this same directory, so it would otherwise be picked up by the glob
# and handed back to ffmpeg as both input and output.
files = [f for f in glob.glob('*.wav') if f != 'mono.wav']
for f in files:
    name = os.path.splitext(f)[0]
    out = '%s.txt' % name
    if not os.path.exists(out):
        print(name)
        seg = extract_speakers(model, f)
        write_segments(seg, out)