In [4]:
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.encoders import jsonable_encoder
from tempfile import NamedTemporaryFile
from faster_whisper import WhisperModel
from pyannote.core import Segment, Annotation, Timeline
from pyannote.audio import Pipeline
# import torch
import os

# Name of the Whisper checkpoint handed to faster-whisper.
model_size = "large-v2"

# Build the transcription model with compute_type="int8".
# The `device="cuda"` argument is left commented out of this call; add it
# back as a keyword argument to target a CUDA device.
model = WhisperModel(model_size, compute_type="int8")

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


In [5]:
#### Define diarization functions
def get_text_with_timestamp(transcribe_res):
    """Pair every transcription item with a ``Segment`` covering its time span.

    Args:
        transcribe_res: Iterable of items exposing ``start``, ``end`` and
            ``text`` attributes.

    Returns:
        A list of ``(Segment, text)`` tuples in input order. The segment
        bounds are the item's ``start`` and ``end`` rounded to two decimals.
    """
    return [
        (Segment(round(item.start, 2), round(item.end, 2)), item.text)
        for item in transcribe_res
    ]


def add_speaker_info_to_text(timestamp_texts, diarization):
    """Attach a speaker label to each timestamped piece of text.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        diarization: Object offering ``crop(segment)`` whose result offers
            ``argmax()`` (presumably a pyannote ``Annotation`` - confirm
            against the caller).

    Returns:
        A list of ``(segment, speaker, text)`` tuples in input order, where
        ``speaker`` is whatever ``diarization.crop(segment).argmax()``
        returns for that segment.
    """
    return [
        (segment, diarization.crop(segment).argmax(), text)
        for segment, text in timestamp_texts
    ]


def merge_cache(text_cache):
    """Collapse a run of cached entries into one combined entry.

    Args:
        text_cache: Non-empty sequence of ``(segment, speaker, text)``
            entries; the text is read from the last position of each entry.

    Returns:
        A ``(Segment, speaker, sentence)`` tuple: the segment runs from the
        first entry's start to the last entry's end, the speaker is taken
        from the first entry, and the sentence is the texts concatenated
        without a separator.

    Raises:
        IndexError: If ``text_cache`` is empty.
    """
    joined = ''.join(entry[-1] for entry in text_cache)
    head, tail = text_cache[0], text_cache[-1]
    span = Segment(head[0].start, tail[0].end)
    return span, head[1], joined


# Characters that close a sentence when they are the last character of a
# segment's text.
PUNC_SENT_END = ['.', '?', '!']


def merge_sentence(spk_text):
    """Merge consecutive speaker-labelled fragments into sentence-level entries.

    Fragments are accumulated until either the speaker changes or a fragment
    ends with one of ``PUNC_SENT_END``; each accumulated run is then collapsed
    with ``merge_cache``.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples in
            chronological order.

    Returns:
        A list of merged entries as produced by ``merge_cache``.
    """
    merged_spk_text = []
    pre_spk = None
    text_cache = []
    for seg, spk, text in spk_text:
        # Speaker changed: close the previous speaker's pending run before
        # starting a new one. A run whose last speaker is None (no speaker
        # found) is deliberately left open, matching the earlier behaviour.
        if text_cache and pre_spk is not None and spk != pre_spk:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        text_cache.append((seg, spk, text))
        pre_spk = spk

        # Checked for every fragment, including the first one after a speaker
        # change, so a complete sentence is never glued onto the following
        # one. The truthiness test keeps an empty text from raising
        # IndexError on text[-1].
        if text and text[-1] in PUNC_SENT_END:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

    # Flush whatever is left without closing punctuation.
    if text_cache:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text

def diarize_text(transcribe_res, diarization_result):
    """Run the full transcript-plus-diarization merge.

    The transcription items are converted to ``(segment, text)`` pairs,
    each pair is labelled with a speaker taken from ``diarization_result``,
    and the labelled fragments are merged into sentence-level entries.

    Args:
        transcribe_res: Transcription items accepted by
            ``get_text_with_timestamp``.
        diarization_result: Diarization object accepted by
            ``add_speaker_info_to_text``.

    Returns:
        The list returned by ``merge_sentence``.
    """
    return merge_sentence(
        add_speaker_info_to_text(
            get_text_with_timestamp(transcribe_res),
            diarization_result,
        )
    )
###

In [None]:
# Prefer a token that is already exported in the environment, so a real
# secret is never overwritten by the placeholder below and does not have to be
# pasted into the notebook. The placeholder is only used when the variable is
# not set at all; replace it (or export the variable) before loading the
# pipeline.
os.environ.setdefault("HUGGINGFACE_ACCESS_TOKEN", '<insert hf access token here>')

In [10]:
# Load the pretrained "pyannote/speaker-diarization-3.0" pipeline, passing the
# value of the HUGGINGFACE_ACCESS_TOKEN environment variable as the auth token
# (raises KeyError if that variable is not set).
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=os.environ["HUGGINGFACE_ACCESS_TOKEN"])

# Disabled alternative: build the pipeline from "config.yaml" instead
# (presumably a local config file - confirm).
# pipeline = Pipeline.from_pretrained("config.yaml")

torchvision is not available - cannot save figures


In [11]:
# Path of the audio file to process; replace the placeholder before running.
file_path = '<insert .mp3 file path here>'

In [18]:
# Transcribe the audio file (beam_size=5), then run the diarization pipeline
# on the same file, telling it to expect exactly two speakers.
segments, info = model.transcribe(file_path, beam_size=5)
diarization = pipeline(file_path, num_speakers=2)

In [19]:
# Materialise `segments` into a list so it can be indexed and iterated more
# than once. NOTE(review): faster-whisper appears to return a lazy generator
# from `transcribe`, so this is presumably where decoding actually runs - confirm.
segments = list(segments)

In [20]:
# Combine the transcript segments with the diarization result into a list of
# merged (segment, speaker, text) entries.
output = diarize_text(segments, diarization)

In [25]:
# Inspect the first merged (segment, speaker, text) entry.
output[0]

(<Segment(0, 20.72)>,
 'SPEAKER_00',
 ' In chess there are sort of two main strategies.')

In [38]:
# Convert every merged entry to plain strings so the result is easy to read
# and serialise: the segment is replaced by its str() form (the
# "[ HH:MM:SS.mmm -->  HH:MM:SS.mmm]" text shown in the cell output), while the
# speaker label and text are kept unchanged.
output_formatted = [
    (str(segment), speaker, text)
    for segment, speaker, text in output
]

output_formatted

[('[ 00:00:00.000 -->  00:00:20.720]',
  'SPEAKER_00',
  ' In chess there are sort of two main strategies.'),
 ('[ 00:00:20.720 -->  00:00:24.320]',
  'SPEAKER_00',
  ' There are people who just play the same openings every single time.'),
 ('[ 00:00:24.320 -->  00:00:29.360]',
  'SPEAKER_00',
  " They're basically just saying, yeah, I trust in my prep."),
 ('[ 00:00:29.360 -->  00:00:31.240]',
  'SPEAKER_00',
  ' I trust in my strategies.'),
 ('[ 00:00:31.240 -->  00:00:36.400]',
  'SPEAKER_00',
  ' There are others who try and adjust it more.'),
 ('[ 00:00:36.400 -->  00:00:37.920]',
  'SPEAKER_00',
  ' And frankly, I like both.'),
 ('[ 00:00:37.920 -->  00:00:39.800]',
  'SPEAKER_00',
  " That's a part of football as well."),
 ('[ 00:00:39.800 -->  00:00:47.400]',
  'SPEAKER_01',
  " There's some prep, you know, potentially what your opponent could do, what kind of your plan going into the game, how you want to attack, how you want to defend, how you want to press."),
 ('[ 00:00:47.