## This notebook does the diarization and has functions for joining files and extracting text for replacement

It uses:
- [pyannote](https://pypi.org/project/pyannote.audio/) (and its associated dependencies)

You will need to download and install microsoft/Phi-3-medium-4k-instruct-onnx-directml and update the model_path below. If you do not have a GPU or are not using Windows, see the Phi-3 documentation and set up the model variant that matches your hardware and operating system.

Pyannote.audio is gated on huggingface and requires an account and access key. See https://huggingface.co/pyannote/speaker-diarization-3.1 for instructions.

In [None]:
%pip install --upgrade pyannote.audio

In [2]:
from pyannote.audio import Pipeline
import os
import shutil
import torch

# Load the pretrained speaker-diarization pipeline. The model is gated, so an
# access token is required. The token is read from the HF_TOKEN environment
# variable when it is set, so the key does not have to be saved inside the
# notebook; otherwise the placeholder below must be replaced by hand.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN", "[YOUR HF KEY HERE]"))

# Send pipeline to GPU when one is available; fall back to the CPU instead of
# failing on machines without CUDA.
diarization_pipeline.to(
    torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def process_audio(file_path, output_folder):
    """Diarize one audio file and write the result as ``<base name>.rttm``.

    Args:
        file_path: Path of the audio file to diarize.
        output_folder: Directory that receives the RTTM file. It is created
            when it does not exist yet.

    The RTTM file is named after the original file name (spaces included).
    When the file name contains spaces, the pipeline is run on a temporary
    copy whose name has the spaces removed; the copy is always cleaned up,
    even when diarization fails.
    """
    # Local import: only needed for the space-free working copy.
    import tempfile

    base_name, extension = os.path.splitext(os.path.basename(file_path))
    audio_path = file_path
    scratch_dir = None

    try:
        if ' ' in base_name:
            # Keep the same space-free file name as before (presumably the
            # name ends up in the RTTM output -- TODO confirm), but place the
            # copy in a private temp directory. Copying next to the original
            # could overwrite, and then delete, a real audio file that
            # already has the space-free name (e.g. "a b.mp3" vs "ab.mp3").
            scratch_dir = tempfile.mkdtemp()
            audio_path = os.path.join(
                scratch_dir, base_name.replace(' ', '') + extension)
            shutil.copyfile(file_path, audio_path)

        # Perform diarization on the (possibly copied) audio file.
        diarization = diarization_pipeline(audio_path)
    finally:
        # Best-effort cleanup of the temporary copy, also on failure.
        if scratch_dir is not None:
            shutil.rmtree(scratch_dir, ignore_errors=True)

    # Generate the output filename based on the input file's original name.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, f"{base_name}.rttm")

    # Write as UTF-8 so the file can be read back with the same encoding
    # regardless of the platform's default.
    with open(output_file, "w", encoding="utf-8") as rttm:
        diarization.write_rttm(rttm)

    # Report the caller's path, not the temporary copy.
    print(f"Processed {file_path}, output to {output_file}")

def process_folder(input_folder, output_folder):
    """Run ``process_audio`` on every ``.mp3`` file directly inside a folder.

    Args:
        input_folder: Directory whose entries are scanned (not recursive).
        output_folder: Passed through to ``process_audio`` for each file.
    """
    # Only entries whose name ends in ".mp3" (case-sensitive) are diarized.
    mp3_names = (
        entry for entry in os.listdir(input_folder) if entry.endswith(".mp3")
    )
    for mp3_name in mp3_names:
        process_audio(os.path.join(input_folder, mp3_name), output_folder)

# Diarize every MP3 found in ./Audio and store the RTTM files in ./Transcript.
process_folder(input_folder='./Audio', output_folder='./Transcript')


The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


Processed ./Audio\Developer’sGuidetoCustomizingMicrosoftCopilot-MicrososftBuild2024.mp3, output to ./Transcript\Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.rttm


In [1]:
import os
import json

def read_json(json_file):
    """Return the parsed content of the UTF-8 encoded JSON file ``json_file``."""
    with open(json_file, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def read_rttm(rttm_file):
    """Read speaker turns from a whitespace-separated RTTM file.

    Args:
        rttm_file: Path of the UTF-8 encoded RTTM file.

    Returns:
        A list with one dict per non-blank line, in file order, holding
        ``"turn_onset"`` (4th field, as float), ``"duration"`` (5th field,
        as float) and ``"speaker"`` (8th field, as string).
    """
    rttm_data = []
    with open(rttm_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank or whitespace-only lines (e.g. a trailing newline)
                # carry no turn and would otherwise fail on the field lookup.
                continue
            rttm_data.append({
                "turn_onset": float(parts[3]),
                "duration": float(parts[4]),
                "speaker": parts[7]
            })
    return rttm_data

def seconds_to_hms(seconds):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    Args:
        seconds: Number of seconds (int or float).

    Returns:
        The zero-padded timestamp string, with millisecond precision.
    """
    # Round to whole milliseconds *before* splitting into fields. Rounding
    # only while formatting the seconds field lets values such as 59.9996
    # come out as "00:00:60.000" instead of carrying into the minutes.
    total_ms = round(seconds * 1000)
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02}.{millis:03}"

def match_speaker(json_start, json_end, rttm_data):
    """Pick the speaker whose turn overlaps a time span the longest.

    Args:
        json_start: Start of the span, in seconds.
        json_end: End of the span, in seconds.
        rttm_data: Iterable of dicts with ``"turn_onset"``, ``"duration"``
            and ``"speaker"`` keys.

    Returns:
        The ``"speaker"`` value of the entry with the largest positive
        overlap (the earliest entry wins ties), or ``"Unknown"`` when no
        entry overlaps the span at all.
    """
    best_match = None
    # Start at 0 so that only a strictly positive overlap counts as a match.
    # Starting below zero would attribute spans that overlap nothing to the
    # first entry in rttm_data.
    max_overlap = 0

    for entry in rttm_data:
        rttm_start = entry["turn_onset"]
        rttm_end = rttm_start + entry["duration"]

        # Length of the intersection of the two intervals (0 if disjoint).
        overlap_duration = max(
            0, min(json_end, rttm_end) - max(json_start, rttm_start))

        if overlap_duration > max_overlap:
            max_overlap = overlap_duration
            best_match = entry["speaker"]

    return best_match if best_match else "Unknown"

def process_files(json_file, rttm_file, output_file):
    """Merge transcript chunks with diarization turns into a text transcript.

    Args:
        json_file: JSON file holding a list of chunks; each chunk is read via
            ``chunk['timestamp']`` (start, and optionally end, in seconds)
            and ``chunk['text']``.
        rttm_file: RTTM file with the speaker turns.
        output_file: Text file to write (UTF-8, overwritten if present).

    One line is written per chunk, in the form
    ``[speaker] : [(HH:MM:SS.mmm, HH:MM:SS.mmm)] : text``.
    Chunks without an end time are skipped, except for the last chunk, which
    gets an assumed end time.
    """
    json_data = read_json(json_file)
    rttm_data = read_rttm(rttm_file)
    last_index = len(json_data) - 1

    with open(output_file, 'w', encoding='utf-8') as text_file:
        previous_end = None  # End time of the last chunk that was written
        for i, chunk in enumerate(json_data):
            timestamp = chunk['timestamp']
            start = timestamp[0]
            # Reset on every chunk: a timestamp with no second element must
            # not reuse the previous chunk's end time (or be undefined on
            # the very first chunk).
            end = timestamp[1] if len(timestamp) > 1 else None

            if end is None:
                if i != last_index:
                    continue  # If not the last item and no end time, skip
                # Last item with no end time: assume a default duration of
                # 5 seconds after the previous end, or after this chunk's
                # start when nothing was written before it.
                end = (previous_end if previous_end is not None else start) + 5

            previous_end = end
            speaker = match_speaker(start, end, rttm_data)
            formatted_timestamp = f"({seconds_to_hms(start)}, {seconds_to_hms(end)})"
            text_file.write(f"[{speaker}] : [{formatted_timestamp}] : {chunk['text']}\n")

def process_folder(folder_path):
    """Build a ``.transcript.txt`` for every ``.json`` file in a folder.

    For each ``<name>.json`` directly inside ``folder_path`` the matching
    ``<name>.rttm`` is looked up in the same folder. When it exists, both
    files are handed to ``process_files``, which writes
    ``<name>.transcript.txt``; otherwise a warning is printed and the JSON
    file is skipped.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith(".json"):
            continue

        stem = os.path.splitext(entry)[0]
        json_file = os.path.join(folder_path, entry)
        rttm_file = os.path.join(folder_path, stem + '.rttm')
        output_file = os.path.join(folder_path, stem + '.transcript.txt')

        # Guard clause: nothing to merge without the diarization result.
        if not os.path.exists(rttm_file):
            print(f"Warning: No RTTM file found for {json_file}. Skipping.")
            continue

        process_files(json_file, rttm_file, output_file)
        print(f"Processed {json_file} and {rttm_file} into {output_file}")

# Merge each JSON transcript in ./Transcript with its RTTM diarization file.
process_folder(folder_path='./Transcript')


Processed ./Transcript\Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.json and ./Transcript\Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.rttm into ./Transcript\Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.transcript.txt


In [2]:
import re

def list_unique_speakers(file_path):
    """List the distinct ``SPEAKER_NN`` labels found in a transcript file.

    Args:
        file_path: Path of the UTF-8 encoded transcript text file.

    Returns:
        The labels that occur as ``[SPEAKER_<digits>]`` (two or more digits),
        without brackets, each listed once and ordered by speaker number.
    """
    # Read as UTF-8 explicitly: the transcript is written with that encoding,
    # and the platform default may differ.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Capture only the numeric part; two or more digits so that labels
    # beyond SPEAKER_99 are not dropped.
    speaker_numbers = set(re.findall(r"\[SPEAKER_(\d{2,})\]", content))

    # Numeric order (the text itself breaks ties to keep the result stable).
    ordered_numbers = sorted(speaker_numbers, key=lambda number: (int(number), number))
    return [f"SPEAKER_{number}" for number in ordered_numbers]


# Show which diarization labels occur in the merged transcript.
print(list_unique_speakers(file_path='./Transcript/Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.transcript.txt'))


['SPEAKER_00', 'SPEAKER_01']


In [5]:
def get_context_around_speaker(file_path, speaker, context=5):
    """Return the transcript lines around a speaker's first appearance.

    Args:
        file_path: Path of the UTF-8 encoded transcript text file.
        speaker: Text to look for; the first line containing it is the anchor.
        context: Number of lines to include before and after the anchor line
            (default 5).

    Returns:
        The anchor line with up to ``context`` lines on either side, as a
        list of strings with their line endings kept, or an empty list when
        no line contains ``speaker``.
    """
    # Read as UTF-8 explicitly: the transcript is written with that encoding,
    # and the platform default may differ.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for i, line in enumerate(lines):
        if speaker in line:
            start_index = max(0, i - context)  # Do not wrap around the start
            # Slicing past the end of the list is safe, so no upper clamp.
            return lines[start_index:i + context + 1]
    return []  # The speaker does not appear

# Print the transcript lines around the first appearance of every detected
# speaker. Iterating over the detected labels works for any number of
# speakers (including fewer than two) and reads the label list only once.
transcript_path = './Transcript/Developer’s Guide to Customizing Microsoft Copilot - Micrososft Build 2024.transcript.txt'
for speaker_label in list_unique_speakers(transcript_path):
    print(get_context_around_speaker(transcript_path, speaker_label))

["[SPEAKER_00] : [(00:00:00.000, 00:00:23.660)] :  All right, everybody. Welcome, welcome, welcome, welcome. First breakout of Build. I'm Barnum Bora, and I lead the developer advocacy team for Microsoft 365 and Copilot Platform.\n", '[SPEAKER_00] : [(00:00:23.660, 00:00:24.340)] :  advocacy team for Microsoft 365 and Copilot platform.\n', '[SPEAKER_00] : [(00:00:27.920, 00:00:28.120)] :  But without further ado, before I come back and do more things,\n', "[SPEAKER_00] : [(00:00:31.980, 00:00:36.160)] :  I'm going to hand off to my good friend Jeremy Thake, and he's going to walk you through the first half of this session. Thanks, Barno. I appreciate it. And thank you for coming to Build.\n", "[SPEAKER_01] : [(00:00:37.960, 00:00:40.280)] :  So I'm Jeremy Thake. I'm a\n", '[SPEAKER_01] : [(00:00:40.280, 00:00:44.100)] :  principal program manager in the Copilot developer experience team. And a slight\n']
["[SPEAKER_00] : [(00:00:00.000, 00:00:23.660)] :  All right, everybody. Welcome, 