In [3]:
from pydub import AudioSegment
import os
import pvfalcon
import json
import whisper
from pyannote.audio import Pipeline

from dotenv import load_dotenv
load_dotenv()

# Runtime settings, all read from the process environment.
# Every one of them is None when its variable is not set.

# Audio file to transcribe and diarize.
AUDIO_PATH = os.environ.get("AUDIO_PATH")
# Directory receiving the JSON files that pair diarization with transcription.
JSONS_PATH = os.environ.get("JSONS_PATH")
# Access key handed to pvfalcon.create().
PICO_TOKEN = os.environ.get("PICO_TOKEN")
# Auth token handed to pyannote's Pipeline.from_pretrained().
HF_TOKEN = os.environ.get("HF_TOKEN")
# Value handed to whisper.load_model().
CONFIG = os.environ.get("CONFIG")

def preprocess_audio(filepath):
    '''
    Convert an audio file to 16 kHz mono WAV, the format fed to Picovoice Falcon.

    The converted copy is written to a ``tmp`` directory under the current
    working directory as ``resampled_<source name>.wav``. The directory is
    created when it does not exist, and a copy left over from an earlier run
    is replaced.

    Args:
        filepath (str): Path of the source audio file; it is read with
            ``AudioSegment.from_file``.

    Returns:
        str: Path of the converted WAV file. Nothing here deletes it, so the
        caller should remove it when done.
    '''
    tmp_dir = os.path.join(os.getcwd(), "tmp")
    # Exporting into a missing directory fails, so create it on first use.
    os.makedirs(tmp_dir, exist_ok=True)

    # The export is always WAV, so name the file .wav whatever the source
    # extension was (an .mp3 input used to yield WAV data named *.mp3).
    source_stem = os.path.splitext(os.path.basename(filepath))[0]
    audio_path = os.path.join(tmp_dir, "resampled_" + source_stem + ".wav")
    if os.path.isfile(audio_path):
        os.remove(audio_path)

    # Resample to 16 kHz, downmix to mono, and write the temporary WAV file.
    audio = AudioSegment.from_file(filepath)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export(audio_path, format="wav")
    return audio_path
    
def segment_score(transcript_segment, speaker_segment):
    """
    Score how much of a transcription segment falls inside a speaker segment.

    Args:
        transcript_segment (dict): Has 'start' and 'end' keys giving the start and end times of the transcription segment.
        speaker_segment (dict): Has 'start' and 'end' keys giving the start and end times of the speaker segment.

    Returns:
        float: Overlap duration divided by the transcript segment duration.
        1.0 means the transcript segment lies entirely inside the speaker
        segment. The value is negative when the two segments do not overlap
        (its magnitude is the gap relative to the transcript duration).
        0.0 is returned for a transcript segment whose duration is zero or
        negative, which cannot overlap anything.

    Provenance: this method was taken from another repository that the original
    author could no longer trace.
    """
    t_start, t_end = transcript_segment["start"], transcript_segment["end"]
    s_start, s_end = speaker_segment["start"], speaker_segment["end"]

    transcript_duration = t_end - t_start
    # A zero-length transcript segment would otherwise raise
    # ZeroDivisionError and abort the whole matching loop.
    if transcript_duration <= 0:
        return 0.0

    # Intersection of the two intervals; negative when they are disjoint.
    overlap = min(t_end, s_end) - max(t_start, s_start)

    return overlap / transcript_duration

def add_text_to_segments(speaker_segments_json, whisper_result):
    """
    Attach each Whisper transcript segment's text to the speaker segment it overlaps most.

    Every transcript segment is scored against every speaker segment with
    ``segment_score``; its text goes to the highest-scoring speaker segment.
    Transcript segments that overlap no speaker segment (no score above 0)
    are left out. Texts landing on the same speaker segment are joined with
    a single space, in transcript order.

    Args:
        speaker_segments_json (dict): Has a 'segments' list of dicts with 'start' and 'end' keys. It is not modified.
        whisper_result (dict): Has a 'segments' list of dicts with 'start', 'end' and 'text' keys.

    Returns:
        dict: ``{'segments': [...]}`` holding copies of the speaker segments, with a 'text' key on those that received text.
    """
    # Work on copies: mutating the caller's dicts made a second call on the
    # same in-memory input append every text a second time.
    speaker_segments = [dict(segment) for segment in speaker_segments_json['segments']]

    for t_segment in whisper_result['segments']:
        # Whisper segment text usually starts with a space; strip it so the
        # joined text has no leading or doubled spaces.
        text = t_segment['text'].strip()
        if not text:
            continue

        # Find the speaker segment with the largest positive overlap score.
        max_score = 0
        best_s_segment = None
        for s_segment in speaker_segments:
            score = segment_score(t_segment, s_segment)
            if score > max_score:
                max_score = score
                best_s_segment = s_segment

        if best_s_segment is None:
            continue

        if best_s_segment.get('text'):
            best_s_segment['text'] += ' ' + text
        else:
            best_s_segment['text'] = text

    return {'segments': speaker_segments}

def json_add_text(json_path, whisper_transcription):
    """
    Merge a Whisper transcription into a speaker-segment JSON file, in place.

    The file is read, passed through ``add_text_to_segments`` together with
    the transcription, and then overwritten with the result.

    Args:
        json_path (str): Path to the JSON file containing speaker segments. The file is overwritten.
        whisper_transcription (dict): Whisper transcription result, forwarded unchanged to ``add_text_to_segments``.

    Returns:
        None
    """
    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale encoding.
    with open(json_path, "r", encoding="utf-8") as file:
        speaker_segments_json = json.load(file)

    # Add transcribed text to the speaker segments using the provided Whisper transcription
    updated_segments_json = add_text_to_segments(speaker_segments_json, whisper_transcription)

    # Serialise before reopening the file: opening with "w" truncates it, so a
    # serialisation error after that point would leave an empty file behind.
    serialised = json.dumps(updated_segments_json, indent=4)
    with open(json_path, "w", encoding="utf-8") as file:
        file.write(serialised)

    # Inform the user that the JSON file was updated
    print("Updated JSON file created successfully.")


## Transcribe using whisper

In [4]:
# Load the Whisper model for transcription.
# CONFIG is read from the environment in the configuration at the top of the
# notebook and is passed unchanged as load_model's first argument; it is None
# when the CONFIG variable is unset.
whisper_model = whisper.load_model(CONFIG) 

In [5]:
# Transcribe audio clip using whisper.
#
# temperature is a fallback schedule rather than a single value. Whisper's
# transcribe() drops beam_size whenever the current temperature is above 0 and
# drops best_of when it is 0, so the previous single temperature of 0.2 meant
# beam_size=10 was never used and every run was sampled (non-reproducible).
# With (0.0, 0.2) the first pass is a deterministic beam search with
# beam_size=10; only when that pass fails Whisper's quality thresholds does it
# retry at temperature 0.2, sampling best_of=2 candidates.
whisper_transcription = whisper_model.transcribe(
    AUDIO_PATH,
    temperature=(0.0, 0.2),
    beam_size=10,
    best_of=2,
    no_speech_threshold=0.3,
    initial_prompt='A training sales call. Michael, the boss, oversees Dwights practice call to his colleague Jim who uses the pseudonym “William (Bill) M. Buttlicker”. ',
)

## Diarize using falcon

In [6]:
# Create the Falcon diarization engine, authenticating with the Picovoice
# access key read from the PICO_TOKEN environment variable.
# NOTE(review): Picovoice engines appear to expose delete() for releasing
# native resources, and nothing in this notebook calls it — confirm whether it
# should be called once diarization is finished.
falcon_model = pvfalcon.create(access_key=PICO_TOKEN)

In [7]:
# Preprocess the audio file to ensure correct sampling rate and format
tmp_audio_path = preprocess_audio(AUDIO_PATH)

try:
    # Perform diarization on the preprocessed audio file
    falcon_diarization = falcon_model.process_file(tmp_audio_path)
finally:
    # Remove the temporary audio file even when diarization raises, so a
    # failed run does not leave the resampled copy behind
    os.remove(tmp_audio_path)

# Extract diarization segments and format them into a dictionary.
# The speaker tag is zero-padded to two characters, e.g. "SPEAKER_01".
falcon_segments = [
    {
        "start": segment.start_sec,
        "end": segment.end_sec,
        "speaker": f"SPEAKER_{str(segment.speaker_tag).zfill(2)}"
    }
    for segment in falcon_diarization
]


# THE FOLLOWING CODE IS IDENTICAL FOR PYANNOTE AND FALCON
# Create a JSON object with the extracted segments
falcon_json = {"segments": falcon_segments}

# Define the path to save the JSON file
falcon_json_path = os.path.join(JSONS_PATH, "falcon_diarization.json")

# Save the JSON object to a file
with open(falcon_json_path, "w") as json_file:
    json.dump(falcon_json, json_file, indent=4)

# Sync the transcription and the diarization. Updates the json
json_add_text(falcon_json_path, whisper_transcription)

print(f"Falcon diarization JSON saved to {falcon_json_path}")

Updated JSON file created successfully.
Falcon diarization JSON saved to /home/filbern/P3/JSONS/falcon_diarization.json


## Diarize using pyannote

In [8]:
# Load the pretrained "pyannote/speaker-diarization-3.1" diarization pipeline,
# authenticating with the token read from the HF_TOKEN environment variable.
# NOTE(review): the use_auth_token keyword may have been renamed in newer
# pyannote.audio releases — confirm against the installed version.
pyannote_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN)

torchvision is not available - cannot save figures


In [7]:
# Run the pyannote pipeline on the audio file, stating how many speakers to expect
num_speakers = 3
pyannote_diarization = pyannote_pipeline(AUDIO_PATH, num_speakers=num_speakers)

# One record per speaker turn: where it starts, where it ends, and who speaks.
# Kept as a comprehension so no loop variables leak into the notebook namespace.
pyannote_segments = [
    dict(start=speech_turn.start, end=speech_turn.end, speaker=speaker_label)
    for speech_turn, _track, speaker_label in pyannote_diarization.itertracks(yield_label=True)
]


# THE FOLLOWING CODE IS IDENTICAL FOR PYANNOTE AND FALCON
# Wrap the records in the JSON layout and work out where the file goes
pyannote_json = {"segments": pyannote_segments}
pyannote_json_path = os.path.join(JSONS_PATH, "pyannote_diarization.json")

# Write the diarization-only JSON to disk
with open(pyannote_json_path, "w") as pyannote_file:
    pyannote_file.write(json.dumps(pyannote_json, indent=4))

# Merge the Whisper transcription into the file just written (rewrites it)
json_add_text(pyannote_json_path, whisper_transcription)

print(f"Pyannote diarization JSON saved to {pyannote_json_path}")

Updated JSON file created successfully.
Pyannote diarization JSON saved to /home/filbern/P3/JSONS/falcon_diarization.json
