In [None]:
%pip install -qU openai ffmpeg-python pyannote.audio python-dotenv

In [None]:
# Read credentials from the environment, after loading a local .env file
import os
from dotenv import load_dotenv

load_dotenv()

# OpenAI API key and Hugging Face token; each is None when the variable is unset
openai_key = os.environ.get("OPENAI_API_KEY")
HF_AUTH_TOKEN = os.environ.get("HF_AUTH_TOKEN")

from openai import OpenAI

# Shared OpenAI client used for the Whisper transcription requests
client = OpenAI(api_key=openai_key)

In [None]:
# Import libraries
import ffmpeg
from pyannote.audio import Pipeline

# Function to create a shortened copy of the file (first `duration` seconds, 120 by default)
def create_short_audio(input_path, output_path="short_version.wav", duration=120):
    """Write the first `duration` seconds of `input_path` to `output_path` with ffmpeg.

    An existing file at `output_path` is overwritten. Any exception raised while
    building or running the ffmpeg command is caught and reported with print()
    instead of being propagated, so the function always returns None.
    """
    try:
        # `t=duration` limits how much of the input ffmpeg reads
        clip = ffmpeg.input(input_path, t=duration)
        clip = clip.output(output_path)
        clip.run(overwrite_output=True)
        print(f"Short version created at {output_path}")
    except Exception as err:
        print(f"Error in creating short version: {err}")

# Function for speaker diarization and transcription
def transcribe_with_speaker_labels(input_path):
    """Diarize `input_path` and transcribe every speaker turn with Whisper.

    The pyannote pipeline yields speaker turns; each turn is cut into its own
    temporary WAV file with ffmpeg, sent to the `whisper-1` transcription
    endpoint, and the temporary file is deleted again.

    Args:
        input_path: Path of the audio file to process.

    Returns:
        A string with one ``**<speaker>**: <text>`` entry per turn, each
        followed by a blank line. A turn whose transcription request fails
        contributes an empty text (the error is printed, not raised).

    Raises:
        RuntimeError: If the diarization pipeline could not be loaded.
        Exception: Errors from ffmpeg segment extraction propagate unchanged.
    """
    # Step 1: Speaker Diarization
    # NOTE(review): newer pyannote.audio releases may expect `token=` instead of
    # `use_auth_token=` — confirm against the installed version.
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HF_AUTH_TOKEN)
    if pipeline is None:
        # Fail with a clear message instead of "'NoneType' object is not callable"
        raise RuntimeError(
            "Could not load the 'pyannote/speaker-diarization' pipeline; "
            "check HF_AUTH_TOKEN and that access to the model has been granted."
        )
    diarization = pipeline({"audio": input_path})

    # Step 2: Process each speaker's segment for transcription
    entries = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        segment_start = turn.start
        segment_end = turn.end
        segment_output = f"segment_{segment_start:.2f}_{segment_end:.2f}.wav"

        try:
            # Extract the segment audio
            (
                ffmpeg
                .input(input_path, ss=segment_start, to=segment_end)
                .output(segment_output)
                .run(overwrite_output=True)
            )

            # Transcribe segment with the Whisper endpoint
            with open(segment_output, "rb") as segment_file:
                try:
                    segment_transcript = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=segment_file,
                        response_format="text"
                    )
                except Exception as e:
                    # Best effort: keep going with an empty text for this turn
                    print("Error in transcription:", e)
                    segment_transcript = ""
        finally:
            # Always delete the segment file, even when extraction failed part-way
            if os.path.exists(segment_output):
                os.remove(segment_output)

        # Append speaker labels
        entries.append(f"**{speaker}**: {segment_transcript}\n\n")

    return "".join(entries)

In [None]:
# Main execution
# Paths of the full recording and of the shortened copy that is derived from it.
# Point input_audio_path at your own audio file to process a different recording.
input_audio_path = "../data/scaling_monosemanticity.wav"
short_audio_path = "../data/short_version.wav"


In [None]:

# Step 1: Create a short version (no duration is passed, so create_short_audio's default of 120 seconds applies)
create_short_audio(input_audio_path, output_path=short_audio_path)


In [None]:
# Step 2: Transcribe the short version with speaker labels
transcript = transcribe_with_speaker_labels(short_audio_path)
# Uncomment to preview the result in the notebook:
# print("Transcript with Speaker Labels:\n", transcript)


In [None]:
# Write the transcript to a file. UTF-8 is requested explicitly so non-ASCII
# characters in the transcript do not depend on the platform's default encoding.
with open("../data/transcript.txt", "w", encoding="utf-8") as f:
    f.write(transcript)

# WARNING: the next cell transcribes the full file — only run it if you understand that it will cost money ($).

In [None]:
# Transcribe the full file with speaker labels
transcript = transcribe_with_speaker_labels(input_audio_path)
# Write the transcript to a file. UTF-8 is requested explicitly so non-ASCII
# characters in the transcript do not depend on the platform's default encoding.
with open("../data/transcript_full.txt", "w", encoding="utf-8") as f:
    f.write(transcript)