In [1]:
import base64
import json
import os
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()

True

In [2]:
# Read the API key from the OPENAI_API_KEY_VOICE environment variable
# (load_dotenv() in the first cell loads a local .env file beforehand).
# os.getenv returns None when the variable is unset.
API_KEY = os.getenv("OPENAI_API_KEY_VOICE")
# Shared OpenAI client used by every transcription helper in this notebook.
client = OpenAI(api_key=API_KEY)
# Input recording for the plain (non-diarized) transcription run.
audio_file_path = "data/caseoh burger king.mp3" 

In [3]:
def transcribe_audio(audio_path, model="gpt-4o-transcribe"):
    """
    Transcribe an audio file with an OpenAI speech-to-text model.

    Failures are reported on stdout and signalled with a ``None`` return
    instead of being raised, so callers must check the result before using it.

    Args:
        audio_path (str): Path to the audio file
        model (str): Model to use for transcription (default: gpt-4o-transcribe)

    Returns:
        The response object produced by ``client.audio.transcriptions.create``
        (not a plain dict; this notebook reads it through ``.text`` and
        ``.to_dict()``), or ``None`` if the file is missing or the request
        fails.
    """
    try:
        # isfile (rather than exists) so that a directory path is rejected
        # with the same clear message instead of an opaque open() error.
        if not os.path.isfile(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # The handle must stay open while the SDK uploads the audio.
        with open(audio_path, "rb") as audio_file:
            return client.audio.transcriptions.create(
                model=model,
                file=audio_file,
            )

    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook
        # continue; downstream cells guard on a falsy result.
        print(f"Error during transcription: {e}")
        return None


In [4]:
# Transcribe the audio file
result = transcribe_audio(audio_file_path)
# transcribe_audio returns None on failure; guard the display so this cell
# does not raise AttributeError after the error has already been printed.
result.to_dict() if result else None

{'text': "I mean I drive through a Burger King. Can I please get a waffle junior with onion rings? Make it a meal so I can get a drink. No I'm not finished that's not everything. Can I please get a double waffle but no cheese? Can I please get a number two with a large drink? I got money so I don't care how much it cost me. So you throw in some extra fries don't make em salty. All this cheese gonna make my booty drip drip. I'm lactose intolerant I don't sip hook. If I see a side of cheese I'm a trip trip. I'mma sit on your toilet seat and doodle the nip. So you got my lil waffle junior I didn't even forget that. And you got my double waffle I didn't even forget that. What about my onion ring? I put all you to sit back. Burger King ain't know me now cheese I don't want that. It don't get in hungry now I know you heard that. Waiting for my onion ring so I don't have to turn back. Burger King don't play with me yeah this is so trash. Yeah I get it in revenicles but yeah you deserve that. 

In [5]:
# Path.stem strips only the final extension, so a name with extra dots
# (e.g. "interview.part1.mp3") is not cut off at the first dot.
source_input_filename = Path(audio_file_path).stem
if result:
    # Create the output folder if needed instead of failing inside open().
    Path("output").mkdir(parents=True, exist_ok=True)
    output_file = f"output/{source_input_filename}_transcription.txt"
    # Explicit UTF-8 so non-ASCII characters in the transcript are written
    # the same way regardless of the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(result.text)
    print(f"Transcription saved to: {output_file}")


Transcription saved to: output/caseoh burger king_transcription.txt


In [13]:
def to_data_url(path: str, mime_type: str = "audio/mp3") -> str:
    """
    Encode a file as a base64 ``data:`` URL.

    Args:
        path (str): Path to the file to embed.
        mime_type (str): Media type placed in the URL header. Defaults to
            "audio/mp3"; pass another value (e.g. "audio/wav") for files
            in a different format.

    Returns:
        str: A string of the form ``data:<mime_type>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as fh:
        # Base64 output is pure ASCII, so decoding cannot fail.
        encoded = base64.b64encode(fh.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"


def transcribe_audio_diarize(
    audio_path,
    model="gpt-4o-transcribe-diarize",
    known_speaker_names=("Vicky Hollub",),
    known_speaker_references=("data/vicky_reference.mp3",),
):
    """
    Transcribe an audio file with speaker diarization.

    The request asks for the "diarized_json" response format with automatic
    chunking. When known speakers are supplied, their names and reference
    clips (encoded as data URLs) are sent along in the request body.

    Failures are reported on stdout and signalled with a ``None`` return
    instead of being raised, so callers must check the result before using it.

    Args:
        audio_path (str): Path to the audio file
        model (str): Model to use for transcription
            (default: gpt-4o-transcribe-diarize)
        known_speaker_names (sequence of str): Names of speakers to pass as
            known speakers. Pass an empty sequence to send none.
        known_speaker_references (sequence of str): Paths to reference audio
            clips, one per entry in ``known_speaker_names`` and in the same
            order.

    Returns:
        The response object produced by ``client.audio.transcriptions.create``
        (this notebook reads it through ``.to_dict()``), or ``None`` if a
        file is missing or the request fails.
    """
    try:
        # isfile (rather than exists) so that a directory path is rejected
        # with the same clear message instead of an opaque open() error.
        if not os.path.isfile(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        request_kwargs = {
            "model": model,
            "response_format": "diarized_json",
            "chunking_strategy": "auto",
        }
        if known_speaker_names:
            # These fields are passed through extra_body, exactly as the
            # original call did, rather than as named SDK arguments.
            request_kwargs["extra_body"] = {
                "known_speaker_names": list(known_speaker_names),
                "known_speaker_references": [
                    to_data_url(reference_path)
                    for reference_path in known_speaker_references
                ],
            }

        # The handle must stay open while the SDK uploads the audio.
        with open(audio_path, "rb") as audio_file:
            return client.audio.transcriptions.create(
                file=audio_file,
                **request_kwargs,
            )

    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook
        # continue; downstream cells guard on a falsy result.
        print(f"Error during transcription: {e}")
        return None

In [14]:
# Input recording for the diarized transcription run.
audio_file_path_diarize = "data/cnbc oxychem sales.mp3"

In [15]:
# Transcribe the audio file
result = transcribe_audio_diarize(audio_file_path_diarize)
result.to_dict()

{'segments': [{'id': 'seg_0',
   'end': 0.19999999999999996,
   'speaker': 'A',
   'start': 0.0,
   'text': ' Well,',
   'type': 'transcript.text.segment'},
  {'id': 'seg_1',
   'end': 0.8499999999999999,
   'speaker': 'A',
   'start': 0.25,
   'text': " it's official.",
   'type': 'transcript.text.segment'},
  {'id': 'seg_2',
   'end': 3.95,
   'speaker': 'A',
   'start': 0.95,
   'text': " Berkshire Hathaway is acquiring Occidental's chemical business,",
   'type': 'transcript.text.segment'},
  {'id': 'seg_3',
   'end': 4.95,
   'speaker': 'A',
   'start': 4.2,
   'text': ' OxyChem.',
   'type': 'transcript.text.segment'},
  {'id': 'seg_4',
   'end': 9.150000000000002,
   'speaker': 'A',
   'start': 5.3500000000000005,
   'text': " It's an all-cash transaction. It's valued at $9.7 billion.",
   'type': 'transcript.text.segment'},
  {'id': 'seg_5',
   'end': 12.400000000000002,
   'speaker': 'A',
   'start': 9.450000000000003,
   'text': " And this is Berkshire's biggest purchase sinc

In [16]:
# Path.stem strips only the final extension, so a name with extra dots
# is not cut off at the first dot.
source_input_filename_diarize = Path(audio_file_path_diarize).stem

# result is None when the transcription failed; skip the save in that case.
output_dict = result.to_dict() if result else None
if output_dict:
    # Create the output folder if needed instead of failing inside open().
    Path("output").mkdir(parents=True, exist_ok=True)
    output_json_file = f"output/{source_input_filename_diarize}_transcription_with_vicky_ref.json"
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(output_dict, f, indent=2)
    print(f"Full diarized transcription (dict) saved to: {output_json_file}")


# if result:
#     output_file = f"output/{source_input_filename_diarize}_transcription.txt"
#     with open(output_file, "w") as f:
#         f.write(result.text)
#     print(f"Transcription saved to: {output_file}")

Full diarized transcription (dict) saved to: output/cnbc oxychem sales_transcription_with_vicky_ref.json
