In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# API client authenticated with the key stored under OPENAI_API_KEY_NON_AZURE.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_NON_AZURE"))


In [None]:
import ffmpeg
import math

# Source recording that will be split into chunks and transcribed.
input_audio = 'audiofile.m4a'

# Length of each chunk in seconds (5 minutes per chunk).
# Shorter chunks (e.g., 5–10 minutes) help the transcription model clearly
# recognize context; long chunks can lose context and cause the model to
# misinterpret segments.
# OpenAI charges for audio transcription by the length of audio processed, not
# per number of API calls, so splitting the same audio into shorter segments
# does not affect total cost: the total length of audio remains constant.
# Shorter chunks therefore improve accuracy without increasing the total
# transcription cost.
chunk_duration_sec = 300

# Directory that receives the per-chunk audio files. It is created exactly
# once, from `output_dir`, so changing the name above cannot leave a stray
# hard-coded 'chunks' directory behind. exist_ok=True keeps re-runs harmless.
output_dir = 'chunks'
os.makedirs(output_dir, exist_ok=True)

def get_audio_duration(filename):
    """Return the total duration of ``filename`` in seconds.

    Runs ``ffmpeg.probe`` on the file and converts the ``duration`` entry of
    the probe result's ``format`` section to a float.
    """
    format_info = ffmpeg.probe(filename)['format']
    return float(format_info['duration'])

# Work out how many fixed-length chunks are needed to cover the recording.
audio_length = get_audio_duration(input_audio)
num_chunks = math.ceil(audio_length / chunk_duration_sec)

# Text returned for each chunk, collected in playback order.
transcriptions = []

# Language code sent with every transcription request ('tr' for Turkish;
# use 'en' for English, 'fr' for French, etc.).
# Language detection usually works well, but specifying it manually is better
# for increased accuracy.
target_language = 'tr'

for i in range(num_chunks):
    start_time = i * chunk_duration_sec
    chunk_filename = os.path.join(output_dir, f'chunk_{i+1}.mp3')

    # Cut this chunk out of the source, remove silence at its start, and
    # write it to the chunk file (overwriting any file from a previous run).
    source = ffmpeg.input(input_audio, ss=start_time, t=chunk_duration_sec)
    trimmed = source.filter(
        'silenceremove',
        start_periods=1,
        start_duration=0.5,
        start_threshold='-50dB',
    )
    encoder = trimmed.output(chunk_filename, audio_bitrate='128k')
    encoder.run(overwrite_output=True)

    print(f'Transcribing chunk {i+1}/{num_chunks}...')

    with open(chunk_filename, 'rb') as audio_file:
        transcription = client.audio.transcriptions.create(
            model='gpt-4o-transcribe',
            file=audio_file,
            language=target_language,  # explicitly specify language
        )

    transcriptions.append(transcription.text)

# Save the combined transcription: chunk texts separated by a blank line.
output_transcription_file = 'full_transcription.txt'
combined_text = '\n\n'.join(transcriptions)
with open(output_transcription_file, 'w', encoding='utf-8') as f:
    f.write(combined_text)

print(f'Full transcription saved to {output_transcription_file}')