In [23]:
from openai import OpenAI
# Shared OpenAI client; the batch transcription function in the next cell uses it
# through the module-level name `client`.
# NOTE(review): constructed with no arguments, so credentials are presumably read
# from the environment (e.g. OPENAI_API_KEY) — confirm before running on a new machine.
client = OpenAI()

# Disabled single-file check, kept for reference: it transcribes one hard-coded MP3
# with "gpt-4o-mini-transcribe" (the batch function uses "gpt-4o-transcribe") and
# ends on `transcription.text` so the cell would display the resulting text.
# file_path = "audio_raw/f8f3b092-3621-4e10-a800-c232e1d5f49e_3fab06a5-c6d5-47eb-a174-173817ad0abd.mp3"
# audio_file = open(file_path, "rb")
# transcription = client.audio.transcriptions.create(
#   file=audio_file,
#   model="gpt-4o-mini-transcribe",
#   language="es",
#   prompt="Actua como un radiologo que traduce audio de radiologia."
# )
# transcription.text


In [24]:
import os
from tqdm import tqdm

def transcribe_all_audios(input_folder="audio_raw", output_folder="transcriptions", language="es", skip_existing=False):
    """
    Transcribe every MP3 file in input_folder and save one text file per audio.

    Each "<name>.mp3" produces "<output_folder>/<name>.txt". When a file fails,
    the error is printed, written to
    "<output_folder>/transcription_errors/<name>.txt", and processing continues
    with the next file. Uses the module-level ``client`` and ``tqdm`` names.

    Args:
        input_folder: Folder containing MP3 files. The ".mp3" extension is
            matched case-insensitively.
        output_folder: Folder to save transcription text files (created if
            missing).
        language: Language code passed to the transcription request.
        skip_existing: When True, audios whose transcription text file already
            exists are skipped, so an interrupted run can be resumed without
            re-sending finished files. Defaults to False, which re-transcribes
            and overwrites every file.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes runs reproducible.
    audio_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(".mp3")
    )

    for filename in tqdm(audio_files, desc="Transcribing audios"):
        # splitext swaps only the final extension; str.replace would also rewrite
        # any ".mp3" that appears earlier in the file name.
        text_name = os.path.splitext(filename)[0] + ".txt"
        output_file = os.path.join(output_folder, text_name)

        if skip_existing and os.path.exists(output_file):
            continue

        try:
            file_path = os.path.join(input_folder, filename)
            with open(file_path, "rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    file=audio_file,
                    model="gpt-4o-transcribe",
                    language=language,
                    prompt="Actua como un radiologo que traduce audio de radiologia."
                )

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(transcription.text)

        except Exception as e:
            # Best effort: one bad file must not abort the batch. KeyboardInterrupt
            # is not an Exception subclass, so a manual interrupt still stops the run.
            error_message = f"Error processing {filename}: {e}"
            print(error_message)

            error_folder = os.path.join(output_folder, "transcription_errors")
            os.makedirs(error_folder, exist_ok=True)

            error_file = os.path.join(error_folder, text_name)
            with open(error_file, 'w', encoding='utf-8') as f:
                f.write(f"TRANSCRIPTION ERROR: {error_message}")


In [25]:
# Run the batch transcription with the default folders and language spelled out.
# Long-running: the saved output of this cell reports 4284 files at roughly
# 2.6 s each (about three hours end to end).
transcribe_all_audios(
    input_folder="audio_raw",
    output_folder="transcriptions",
    language="es",
)

Transcribing audios:   2%|▏         | 76/4284 [03:15<3:00:43,  2.58s/it]


KeyboardInterrupt: 