# Audio Transcriptions 

The goal of this notebook is to transcribe the podcast's audio files to text using OpenAI's Whisper speech-recognition model, run locally through Hugging Face Transformers.

In [2]:
import os
from dotenv import load_dotenv

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import numpy as np

# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

# Warn when the key is missing instead of assigning os.getenv()'s result back
# into os.environ: in this branch that value is None, and os.environ only
# accepts str values, so the assignment raised TypeError.
if not os.environ.get("OPENAI_API_KEY"):
  print("Warning: OPENAI_API_KEY is not set in the environment or .env file.")


In [13]:
class WhisperTranscriber:
  """Speech-to-text helper built on a Hugging Face Whisper checkpoint.

  The processor and model are loaded once in the constructor; `run` then
  transcribes one audio file by decoding it in consecutive fixed-length
  windows and joining the per-window texts.
  """

  SAMPLE_RATE = 16000  # Hz; rate the audio is resampled to on load and declared to the processor
  CHUNK_SECONDS = 30   # duration of each window handed to the model
  MAX_LENGTH = 448     # max_length passed to generate() for each window

  def __init__(self, model_name="openai/whisper-large-v3"):
    """Load the processor and model for `model_name` and select a device.

    Args:
      model_name: identifier passed to `from_pretrained` for both the
        processor and the model.
    """
    self.processor = WhisperProcessor.from_pretrained(model_name)
    self.model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Use the GPU when torch reports one; otherwise stay on the CPU.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model.to(self.device)

  @staticmethod
  def _split_into_chunks(audio, chunk_length):
    """Slice `audio` into consecutive pieces of at most `chunk_length` items."""
    return [audio[start:start + chunk_length] for start in range(0, len(audio), chunk_length)]

  def _transcribe_chunk(self, chunk, forced_decoder_ids):
    """Run the model on one audio window and return its stripped text."""
    inputs = self.processor(
      chunk,
      sampling_rate=self.SAMPLE_RATE,
      return_tensors="pt",
      return_attention_mask=True
    )
    audio_input = inputs.input_features.to(self.device)
    attention_mask = inputs.attention_mask.to(self.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
      predicted_ids = self.model.generate(
        audio_input,
        attention_mask=attention_mask,
        forced_decoder_ids=forced_decoder_ids,
        max_length=self.MAX_LENGTH,
        num_beams=1,
        temperature=0.0
      )

    # A single window is passed in, so the first decoded entry is its text.
    decoded = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0].strip()

  def run(self, file_path, language="es", raise_on_error=False):
    """Transcribe the audio file at `file_path`.

    Args:
      file_path: path of the audio file, loaded with `librosa.load`.
      language: language code used to build the decoder prompt.
      raise_on_error: when False (the default), a failure is reported by
        returning a string that starts with "Error during transcription: ".
        When True, the original exception is re-raised instead.

    Returns:
      The per-window transcriptions joined with single spaces, or the error
      string described above.

    Raises:
      Exception: whatever the load/decode steps raised, only when
        `raise_on_error` is True.
    """
    try:
      audio, _ = librosa.load(file_path, sr=self.SAMPLE_RATE)
      chunks = self._split_into_chunks(audio, self.CHUNK_SECONDS * self.SAMPLE_RATE)

      # The decoder prompt depends only on language and task, so it is built
      # once rather than once per window.
      forced_decoder_ids = self.processor.get_decoder_prompt_ids(
        language=language,
        task="transcribe"
      )

      transcriptions = []
      for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}")
        transcriptions.append(self._transcribe_chunk(chunk, forced_decoder_ids))

      # Skip windows that produced no text so the result has no doubled spaces.
      return " ".join(text for text in transcriptions if text)

    except Exception as e:
      if raise_on_error:
        raise
      return f"Error during transcription: {str(e)}"

In [15]:

transcription_dir = "../transcription"
os.makedirs(transcription_dir, exist_ok=True)

mp3_file = "../audios/2015_08_02_20.mp3"
nombre_archivo = os.path.basename(mp3_file)
# Swap only the real extension; str.replace would rewrite every ".mp3"
# occurrence inside the file name.
file_transcription_name = os.path.splitext(nombre_archivo)[0] + '.txt'

# Validate the input before loading the model, and raise instead of calling
# exit(): in a notebook exit() can shut the kernel down rather than just
# stopping this cell.
if not os.path.exists(mp3_file):
    raise FileNotFoundError(f"❌ Archivo no encontrado: {mp3_file}")

transcriptor = WhisperTranscriber()

print("🎵 Iniciando transcripción...")
print(f"📁 Archivo: {mp3_file}")

result = transcriptor.run(mp3_file)

# run() reports failures by returning a string with this prefix; do not save
# that message to disk as if it were a transcription.
if result.startswith("Error during transcription:"):
    raise RuntimeError(result)

# Build the output path for the transcription file
transcripcion_file = os.path.join(transcription_dir, file_transcription_name)

# Save the transcription to the file
with open(transcripcion_file, 'w', encoding='utf-8') as f:
    f.write(result)

print(f"✅ Transcripción guardada en: {transcripcion_file}")
print("📝 Contenido de la transcripción:")
print("-" * 50)
print(result)
print("-" * 50)


🎵 Iniciando transcripción...
📁 Archivo: ../audios/2015_08_02_20.mp3
Processing chunk 1/120
Processing chunk 2/120
Processing chunk 3/120
Processing chunk 4/120
Processing chunk 5/120
Processing chunk 6/120
Processing chunk 7/120
Processing chunk 8/120
Processing chunk 9/120
Processing chunk 10/120
Processing chunk 11/120
Processing chunk 12/120


KeyboardInterrupt: 