In [None]:
# Celda 1: Instalación de dependencias
# %%capture
#!pip install -U -q google-genai gradio transformers torch torchvision accelerate -q


In [None]:
# Celda 2: Configuración de API Key
from google.colab import userdata
import os

In [None]:
# Read the Google API key from Colab Secrets (it must be configured there).
# userdata.get() raises instead of returning a falsy value when the secret is
# missing or when this notebook has not been granted access to it, so both
# cases are mapped to None here; otherwise the setup instructions below would
# never be shown for the most common failure.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    GOOGLE_API_KEY = None

if not GOOGLE_API_KEY:
    print("❌ ERROR: Necesitas configurar tu GOOGLE_API_KEY en los secretos de Colab")
    print("📋 Ve a: Configuración (🔧) > Secretos > Agregar nuevo secreto")
    print("🔑 Nombre: GOOGLE_API_KEY")
    print("💡 Valor: Tu API key de Google AI Studio")
else:
    print("✅ API Key de Google configurada correctamente")


✅ API Key de Google configurada correctamente


In [None]:
# Celda 3: Imports y configuración
import torch
from transformers import pipeline
import gradio as gr
from google import genai
from google.genai import types
import logging

# Configure logging: INFO level on the root logger
logging.basicConfig(level=logging.INFO)

# Create the Gemini API client, authenticated with the key read from Colab Secrets
cliente = genai.Client(api_key=GOOGLE_API_KEY)

In [None]:
# Model selection for speech recognition (ASR_MODEL) and for the LLM (GEMINI_MODEL).
# The trailing `# @param` annotations are Colab form-field markup, not ordinary
# comments, so they are kept verbatim.
ASR_MODEL = "openai/whisper-large-v3" # @param ["openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium", "openai/whisper-large-v3"] {allow-input: true}
GEMINI_MODEL = "gemini-2.0-flash" # @param ["gemini-2.0-flash-lite","gemini-2.0-flash","gemini-2.5-flash-preview-05-20","gemini-2.5-pro-preview-05-06"] {"allow-input":true, isTemplate: true}

# Echo the selected models so the choice is visible in the cell output
print(f"🤖 Modelo ASR: {ASR_MODEL}")
print(f"🧠 Modelo LLM: {GEMINI_MODEL}")

🤖 Modelo ASR: openai/whisper-large-v3
🧠 Modelo LLM: gemini-2.0-flash


In [None]:
# Cell 4: ASR (speech recognition) setup
# Use the first CUDA GPU when one is available; -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1
# bfloat16 on GPU, float32 on CPU. The variable keeps its original name so
# other cells that read `torch_dtype` are unaffected.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

print(f"💻 Dispositivo: {'GPU' if device == 0 else 'CPU'}")
print(f"📊 Tipo de datos: {torch_dtype}")

try:
    print("🔄 Cargando modelo Whisper...")
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=ASR_MODEL,
        device=device,
        # `torch_dtype=` is deprecated in the installed transformers release
        # (the recorded output warns "Use `dtype` instead"), so pass `dtype=`.
        dtype=torch_dtype,
        return_timestamps=True,
        # Long audio is split into 30 s chunks with (left, right) = (4, 2) s of stride.
        chunk_length_s=30,
        stride_length_s=(4, 2),
    )
    print("✅ Modelo Whisper cargado exitosamente")

except Exception as e:
    print(f"❌ Error cargando ASR: {e}")
    # Bare `raise` re-raises the active exception with its traceback untouched.
    raise

💻 Dispositivo: GPU
📊 Tipo de datos: torch.bfloat16
🔄 Cargando modelo Whisper...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Modelo Whisper cargado exitosamente


In [None]:
# Celda 6: Función principal de procesamiento
def process_audio_with_gemini(audio_file_path, tipo_resumen, contexto):
    """
    Transcribe an audio file with Whisper and summarize the text with Gemini.

    Args:
        audio_file_path: Path of the audio file to process.
        tipo_resumen: Summary type; forwarded to `sumarizar_con_gemini` and
            echoed in the summary header.
        contexto: Context label; forwarded to `sumarizar_con_gemini` and shown
            title-cased in the summary header.

    Returns:
        A `(transcript, summary)` tuple of strings. On failure both positions
        hold messages starting with "❌". A transcript shorter than 50
        characters is returned as-is together with a notice instead of a summary.

    Relies on the notebook-level names `asr_pipe`, `sumarizar_con_gemini` and
    `GEMINI_MODEL`.

    NOTE(review): a later cell defines another function with this same name and
    a different two-argument signature; once that cell runs, it replaces this one.
    """
    if not audio_file_path:
        return "❌ No se ha cargado ningún archivo de audio", "❌ Sin audio para procesar"

    try:
        # Basic sanity checks on the input file
        if not os.path.exists(audio_file_path):
            return "❌ El archivo no existe", "❌ Archivo no encontrado"

        size_in_bytes = os.path.getsize(audio_file_path)
        if size_in_bytes < 1024:  # under 1 KB
            return "❌ El archivo es muy pequeño", "❌ Archivo muy pequeño"

        print(f"📁 Procesando: {audio_file_path} ({size_in_bytes} bytes)")

        # STEP 1: transcription with Whisper
        print("🎯 Iniciando transcripción con Whisper...")
        try:
            asr_output = asr_pipe(audio_file_path)
            print(f"📝 Resultado ASR: {type(asr_output)}")
        except Exception as asr_error:
            print(f"❌ Error en transcripción: {asr_error}")
            return f"❌ Error en transcripción: {str(asr_error)}", "❌ Error en transcripción"

        # Pull the plain text out of whichever result shape came back
        transcript = ""
        if isinstance(asr_output, str):
            transcript = asr_output.strip()
        elif isinstance(asr_output, dict):
            if "text" in asr_output:
                transcript = asr_output["text"].strip()
            elif "chunks" in asr_output:
                pieces = (piece["text"].strip() for piece in asr_output["chunks"] if "text" in piece)
                transcript = " ".join(pieces)
        elif isinstance(asr_output, list) and len(asr_output) > 0:
            first_item = asr_output[0]
            if isinstance(first_item, dict) and "text" in first_item:
                transcript = " ".join(item["text"].strip() for item in asr_output if "text" in item)

        print(f"📄 Transcripción obtenida: {len(transcript)} caracteres")

        if not transcript:
            return "❌ No se pudo obtener transcripción del audio", "❌ Transcripción vacía"

        if len(transcript) < 50:
            return transcript, "📝 El texto es muy corto para un resumen útil"

        # STEP 2: summarization with Gemini
        print("🧠 Generando resumen con Gemini...")
        summary_body = sumarizar_con_gemini(transcript, tipo_resumen, contexto)

        # Prefix the summary with a short metadata header
        header_lines = [
            f"📊 **Resumen generado con Gemini {GEMINI_MODEL}**",
            f"🎯 Tipo: {tipo_resumen} | 📋 Contexto: {contexto.title()}",
        ]
        header = "\n".join(header_lines) + "\n\n"

        return transcript, header + summary_body

    except Exception as exc:
        failure = f"❌ Error durante el procesamiento: {str(exc)}"
        print(failure)
        return failure, failure

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folders used by the batch job: where the source audio is read from and
# where the transcript .txt files are written. The `# @param` annotations are
# Colab form-field markup and are kept verbatim.
input_folder_path = "/content/drive/My Drive/nlp/audios a transcribir" # @param {type:"string"}
output_folder_path = "/content/drive/My Drive/nlp/txt para rag" # @param {type:"string"}

# Echo the configured folders
print(f"Input folder: {input_folder_path}")
print(f"Output folder: {output_folder_path}")

Input folder: /content/drive/My Drive/nlp/audios a transcribir
Output folder: /content/drive/My Drive/nlp/txt para rag


In [None]:
import os

# Create the output folder if it doesn't exist. exist_ok=True keeps this safe
# if the folder appears between the check and the call.
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path, exist_ok=True)
    print(f"✅ Output folder created: {output_folder_path}")
else:
    print(f"✅ Output folder already exists: {output_folder_path}")

# Full paths of the MP3 files to transcribe; stays empty when nothing is found.
mp3_files = []

# The input folder is never created here: the user has to create it in Drive
# and place the audio files inside. Only list it when it actually exists, so a
# missing folder yields one clear error instead of a second, redundant one.
if not os.path.isdir(input_folder_path):
    print(f"❌ Error: The input folder does not exist: {input_folder_path}")
    print("Please create this folder in your Google Drive and place your MP3 files inside.")
else:
    print(f"✅ Input folder exists: {input_folder_path}")
    try:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # batch order reproducible. Directories that happen to end in ".mp3"
        # are skipped because they cannot be transcribed.
        mp3_files = sorted(
            os.path.join(input_folder_path, filename)
            for filename in os.listdir(input_folder_path)
            if filename.lower().endswith('.mp3')
            and os.path.isfile(os.path.join(input_folder_path, filename))
        )
    except OSError as e:
        print(f"❌ An error occurred while listing MP3 files: {e}")
    else:
        if mp3_files:
            print(f"✅ Found {len(mp3_files)} MP3 files in '{input_folder_path}':")
            for f in mp3_files:
                print(f"- {f}")
        else:
            print(f"⚠️ No MP3 files found in '{input_folder_path}'. Please ensure your audio files are in .mp3 format.")

✅ Output folder created: /content/drive/My Drive/nlp/txt para rag
❌ Error: The input folder does not exist: /content/drive/My Drive/nlp/audios a transcribir
Please create this folder in your Google Drive and place your MP3 files inside.
❌ An error occurred while listing MP3 files: [Errno 2] No such file or directory: '/content/drive/My Drive/nlp/audios a transcribir'


In [None]:
def _extract_transcript_text(result):
    """Return the stripped transcript text from an ASR result, or "" if none is found.

    Handles the result shapes this notebook checks for: a dict with a "text"
    key, a dict with a "chunks" list, a plain string, or a list of chunk dicts.
    """
    if isinstance(result, dict):
        if "text" in result:
            return result["text"].strip()
        if "chunks" in result:
            return " ".join(chunk["text"].strip() for chunk in result["chunks"] if "text" in chunk)
        return ""
    if isinstance(result, str):
        return result.strip()
    if isinstance(result, list) and len(result) > 0:
        if isinstance(result[0], dict) and "text" in result[0]:
            return " ".join(chunk["text"].strip() for chunk in result if "text" in chunk)
    return ""


def process_audio_with_gemini(audio_file_path, output_folder_path):
    """
    Transcribe an audio file with Whisper and save the transcript as a .txt file.

    Despite the name, no Gemini summarization happens here; only transcription
    is performed. The transcript is written to
    `<output_folder_path>/<audio file name without extension>.txt` (UTF-8).

    Args:
        audio_file_path: Path of the audio file to transcribe.
        output_folder_path: Folder for the transcript file; created if missing.

    Returns:
        A `(text, "")` tuple. `text` is the transcript on success. On any
        failure, including a failure to write the transcript file, `text` is an
        error message starting with "❌", so callers can tell the two apart.

    Relies on the notebook-level `asr_pipe` pipeline.
    """
    if not audio_file_path:
        return "❌ No se ha cargado ningún archivo de audio", ""

    try:
        # Basic sanity checks on the input file
        if not os.path.exists(audio_file_path):
            return "❌ El archivo no existe", ""

        file_size = os.path.getsize(audio_file_path)
        if file_size < 1024:  # under 1 KB
            return "❌ El archivo es muy pequeño", ""

        print(f"📁 Procesando: {audio_file_path} ({file_size} bytes)")

        # STEP 1: transcription with Whisper
        print("🎯 Iniciando transcripción con Whisper...")
        try:
            result = asr_pipe(audio_file_path)
            print(f"📝 Resultado ASR: {type(result)}")
        except Exception as asr_error:
            print(f"❌ Error en transcripción: {asr_error}")
            return f"❌ Error en transcripción: {str(asr_error)}", ""

        raw_transcript = _extract_transcript_text(result)
        print(f"📄 Transcripción obtenida: {len(raw_transcript)} caracteres")

        if not raw_transcript:
            return "❌ No se pudo obtener transcripción del audio", ""

        # STEP 2: save the transcript into output_folder_path
        base_filename = os.path.splitext(os.path.basename(audio_file_path))[0]
        transcript_output_path = os.path.join(output_folder_path, f"{base_filename}.txt")

        try:
            # Make sure the destination exists; the caller may not have created it.
            os.makedirs(output_folder_path, exist_ok=True)
            with open(transcript_output_path, "w", encoding="utf-8") as f_transcript:
                f_transcript.write(raw_transcript)
        except Exception as file_error:
            # Saving is the purpose of this function, so a failed write is
            # reported as an error instead of being passed off as success.
            save_error_msg = f"❌ Error al guardar la transcripción en {transcript_output_path}: {file_error}"
            print(save_error_msg)
            return save_error_msg, ""

        print(f"✅ Transcripción guardada en: {transcript_output_path}")

        # The second element is kept (always empty) for compatibility with the
        # earlier (transcript, summary) return shape.
        return raw_transcript, ""

    except Exception as e:
        error_msg = f"❌ Error durante el procesamiento: {str(e)}"
        print(error_msg)
        return error_msg, ""


In [None]:
# Batch driver: transcribe every discovered MP3 and report per-file status.
print("Iniciando procesamiento por lotes con solo transcripción.")

if not mp3_files:
    print("⚠️ No hay archivos MP3 para procesar en lote.")
else:
    total_files = len(mp3_files)
    print(f"Iniciando procesamiento de {total_files} archivos MP3...")
    for position, audio_file_path in enumerate(mp3_files, start=1):
        file_name = os.path.basename(audio_file_path)
        print(f"\n--- Procesando archivo {position}/{total_files}: {file_name} ---")
        try:
            # The second return value is always empty in the transcription-only version.
            raw_transcript, _ = process_audio_with_gemini(
                audio_file_path,
                output_folder_path
            )
            # Every error message returned by process_audio_with_gemini starts
            # with "❌". Checking the prefix (not mere containment) avoids
            # flagging a valid transcript that happens to contain that character.
            if raw_transcript.startswith("❌"):
                print(f"🚨 Error procesando {file_name}: {raw_transcript}")
            else:
                print(f"✅ Procesamiento completado para {file_name}.")
        except Exception as e:
            # Safety net so one bad file never aborts the rest of the batch.
            print(f"❌ Error crítico al procesar {file_name}: {e}")

print("\n--- Procesamiento por lotes finalizado ---")


Iniciando procesamiento por lotes con solo transcripción.
Iniciando procesamiento de 3 archivos MP3...

--- Procesando archivo 1/3: audiosjaviermilei127462127479milei-freecubanpodcasts-ivoox92479733.mp3 ---
📁 Procesando: /content/drive/My Drive/nlp/audios a transcribir/audiosjaviermilei127462127479milei-freecubanpodcasts-ivoox92479733.mp3 (2748409 bytes)
🎯 Iniciando transcripción con Whisper...


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


📝 Resultado ASR: <class 'dict'>
📄 Transcripción obtenida: 5271 caracteres
✅ Transcripción guardada en: /content/drive/My Drive/nlp/transcripciones/audiosjaviermilei127462127479milei-freecubanpodcasts-ivoox92479733.txt
✅ Procesamiento completado para audiosjaviermilei127462127479milei-freecubanpodcasts-ivoox92479733.mp3.

--- Procesando archivo 2/3: audiosjaviermileipingpongconjavier-freecubangonzalezroman-ivoox75249217.mp3 ---
📁 Procesando: /content/drive/My Drive/nlp/audios a transcribir/audiosjaviermileipingpongconjavier-freecubangonzalezroman-ivoox75249217.mp3 (3653059 bytes)
🎯 Iniciando transcripción con Whisper...
📝 Resultado ASR: <class 'dict'>
📄 Transcripción obtenida: 6576 caracteres
✅ Transcripción guardada en: /content/drive/My Drive/nlp/transcripciones/audiosjaviermileipingpongconjavier-freecubangonzalezroman-ivoox75249217.txt
✅ Procesamiento completado para audiosjaviermileipingpongconjavier-freecubangonzalezroman-ivoox75249217.mp3.

--- Procesando archivo 3/3: audiosjavier

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


📝 Resultado ASR: <class 'dict'>
📄 Transcripción obtenida: 7979 caracteres
✅ Transcripción guardada en: /content/drive/My Drive/nlp/transcripciones/audiosjaviermileienfrentamientojavie-freecubangonzalezroman-ivoox74747583.txt
✅ Procesamiento completado para audiosjaviermileienfrentamientojavie-freecubangonzalezroman-ivoox74747583.mp3.

--- Procesamiento por lotes finalizado ---
