In [1]:
import os
import warnings

# Silence deprecation chatter. The filter is installed BEFORE the third-party
# imports below so that it also applies to warnings emitted while those
# packages are being imported (a filter added afterwards cannot hide them).
warnings.filterwarnings(
    "ignore",
    message=".*deprecated.*"
)

# Windows workaround (per the original note). These are set before the imports
# below, presumably so the libraries pick them up when they load.
# NOTE(review): presumably they make the model caches copy files instead of
# creating symlinks -- confirm the exact variable names against the
# huggingface_hub / SpeechBrain documentation.
os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["SPEECHBRAIN_CACHE_STRATEGY"] = "copy"

from dotenv import load_dotenv
from pydub import AudioSegment
from pyannote.audio import Pipeline

import ipywidgets as widgets
# NOTE(review): the slider is created and immediately discarded (it is not the
# last expression of the cell, so nothing is displayed). Looks like a leftover
# "is ipywidgets working?" check -- confirm and remove if not needed.
widgets.IntSlider()

# Load environment variables from .env
load_dotenv()

# The Hugging Face access token must be provided through the environment
# (or the .env file loaded above); stop early with a clear message otherwise.
HF_TOKEN = os.getenv("PYANNOTE_HF_TOKEN")
assert HF_TOKEN, "PYANNOTE_HF_TOKEN not defined."

  torchaudio.set_audio_backend("soundfile")


In [11]:
# --- Paths -----------------------------------------------------------------
INPUT_DIR = "../audios"
OUTPUT_DIR = "./output"

# --- Cut heuristics --------------------------------------------------------
# Lead-in, in seconds, kept before the moment the child starts speaking.
PRE_CUT_SECONDS = 1.5
# Minimum segment duration, in seconds, to be considered "the child's voice".
MIN_CHILD_SEGMENT = 1.0

# Make sure the output folder exists before any file is exported.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Load the pretrained speaker-diarization pipeline, authenticating with the
# token read from the environment.
pipeline = Pipeline.from_pretrained(
   'pyannote/speaker-diarization-3.0',
   use_auth_token=HF_TOKEN
)

# from_pretrained may return None instead of raising when the download fails
# (e.g. invalid token or model conditions not accepted). Fail here with a
# clear message rather than later with "'NoneType' object is not callable".
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.0'. Check that "
        "PYANNOTE_HF_TOKEN is valid and that the model's user conditions "
        "have been accepted on Hugging Face."
    )

  from torchaudio.backend.common import AudioMetaData


In [9]:
# ==========================
# MAIN FUNCTION
# ==========================
def process_audio(audio_path):
    """Trim a WAV file so that it starts shortly before the child speaks.

    The file is diarized with the module-level ``pipeline``. The speaker of
    the earliest segment is assumed to be the teacher; the first segment from
    any other speaker lasting at least ``MIN_CHILD_SEGMENT`` seconds is taken
    as the start of the child's speech. The audio is cut
    ``PRE_CUT_SECONDS`` before that point (never before 0).

    Args:
        audio_path: Path to the input WAV file.

    Returns:
        ``(cut_audio, cut_time)`` where ``cut_audio`` is the trimmed
        ``AudioSegment`` and ``cut_time`` is the cut position in seconds,
        or ``None`` when fewer than two speakers are found or no sufficiently
        long segment from a second speaker exists.
    """
    print(f"Procesando: {os.path.basename(audio_path)}")

    # Load audio (pydub slices are expressed in milliseconds).
    audio = AudioSegment.from_wav(audio_path)

    # Diarization
    diarization = pipeline(audio_path)

    # Collect the speaker turns, ordered by start time.
    segments = sorted(
        (
            {
                "speaker": speaker,
                "start": turn.start,
                "end": turn.end,
                "duration": turn.end - turn.start,
            }
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ),
        key=lambda seg: seg["start"],
    )

    # Count distinct speakers (not segments): several segments from a single
    # speaker still means there is nobody else to cut to. This also covers
    # the empty / single-segment cases.
    speakers = {seg["speaker"] for seg in segments}
    if len(speakers) < 2:
        print("⚠️ No se detectaron suficientes hablantes")
        return None

    # ==========================
    # ASSUMPTIONS:
    # - First speaker = teacher
    # - First sufficiently long segment from another speaker = child
    # ==========================
    first_speaker = segments[0]["speaker"]

    child_start = next(
        (
            seg["start"]
            for seg in segments
            if seg["speaker"] != first_speaker
            and seg["duration"] >= MIN_CHILD_SEGMENT
        ),
        None,
    )

    if child_start is None:
        print("⚠️ No se detectó inicio del niño")
        return None

    # Apply the lead-in margin, clamped so the cut never goes before 0 s.
    cut_time = max(0, child_start - PRE_CUT_SECONDS)

    # Cut the audio from that point to the end (seconds -> milliseconds).
    cut_audio = audio[int(cut_time * 1000):]

    return cut_audio, cut_time

In [None]:
# ==========================
# PROCESS FOLDER
# ==========================
# Runs process_audio on every WAV file in INPUT_DIR and writes the trimmed
# result to OUTPUT_DIR as "<name>_cut.wav". Files for which no cut point is
# found are skipped.
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and the log) reproducible.
for file in sorted(os.listdir(INPUT_DIR)):
    if not file.lower().endswith(".wav"):
        continue

    # Strip the extension by position rather than with str.replace: replace is
    # case-sensitive (a ".WAV" file would get no "_cut" suffix) and would also
    # rewrite any ".wav" appearing earlier in the file name.
    stem = file[:-len(".wav")]

    input_path = os.path.join(INPUT_DIR, file)
    output_path = os.path.join(OUTPUT_DIR, f"{stem}_cut.wav")

    result = process_audio(input_path)

    if result is None:
        continue

    cut_audio, cut_time = result
    cut_audio.export(output_path, format="wav")

    print(f"✔ Guardado: {output_path} (corte en {cut_time:.2f}s)")


Procesando: TA10005.wav
⚠️ No se detectó inicio del niño
Procesando: TA10014.wav
⚠️ No se detectó inicio del niño
Procesando: TA10033.wav
✔ Guardado: ./output\TA10033_cut.wav (corte en 9.05s)
Procesando: TA40171.wav
⚠️ No se detectó inicio del niño
Procesando: TA40173.wav
✔ Guardado: ./output\TA40173_cut.wav (corte en 5.42s)
Procesando: TA40176.wav
