In [25]:
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
import os
import librosa
import numpy as np
import pandas as pd
import random




In [26]:
# Root directory of the extracted UrbanSound8K dataset, relative to the notebook.
DATASET_PATH = "UrbanSound8K"

# The per-slice annotations (file name, fold, class id, class label, ...)
# ship with the dataset as a single CSV file.
metadata_path = os.path.join(
    DATASET_PATH,
    "metadata",
    "UrbanSound8K.csv",
)
metadata = pd.read_csv(metadata_path)

# Quick look at the first rows to confirm the table loaded.
print(metadata.head())


      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


In [27]:

class Clip:
    """
    A single UrbanSound8K clip, shaped after soundata's clip objects.

    Attributes:
    - clip_id:     identifier the clip was requested with
    - audio:       tuple ``(samples, sample_rate)``
    - class_label: class name
    - class_id:    numeric class id
    - fold:        fold number, always stored as ``int``
    - file_path:   path of the audio file the clip was read from
    """

    def __init__(self, clip_id, audio, sr, class_label, class_id, fold, file_path):
        # Where the clip comes from.
        self.clip_id = clip_id
        self.file_path = file_path
        self.fold = int(fold)  # normalise whatever numeric/str type was passed in

        # What the clip is.
        self.class_id = class_id
        self.class_label = class_label

        # Waveform and its sampling rate travel together as one tuple.
        self.audio = (audio, sr)

    def __repr__(self):
        summary = (
            f"Clip(id={self.clip_id}, class='{self.class_label}', "
            f"class_id={self.class_id}, fold={self.fold})"
        )
        return summary


class UrbanSound8KDataset:
    """
    Wrapper around an on-disk copy of UrbanSound8K that mimics soundata's API.

    Main attributes:
    - metadata: DataFrame holding the whole metadata CSV
    - clip_ids: list of clip IDs (the DataFrame's index labels)
    - folds: dict {fold: DataFrame filtered to that fold}

    Main methods:
    - get_clip(clip_id) -> Clip
    - choice_clip() -> random Clip
    """

    def __init__(self, base_path="UrbanSound8K", sr=22050):
        """
        Read the metadata CSV found under ``base_path`` and index the clips.

        Parameters
        ----------
        base_path : str
            Dataset root. The metadata is read from
            ``<base_path>/metadata/UrbanSound8K.csv`` and audio files are
            looked up under ``<base_path>/audio/fold<N>/``.
        sr : int
            Sampling rate handed to ``librosa.load`` when a clip is read.

        Raises
        ------
        FileNotFoundError
            If the metadata CSV does not exist.
        """
        self.base_path = base_path
        self.sr = sr

        # Load metadata
        metadata_path = os.path.join(base_path, "metadata", "UrbanSound8K.csv")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(
                f"No se encontr√≥ el archivo de metadata en: {metadata_path}"
            )

        self.metadata = pd.read_csv(metadata_path)

        # Strip stray whitespace from column names
        self.metadata.columns = self.metadata.columns.str.strip()

        # Clip IDs (the DataFrame index doubles as the clip ID)
        self.clip_ids = list(self.metadata.index)

        # Hash-based companion of `clip_ids`, built once here, so validating an
        # ID in get_clip() is O(1) instead of a linear scan of the list on
        # every call (which made a full pass over the dataset quadratic).
        self._clip_id_lookup = frozenset(self.clip_ids)

        # Precompute folds 1..10
        self.folds = {
            fold: self.metadata[self.metadata["fold"] == fold]
            for fold in range(1, 11)
        }

    def get_clip(self, clip_id):
        """
        Return a Clip object, loading its audio from disk.

        Parameters
        ----------
        clip_id
            Must be one of the IDs in ``self.clip_ids`` as they were when the
            dataset was constructed (an index label of ``self.metadata``).

        Returns
        -------
        Clip
            ``clip.audio`` is ``(samples, sr)`` as returned by
            ``librosa.load(..., sr=self.sr, mono=True)``.

        Raises
        ------
        ValueError
            If ``clip_id`` is not a known clip ID.
        FileNotFoundError
            If the audio file referenced by the metadata row is missing.
        """
        try:
            known = clip_id in self._clip_id_lookup
        except TypeError:
            # Unhashable IDs (lists, dicts, ...) can never be valid; report
            # them with the same ValueError as any other unknown ID.
            known = False
        if not known:
            raise ValueError(f"clip_id {clip_id} no es v√°lido.")

        row = self.metadata.loc[clip_id]

        fold = int(row["fold"])
        filename = row["slice_file_name"]
        class_label = row["class"]
        class_id = int(row["classID"])

        audio_path = os.path.join(self.base_path, "audio", f"fold{fold}", filename)

        if not os.path.exists(audio_path):
            raise FileNotFoundError(
                f"No se encontr√≥ el archivo de audio en: {audio_path}"
            )

        # Load audio as mono, resampled to self.sr
        audio, sr = librosa.load(audio_path, sr=self.sr, mono=True)

        return Clip(
            clip_id=clip_id,
            audio=audio,
            sr=sr,
            class_label=class_label,
            class_id=class_id,
            fold=fold,
            file_path=audio_path,
        )

    def choice_clip(self):
        """Return a random Clip from the dataset."""
        cid = random.choice(self.clip_ids)
        return self.get_clip(cid)

    def __len__(self):
        """Number of clips listed in the metadata."""
        return len(self.clip_ids)

    @property
    def num_clips(self):
        """Number of clips listed in the metadata (same value as ``len(self)``)."""
        return len(self.clip_ids)

    def __repr__(self):
        return f"UrbanSound8KDataset(num_clips={len(self)}, folds=10)"


In [None]:
# Feature-extraction settings shared by the cells below.
SR = 22050       # sampling rate every clip is loaded / resampled to
DURATION = 4.0   # seconds of audio kept per clip
N_MFCC = 40      # number of MFCC coefficients per frame

# Dataset wrapper configured to load audio at the sampling rate above.
dataset = UrbanSound8KDataset(base_path="UrbanSound8K", sr=SR)

# One randomly chosen clip, handy for interactive inspection.
example_clip = dataset.choice_clip()

def load_clip_mfcc(clip):
    """
    Turn a clip into a fixed-size MFCC matrix.

    The waveform in ``clip.audio`` is resampled to ``SR`` if needed, then
    zero-padded at the end or truncated to exactly ``SR * DURATION`` samples,
    and finally converted to ``N_MFCC`` MFCC coefficients per frame.

    Returns the array produced by ``librosa.feature.mfcc``, laid out as
    (N_MFCC, frames).
    """
    signal, native_sr = clip.audio

    # Bring the waveform to the common sampling rate.
    if native_sr != SR:
        signal = librosa.resample(signal, orig_sr=native_sr, target_sr=SR)

    # Force a fixed length so every clip yields the same number of frames.
    target_len = int(SR * DURATION)
    missing = target_len - len(signal)
    if missing > 0:
        signal = np.pad(signal, (0, missing))
    else:
        signal = signal[:target_len]

    return librosa.feature.mfcc(y=signal, sr=SR, n_mfcc=N_MFCC)


In [29]:
def load_data_for_fold(dataset, test_fold):
    """
    Load every clip of ``dataset`` and split the MFCC features by fold.

    Clips whose fold equals ``test_fold`` go to the test split; all other
    clips go to the training split. Order within each split follows
    ``dataset.clip_ids``.

    Returns ``(X_train, y_train, X_test, y_test)`` as NumPy arrays, where the
    X arrays stack the per-clip MFCC matrices and the y arrays hold class ids.
    """
    features = {"train": [], "test": []}
    labels = {"train": [], "test": []}

    for clip_id in dataset.clip_ids:
        clip = dataset.get_clip(clip_id)
        mfcc = load_clip_mfcc(clip)

        split = "test" if clip.fold == test_fold else "train"
        features[split].append(mfcc)
        labels[split].append(clip.class_id)

    return (
        np.array(features["train"]),
        np.array(labels["train"]),
        np.array(features["test"]),
        np.array(labels["test"]),
    )


In [30]:
def prepare_lstm_features(X):
    """
    Transpose every MFCC matrix in ``X`` so that time becomes the first axis.

    Each element goes from (n_mfcc, T) to (T, n_mfcc); the transposed
    matrices are stacked into a new NumPy array.
    """
    return np.array([coeffs.T for coeffs in X])


In [31]:
def build_lstm_model(n_timesteps, n_features, n_classes=10):
    """
    Build and compile a two-layer LSTM classifier for frame sequences.

    Parameters
    ----------
    n_timesteps : int
        Number of frames per input sequence.
    n_features : int
        Number of features per frame.
    n_classes : int, default 10
        Width of the softmax output layer. The default matches the previous
        hard-coded value, so existing two-argument calls are unaffected.

    Returns
    -------
    A compiled Keras ``Sequential`` model. It is compiled with
    ``sparse_categorical_crossentropy``, so labels must be integer class ids
    rather than one-hot vectors.
    """
    model = models.Sequential([
        layers.Input(shape=(n_timesteps, n_features)),
        # First LSTM emits its full output sequence so the second LSTM can
        # consume it; the second one returns only its final output.
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(32, activation='relu'),
        layers.Dense(n_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


In [32]:
# ---------------------------------------------------
# 10-FOLD CROSS VALIDATION (LSTM)
# ---------------------------------------------------

# Seed Python's `random`, NumPy and TensorFlow so that weight initialisation
# and batch shuffling are repeatable from one full run to the next.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

accuracies = []

print("\n===================================================")
print("üîµ INICIANDO 10-FOLD CROSS VALIDATION (LSTM)")
print("===================================================\n")

# Decoding the audio and computing MFCCs does not depend on which fold is held
# out, so do it exactly once here instead of repeating it for every fold.
all_mfcc, all_labels, all_folds = [], [], []
for cid in dataset.clip_ids:
    clip = dataset.get_clip(cid)
    all_mfcc.append(load_clip_mfcc(clip))
    all_labels.append(clip.class_id)
    all_folds.append(clip.fold)

all_mfcc = np.array(all_mfcc)
all_labels = np.array(all_labels)
all_folds = np.array(all_folds)

for fold in range(1, 11):

    print("\n===================================================")
    print(f"üöÄ Ejecutando Fold {fold} ...")
    print("===================================================\n")

    # 1. Split the precomputed features: the current fold is the test set and
    #    every other fold is training data (clip order is preserved).
    is_test = all_folds == fold
    X_train_raw, y_train = all_mfcc[~is_test], all_labels[~is_test]
    X_test_raw, y_test = all_mfcc[is_test], all_labels[is_test]

    print(f"Fold {fold}: Train={len(X_train_raw)}, Test={len(X_test_raw)}")
    print(f"Shapes MFCC: train={X_train_raw.shape}, test={X_test_raw.shape}")

    # 2. Rearrange MFCC matrices into the (T, n_mfcc) layout the LSTM expects
    X_train_lstm = prepare_lstm_features(X_train_raw)
    X_test_lstm  = prepare_lstm_features(X_test_raw)

    n_timesteps = X_train_lstm.shape[1]
    n_features  = X_train_lstm.shape[2]

    print(f"Shapes LSTM: train={X_train_lstm.shape}, test={X_test_lstm.shape}")

    # 3. Build a fresh LSTM model. Clearing the Keras session first releases
    #    the global state left behind by the previous fold's model.
    tf.keras.backend.clear_session()
    lstm_model = build_lstm_model(n_timesteps, n_features)

    # 4. Train.
    #    NOTE: validation_data is the held-out *test* fold, so the val_* numbers
    #    in the log are test metrics shown for monitoring only. Do not use them
    #    for early stopping or model selection, or the reported accuracy will
    #    be optimistically biased.
    history = lstm_model.fit(
        X_train_lstm, y_train,
        epochs=20,
        batch_size=32,
        validation_data=(X_test_lstm, y_test),
        verbose=1
    )

    # 5. Evaluate on the held-out fold
    test_loss, test_acc = lstm_model.evaluate(X_test_lstm, y_test, verbose=0)
    accuracies.append(test_acc)

    print(f"\nüéØ Accuracy del Fold {fold}: {test_acc:.4f}")
    print("---------------------------------------------------")

# ---------------------------------------------------
# FINAL RESULTS
# ---------------------------------------------------
print("\n===================================================")
print("üîµ RESULTADOS DEL 10-FOLD CROSS VALIDATION (LSTM)")
print("===================================================\n")

print("Accuracies por fold:", accuracies)

mean_acc = np.mean(accuracies)
std_acc  = np.std(accuracies)

print(f"\nüìå Accuracy promedio:      {mean_acc:.4f}")
print(f"üìå Desviaci√≥n est√°ndar:    {std_acc:.4f}")

print("\n===================================================")
print("üèÅ ENTRENAMIENTO COMPLETO")
print("===================================================")



üîµ INICIANDO 10-FOLD CROSS VALIDATION (LSTM)


üöÄ Ejecutando Fold 1 ...



Fold 1: Train=7859, Test=873
Shapes MFCC: train=(7859, 40, 173), test=(873, 40, 173)
Shapes LSTM: train=(7859, 173, 40), test=(873, 173, 40)
Epoch 1/20
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m24s[0m 83ms/step - accuracy: 0.4190 - loss: 1.7328 - val_accuracy: 0.3562 - val_loss: 1.8499
Epoch 2/20
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m25s[0m 103ms/step - accuracy: 0.5844 - loss: 1.2619 - val_accuracy: 0.4536 - val_loss: 1.8154
Epoch 3/20
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m39s[0m 95ms/step - accuracy: 0.6380 - loss: 1.1101 - val_accuracy: 0.4777 - val_loss: 1.6882
Epoch 4/20
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m26s[0m 108ms/step - accuracy: 0.6913 - loss: 0.9550 - val_accuracy: 0.4662 - val_loss: 1.8319
Epoch 5/20
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ

KeyboardInterrupt: 