In [26]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
import random



In [27]:
DATASET_PATH = "UrbanSound8K"

metadata_path = os.path.join(DATASET_PATH, "metadata", "UrbanSound8K.csv")
metadata = pd.read_csv(metadata_path)

print(metadata.head())

      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  
0          dog_bark  
1  children_playing  
2  children_playing  
3  children_playing  
4  children_playing  


In [28]:

class Clip:
    """
    Objeto que representa un clip individual del UrbanSound8K.
    Emula el estilo de soundata:

    - clip.audio -> (y, sr)
    - clip.class_label
    - clip.class_id
    - clip.fold
    - clip.file_path
    """
    def __init__(self, clip_id, audio, sr, class_label, class_id, fold, file_path):
        self.clip_id = clip_id
        self.audio = (audio, sr)
        self.class_label = class_label
        self.class_id = class_id
        self.fold = int(fold)
        self.file_path = file_path

    def __repr__(self):
        return (
            f"Clip(id={self.clip_id}, class='{self.class_label}', "
            f"class_id={self.class_id}, fold={self.fold})"
        )


class UrbanSound8KDataset:
    """
    Dataset wrapper para UrbanSound8K que imita el comportamiento de soundata.

    Atributos principales:
    - metadata: DataFrame con todo el CSV
    - clip_ids: lista de IDs de clip (√≠ndices del DataFrame)
    - folds: dict {fold: DataFrame filtrada}

    M√©todos principales:
    - get_clip(clip_id) -> Clip
    - choice_clip() -> Clip aleatorio
    """

    def __init__(self, base_path="UrbanSound8K", sr=22050):
        self.base_path = base_path
        self.sr = sr

        # Cargar metadata
        metadata_path = os.path.join(base_path, "metadata", "UrbanSound8K.csv")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(
                f"No se encontr√≥ el archivo de metadata en: {metadata_path}"
            )

        self.metadata = pd.read_csv(metadata_path)

        # Limpiar posibles espacios en nombres de columnas
        self.metadata.columns = self.metadata.columns.str.strip()

        # IDs de clips (usamos el √≠ndice del DataFrame como ID)
        self.clip_ids = list(self.metadata.index)

        # Precomputar folds 1..10
        self.folds = {
            fold: self.metadata[self.metadata["fold"] == fold]
            for fold in range(1, 11)
        }

    def get_clip(self, clip_id):
        """
        Devuelve un objeto Clip, cargando el audio desde disco.

        clip_id debe ser un √≠ndice v√°lido de self.metadata (0..len-1).
        """
        if clip_id not in self.clip_ids:
            raise ValueError(f"clip_id {clip_id} no es v√°lido.")

        row = self.metadata.loc[clip_id]

        fold = int(row["fold"])
        filename = row["slice_file_name"]
        class_label = row["class"]
        class_id = int(row["classID"])

        audio_path = os.path.join(self.base_path, "audio", f"fold{fold}", filename)

        if not os.path.exists(audio_path):
            raise FileNotFoundError(
                f"No se encontr√≥ el archivo de audio en: {audio_path}"
            )

        # Cargar audio como mono, remuestreado a self.sr
        audio, sr = librosa.load(audio_path, sr=self.sr, mono=True)

        return Clip(
            clip_id=clip_id,
            audio=audio,
            sr=sr,
            class_label=class_label,
            class_id=class_id,
            fold=fold,
            file_path=audio_path,
        )

    def choice_clip(self):
        """Devuelve un Clip aleatorio del dataset."""
        cid = random.choice(self.clip_ids)
        return self.get_clip(cid)

    def __len__(self):
        return len(self.clip_ids)

    @property
    def num_clips(self):
        return len(self.clip_ids)

    def __repr__(self):
        return f"UrbanSound8KDataset(num_clips={len(self)}, folds=10)"


In [29]:
SR = 22050
DURATION = 4.0
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512

dataset = UrbanSound8KDataset("UrbanSound8K", sr=SR)
example_clip = dataset.choice_clip()

def load_clip_melspec(clip):
    """
    Carga un clip de UrbanSound8K y devuelve log-mel spectrogram de shape (T, N_MELS)
    """
    y, sr = clip.audio


    # Resample
    if sr != SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=SR)

    # Duraci√≥n fija
    max_len = int(SR * DURATION)
    if len(y) < max_len:
        y = np.pad(y, (0, max_len - len(y)))
    else:
        y = y[:max_len]

    # Mel-spectrogram
    S = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS
    )
    S_db = librosa.power_to_db(S, ref=np.max)  # (N_MELS, T)

    # Transponer a (T, N_MELS) para usar Conv1D sobre el tiempo
    return S_db.T


In [30]:
def load_data_for_fold_mel(dataset, test_fold):
    X_train, y_train = [], []
    X_test, y_test = [], []

    for cid in dataset.clip_ids:
        clip = dataset.get_clip(cid)

        mel = load_clip_melspec(clip)      # (T, N_MELS)
        label = clip.class_id
        fold = clip.fold

        if fold == test_fold:
            X_test.append(mel)
            y_test.append(label)
        else:
            X_train.append(mel)
            y_train.append(label)

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)


In [31]:
def prepare_cnn_lstm_features(X):
    """
    Asegura que X sea un np.array de shape (num_samples, T, N_MELS)
    """
    return np.array(X)


In [32]:
def build_cnn_lstm_model(n_timesteps, n_mels):
    inputs = layers.Input(shape=(n_timesteps, n_mels))

    x = layers.Conv1D(64, kernel_size=5, activation='relu', padding='same')(inputs)
    x = layers.MaxPooling1D(pool_size=2)(x)

    x = layers.Conv1D(128, kernel_size=5, activation='relu', padding='same')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)

    x = layers.LSTM(64)(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(10, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [33]:
accuracies_cnn_lstm = []

print("\n===================================================")
print("üîµ INICIANDO 10-FOLD CROSS VALIDATION (CNN + LSTM)")
print("===================================================\n")

for fold in range(1, 11):

    print(f"\n===================================================")
    print(f"üöÄ Ejecutando Fold {fold} ...")
    print("===================================================\n")

    # 1. Cargar datos del fold con mel-spectrogram
    X_train_raw, y_train, X_test_raw, y_test = load_data_for_fold_mel(dataset, test_fold=fold)

    print(f"Fold {fold}: Train={len(X_train_raw)}, Test={len(X_test_raw)}")
    print(f"Mel shapes: train={X_train_raw.shape}, test={X_test_raw.shape}")

    # 2. Preparar tensores para CNN + LSTM (T, N_MELS)
    X_train_cnnlstm = prepare_cnn_lstm_features(X_train_raw)
    X_test_cnnlstm  = prepare_cnn_lstm_features(X_test_raw)

    n_timesteps = X_train_cnnlstm.shape[1]
    n_mels = X_train_cnnlstm.shape[2]

    print(f"Input shape para modelo: (timesteps={n_timesteps}, mels={n_mels})")

    # 3. Crear modelo
    cnn_lstm_model = build_cnn_lstm_model(n_timesteps, n_mels)

    # 4. Entrenar
    history = cnn_lstm_model.fit(
        X_train_cnnlstm, y_train,
        epochs=15,
        batch_size=32,
        validation_data=(X_test_cnnlstm, y_test),
        verbose=1
    )

    # 5. Evaluar
    test_loss, test_acc = cnn_lstm_model.evaluate(X_test_cnnlstm, y_test, verbose=0)
    accuracies_cnn_lstm.append(test_acc)

    print(f"\nüéØ Accuracy del Fold {fold}: {test_acc:.4f}")
    print("---------------------------------------------------")

# ---------------------------------------------------
# RESULTADOS FINALES
# ---------------------------------------------------
import numpy as np

print("\n===================================================")
print("üîµ RESULTADOS DEL 10-FOLD CROSS VALIDATION (CNN + LSTM)")
print("===================================================\n")

print("Accuracies por fold:", accuracies_cnn_lstm)

mean_acc = np.mean(accuracies_cnn_lstm)
std_acc  = np.std(accuracies_cnn_lstm)

print(f"\nüìå Accuracy promedio:      {mean_acc:.4f}")
print(f"üìå Desviaci√≥n est√°ndar:    {std_acc:.4f}")

print("\n===================================================")
print("üèÅ ENTRENAMIENTO COMPLETO (CNN + LSTM)")
print("===================================================")



üîµ INICIANDO 10-FOLD CROSS VALIDATION (CNN + LSTM)


üöÄ Ejecutando Fold 1 ...

Fold 1: Train=7859, Test=873
Mel shapes: train=(7859, 173, 128), test=(873, 173, 128)
Input shape para modelo: (timesteps=173, mels=128)
Epoch 1/15
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m11s[0m 29ms/step - accuracy: 0.1113 - loss: 2.2745 - val_accuracy: 0.1145 - val_loss: 2.2519
Epoch 2/15
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m12s[0m 48ms/step - accuracy: 0.1071 - loss: 2.2635 - val_accuracy: 0.1145 - val_loss: 2.2484
Epoch 3/15
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.1111 - loss: 2.2632 - val_accuracy: 0.1100 - val_loss: 2.2507
Epoch 4/15
[1m246/246[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 61ms/step - accuracy: 0.1130 - loss: 2.2637 - val

KeyboardInterrupt: 