In [9]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, models
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import joblib

In [12]:
DATASET_PATH = "C:/Users/Estudio/Documents/ParcialFinalDeepLearning/UrbanSound8K/UrbanSound8k" # ajusta si tu dataset está en otro lugar
METADATA_CSV = os.path.join(DATASET_PATH, "metadata", "UrbanSound8K.csv")
SAMPLE_RATE = 16000            # YAMNet usa 16 kHz
DURATION = 4.0                 # segundos (igual que usabas)
NUM_CLASSES = 10
BATCH_SIZE = 32
EPOCHS = 20
RANDOM_SEED = 42

# --- Cargar metadata ---
metadata = pd.read_csv(METADATA_CSV)
metadata.columns = metadata.columns.str.strip()

# --- Cargar YAMNet (TF-Hub) ---
# Nota: esto descargará el modelo la primera vez. Requiere internet.
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
# yamnet(waveform) devuelve (scores, embeddings, spectrogram)
print("YAMNet cargado.")













YAMNet cargado.


In [13]:
def load_audio_for_yamnet(path, sr=SAMPLE_RATE, duration=DURATION):
    y, orig_sr = librosa.load(path, sr=None, mono=True)
    # Remuestrear si es necesario
    if orig_sr != sr:
        y = librosa.resample(y, orig_sr=orig_sr, target_sr=sr)
    max_len = int(sr * duration)
    if len(y) < max_len:
        y = np.pad(y, (0, max_len - len(y)))
    else:
        y = y[:max_len]
    # YAMNet espera float32 en [-1.0, 1.0]
    return y.astype(np.float32)

def extract_yamnet_embedding(waveform):
    """
    waveform: 1D numpy array a 16kHz float32
    devuelve: embedding agregado del clip (mean + std concatenadas)
    """
    # yamnet espera un tensor shape (num_samples,)
    scores, embeddings, spectrogram = yamnet(waveform)
    # embeddings: (num_patches, 1024)
    emb_np = embeddings.numpy()
    mean = np.mean(emb_np, axis=0)
    std = np.std(emb_np, axis=0)
    return np.concatenate([mean, std])  # tamaño 2048

In [14]:
# --- Precomputar features para todo el dataset (puede tardar) ---
features = []
labels = []
filepaths = []

for idx, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Extrayendo features"):
    fold = int(row["fold"])
    filename = row["slice_file_name"]
    class_id = int(row["classID"])
    audio_path = os.path.join(DATASET_PATH, "audio", f"fold{fold}", filename)
    if not os.path.exists(audio_path):
        # opcional: omitir o lanzar error
        print("No encontrado:", audio_path)
        continue
    y = load_audio_for_yamnet(audio_path)
    emb = extract_yamnet_embedding(y)
    features.append(emb)
    labels.append(class_id)
    filepaths.append(audio_path)

features = np.array(features)   # (N, 2048)
labels = np.array(labels)
print("Features shape:", features.shape)


Extrayendo features: 100%|██████████| 8732/8732 [03:30<00:00, 41.47it/s]


Features shape: (8732, 2048)


In [15]:
# --- Normalizar features (fit sobre todo; para CV perfecto usar pipeline) ---
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Guardar scaler para inferencia
joblib.dump(scaler, "yamnet_scaler.joblib")


['yamnet_scaler.joblib']

In [16]:
# --- Clasificador simple (Dense) ---
def make_classifier_model(input_dim, num_classes=NUM_CLASSES):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(512, activation="relu")(inputs)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model

In [17]:
# --- Validación cruzada estratificada 10-fold ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
accs = []
fold_idx = 0

for train_idx, test_idx in skf.split(features_scaled, labels):
    fold_idx += 1
    print(f"\n=== Fold {fold_idx} ===")
    X_train, X_test = features_scaled[train_idx], features_scaled[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model = make_classifier_model(input_dim=X_train.shape[1], num_classes=NUM_CLASSES)

    # EarlyStopping para no sobreentrenar
    es = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=4, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[es],
        verbose=2
    )

    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Fold {fold_idx} accuracy: {acc:.4f}")
    accs.append(acc)


=== Fold 1 ===
Epoch 1/20
246/246 - 2s - 7ms/step - accuracy: 0.6525 - loss: 1.3702 - val_accuracy: 0.7620 - val_loss: 0.7014
Epoch 2/20
246/246 - 1s - 4ms/step - accuracy: 0.7443 - loss: 0.7804 - val_accuracy: 0.7757 - val_loss: 0.5785
Epoch 3/20
246/246 - 1s - 4ms/step - accuracy: 0.7826 - loss: 0.6500 - val_accuracy: 0.8078 - val_loss: 0.5767
Epoch 4/20
246/246 - 1s - 3ms/step - accuracy: 0.8016 - loss: 0.5830 - val_accuracy: 0.8318 - val_loss: 0.4637
Epoch 5/20
246/246 - 1s - 3ms/step - accuracy: 0.8195 - loss: 0.5322 - val_accuracy: 0.8398 - val_loss: 0.4731
Epoch 6/20
246/246 - 1s - 3ms/step - accuracy: 0.8285 - loss: 0.4998 - val_accuracy: 0.8558 - val_loss: 0.4594
Epoch 7/20
246/246 - 1s - 3ms/step - accuracy: 0.8433 - loss: 0.4664 - val_accuracy: 0.8764 - val_loss: 0.3842
Epoch 8/20
246/246 - 1s - 3ms/step - accuracy: 0.8475 - loss: 0.4482 - val_accuracy: 0.8444 - val_loss: 0.4000
Epoch 9/20
246/246 - 1s - 3ms/step - accuracy: 0.8517 - loss: 0.4315 - val_accuracy: 0.8719 - va

In [18]:
# --- Resultados ---
accs = np.array(accs)
print("\nAccuracies por fold:", accs)
print("Mean accuracy:", accs.mean(), "Std:", accs.std())



Accuracies por fold: [0.8901602  0.88443935 0.91867125 0.89919817 0.89805269 0.8613975
 0.91867125 0.89919817 0.8888889  0.87628865]
Mean accuracy: 0.8934966146945953 Std: 0.016715745784488416


In [19]:
# --- Entrenar modelo final en todo el set (opcional) ---
final_model = make_classifier_model(input_dim=features_scaled.shape[1], num_classes=NUM_CLASSES)
final_model.fit(features_scaled, labels, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2)
final_model.save("yamnet_transfer_classifier.h5")
print("Modelo final guardado: yamnet_transfer_classifier.h5")

Epoch 1/20
273/273 - 2s - 6ms/step - accuracy: 0.6626 - loss: 1.4167
Epoch 2/20
273/273 - 1s - 5ms/step - accuracy: 0.7615 - loss: 0.7261
Epoch 3/20
273/273 - 1s - 3ms/step - accuracy: 0.7951 - loss: 0.6119
Epoch 4/20
273/273 - 1s - 3ms/step - accuracy: 0.8155 - loss: 0.5517
Epoch 5/20
273/273 - 1s - 3ms/step - accuracy: 0.8317 - loss: 0.5109
Epoch 6/20
273/273 - 1s - 5ms/step - accuracy: 0.8453 - loss: 0.4635
Epoch 7/20
273/273 - 1s - 3ms/step - accuracy: 0.8518 - loss: 0.4409
Epoch 8/20
273/273 - 1s - 3ms/step - accuracy: 0.8605 - loss: 0.4260
Epoch 9/20
273/273 - 1s - 4ms/step - accuracy: 0.8653 - loss: 0.4031
Epoch 10/20
273/273 - 1s - 4ms/step - accuracy: 0.8675 - loss: 0.4039
Epoch 11/20
273/273 - 1s - 4ms/step - accuracy: 0.8810 - loss: 0.3578
Epoch 12/20
273/273 - 1s - 3ms/step - accuracy: 0.8804 - loss: 0.3626
Epoch 13/20
273/273 - 1s - 3ms/step - accuracy: 0.8873 - loss: 0.3483
Epoch 14/20
273/273 - 1s - 3ms/step - accuracy: 0.8929 - loss: 0.3267
Epoch 15/20
273/273 - 1s - 3m



Modelo final guardado: yamnet_transfer_classifier.h5
