In [15]:
import keras_tuner as kt
import numpy as np
import librosa
import librosa.util
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

In [22]:

DATASET_ROOT = r"C:/Users/david/Downloads/16000_pcm_speeches"
SPEAKERS = [
    "Benjamin_Netanyau",
    "Jens_Stoltenberg",
    "Julia_Gillard",
    "Magaret_Tarcher",
    "Nelson_Mandela",
]
CLASS_PATHS = [os.path.join(DATASET_ROOT, s) for s in SPEAKERS]

# Log-Mel parámetros (fijos para dar tamaño constante)
TARGET_SR   = 16000     # resample a 16 kHz
N_FFT       = 1024
HOP_LENGTH  = 256
N_MELS      = 64        # filas
N_FRAMES    = 64        # columnas (lo fijamos)
EPS         = 1e-10     # para evitar log(0)






In [24]:

def parse_dataset_mel(paths):
    X, y = [], []
    for label_idx, folder in enumerate(paths):
        print(f"[+] Parsing {folder} ...")
        for fname in os.listdir(folder):
            if not fname.lower().endswith(".wav"):
                continue
            fpath = os.path.join(folder, fname)

            wav, sr = librosa.load(fpath, sr=TARGET_SR, mono=True)

            mel = librosa.feature.melspectrogram(
                y=wav, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS
            )
            logmel = librosa.power_to_db(mel + EPS, ref=np.max)

           
            logmel_fixed = librosa.util.fix_length(logmel, size=N_FRAMES, axis=1)

            X.append(logmel_fixed.astype(np.float32))
            y.append(label_idx)
    return np.array(X), np.array(y)



In [25]:
X, y = parse_dataset_mel(CLASS_PATHS)
num_classes = len(SPEAKERS)

print("X shape (n, mels, frames):", X.shape)   # (N, 64, 64)
print("y shape:", y.shape)



[+] Parsing C:/Users/david/Downloads/16000_pcm_speeches\Benjamin_Netanyau ...
[+] Parsing C:/Users/david/Downloads/16000_pcm_speeches\Jens_Stoltenberg ...
[+] Parsing C:/Users/david/Downloads/16000_pcm_speeches\Julia_Gillard ...
[+] Parsing C:/Users/david/Downloads/16000_pcm_speeches\Magaret_Tarcher ...
[+] Parsing C:/Users/david/Downloads/16000_pcm_speeches\Nelson_Mandela ...
X shape (n, mels, frames): (7501, 64, 64)
y shape: (7501,)


In [26]:

X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.30, random_state=27, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)
# Proporciones: 70% train, 15% val, 15% test

X_train_flat = X_train.reshape(len(X_train), -1)
X_val_flat   = X_val.reshape(len(X_val), -1)
X_test_flat  = X_test.reshape(len(X_test), -1)

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_flat)
X_val_sc   = scaler.transform(X_val_flat)
X_test_sc  = scaler.transform(X_test_flat)

y_train_cat = to_categorical(y_train, num_classes)
y_val_cat   = to_categorical(y_val, num_classes)
y_test_cat  = to_categorical(y_test, num_classes)

input_dim = X_train_sc.shape[1]  # 4096
print("input_dim:", input_dim)



input_dim: 4096


In [27]:

def build_mlp(hp):
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(input_dim,)))

    # Capa oculta 1
    units1 = hp.Int("units1", min_value=64, max_value=512, step=64)
    model.add(keras.layers.Dense(units1, activation="relu"))
    drop1 = hp.Float("dropout1", min_value=0.0, max_value=0.6, step=0.1)
    model.add(keras.layers.Dropout(drop1))

    #  segunda capa
    if hp.Boolean("add_layer2"):
        units2 = hp.Int("units2", min_value=64, max_value=512, step=64)
        model.add(keras.layers.Dense(units2, activation="relu"))
        drop2 = hp.Float("dropout2", min_value=0.0, max_value=0.6, step=0.1)
        model.add(keras.layers.Dropout(drop2))

    # Capa de salida
    model.add(keras.layers.Dense(num_classes, activation="softmax"))

    # LR a tunear
    lr = hp.Choice("lr", values=[1e-2, 5e-3, 1e-3, 5e-4, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model



In [31]:

tuner = kt.Hyperband(
    build_mlp,
    objective="val_accuracy",
    max_epochs=10,     
    factor=3,
    directory="nada",
    project_name="clasificaciondesonid0"
)

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)

tuner.search(
    X_train_sc, y_train_cat,
    validation_data=(X_val_sc, y_val_cat),
    epochs=10,
    callbacks=[early_stop],
    verbose=1
)


Trial 30 Complete [00h 00m 37s]
val_accuracy: 0.9902222156524658

Best val_accuracy So Far: 0.9919999837875366
Total elapsed time: 00h 11m 53s


In [32]:

best_hps = tuner.get_best_hyperparameters(1)[0]
print("Mejores HPs:", best_hps.values)


Mejores HPs: {'units1': 64, 'dropout1': 0.2, 'add_layer2': False, 'lr': 0.0001, 'units2': 512, 'dropout2': 0.2, 'tuner/epochs': 10, 'tuner/initial_epoch': 4, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0013'}


In [33]:
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(
    X_train_sc, y_train_cat,
    validation_data=(X_val_sc, y_val_cat),
    epochs=20,
    callbacks=[early_stop],
    verbose=1
)



Epoch 1/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.7503 - loss: 0.6759 - val_accuracy: 0.8898 - val_loss: 0.3176
Epoch 2/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9137 - loss: 0.2699 - val_accuracy: 0.9511 - val_loss: 0.1804
Epoch 3/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9505 - loss: 0.1686 - val_accuracy: 0.9769 - val_loss: 0.1200
Epoch 4/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9699 - loss: 0.1215 - val_accuracy: 0.9804 - val_loss: 0.0963
Epoch 5/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9752 - loss: 0.0985 - val_accuracy: 0.9742 - val_loss: 0.1074
Epoch 6/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9823 - loss: 0.0777 - val_accuracy: 0.9884 - val_loss: 0.0665
Epoch 7/20
[1m165/165

In [34]:

test_loss, test_acc = best_model.evaluate(X_test_sc, y_test_cat, verbose=0)
print(f"Test accuracy: {test_acc:.4f}")

y_pred = best_model.predict(X_test_sc).argmax(axis=1)



Test accuracy: 0.9893
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [42]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred, target_names=SPEAKERS))

                   precision    recall  f1-score   support

Benjamin_Netanyau       0.97      1.00      0.99       225
 Jens_Stoltenberg       0.99      0.99      0.99       225
    Julia_Gillard       1.00      0.96      0.98       226
  Magaret_Tarcher       1.00      1.00      1.00       225
   Nelson_Mandela       0.99      1.00      1.00       225

         accuracy                           0.99      1126
        macro avg       0.99      0.99      0.99      1126
     weighted avg       0.99      0.99      0.99      1126



In [43]:
print(confusion_matrix(y_test, y_pred))


[[225   0   0   0   0]
 [  1 223   0   1   0]
 [  5   3 216   0   2]
 [  0   0   0 225   0]
 [  0   0   0   0 225]]
