In [None]:
!pip install resampy



In [1]:
# ATENCIÓN!! Modificar ruta relativa a la práctica si es distinta (drive_root)
mount='/content/gdrive'
drive_root = mount + "/My Drive/TFM"

try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

In [2]:

# Switch to the directory on the Google Drive that you want to use
import os
if IN_COLAB:
  print("We're running Colab")

  if IN_COLAB:
    # Mount the Google Drive at mount
    print("Colab: mounting Google drive on ", mount)

    drive.mount(mount)

    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
      print("\nColab: making sure ", drive_root, " exists.")
      os.makedirs(drive_root, exist_ok=True)

    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    %cd $drive_root
# Verify we're in the correct working directory
%pwd
print("Archivos en el directorio: ")
print(os.listdir())


We're running Colab
Colab: mounting Google drive on  /content/gdrive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

Colab: making sure  /content/gdrive/My Drive/TFM  exists.

Colab: Changing directory to  /content/gdrive/My Drive/TFM
/content/gdrive/My Drive/TFM
Archivos en el directorio: 
['Dataset_TFM.zip', 'Dataset_TFM_recortado.zip', 'TFM_V0.ipynb', 'TFM_V1.ipynb', 'Dataset']


### Utilizando Data Augmentation para aumentar la muestra

In [3]:
!pip install audiomentations



In [4]:
import os
import librosa
import numpy as np
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch
import soundfile as sf  # Para guardar archivos WAV

# Configuración
INPUT_FOLDER = "Dataset"
OUTPUT_FOLDER = "Dataset_Augmented"
AUGMENTATIONS_PER_FILE = 2  # Número de aumentos por audio

# Transformaciones (personaliza parámetros)
augmenter = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    PitchShift(min_semitones=-3, max_semitones=3, p=0.7),
    TimeStretch(min_rate=0.85, max_rate=1.15, p=0.5),
])

# Crear carpeta de salida si no existe
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Procesar cada subcarpeta (emociones)
for emotion in os.listdir(INPUT_FOLDER):
    emotion_path = os.path.join(INPUT_FOLDER, emotion)
    output_emotion_path = os.path.join(OUTPUT_FOLDER, emotion)
    os.makedirs(output_emotion_path, exist_ok=True)

    print(f"Procesando: {emotion}...")

    for audio_file in os.listdir(emotion_path):
        if not audio_file.endswith(".wav"):
            continue

        # Cargar audio
        input_path = os.path.join(emotion_path, audio_file)
        audio, sr = librosa.load(input_path, sr=None)  # sr=None mantiene la tasa de muestreo original

        # Guardar el audio original (opcional)
        output_path = os.path.join(output_emotion_path, audio_file)
        sf.write(output_path, audio, sr)

        # Generar audios aumentados
        for i in range(AUGMENTATIONS_PER_FILE):
            augmented_audio = augmenter(samples=audio, sample_rate=sr)
            output_aug_path = os.path.join(
                output_emotion_path,
                f"{os.path.splitext(audio_file)[0]}_aug{i+1}.wav"
            )
            sf.write(output_aug_path, augmented_audio, sr)

print("¡Data augmentation completado! Verifica la carpeta:", OUTPUT_FOLDER)

Procesando: Alegria...
Procesando: Enojo...
Procesando: Miedo...
Procesando: Tristeza...
Procesando: Neutral...
Procesando: Sorpresa...
¡Data augmentation completado! Verifica la carpeta: Dataset_Augmented


In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Configuración
emotion_labels = ['Alegria', 'Enojo', 'Miedo', 'Neutral', 'Sorpresa', 'Tristeza']
dataset_path = 'Dataset_Augmented/'
sample_rate = 22050
max_duration = 3  # segundos
n_mfcc = 40

def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=sample_rate, duration=max_duration)
    if sr != sample_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfcc.T, axis=0)  # resultado: (40,)
'''
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=sample_rate, duration=max_duration)
    if sr != sample_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=1)  # 1 sola dimensión por frame
    return mfcc.T  # Resultado: (tiempo, 1) → típicamente (1024, 1)

max_frames = 1024
n_mfcc = 1

def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=22050, duration=3.0)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).T  # → (T, 1)

    # Padding o truncado para que todas tengan exactamente 1024 frames
    if mfcc.shape[0] < max_frames:
        padding = np.zeros((max_frames - mfcc.shape[0], n_mfcc))
        mfcc = np.vstack((mfcc, padding))
    else:
        mfcc = mfcc[:max_frames, :]

    return mfcc  # forma final: (1024, 1)
'''


X, y = [], []
for label in emotion_labels:
    folder_path = os.path.join(dataset_path, label)
    for file in os.listdir(folder_path):
        if file.endswith('.wav'):
            features = extract_features(os.path.join(folder_path, file))
            X.append(features)
            y.append(label)

X = np.array(X)
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


FileNotFoundError: [Errno 2] No such file or directory: 'Dataset_TFM_recortado/Alegria'

In [None]:
print(X_train.shape)

(1169, 1024, 1)


### Codigo ideal con YAMNet

In [None]:
#!pip install librosa tensorflow tensorflow_hub scikit-learn soundfile

In [None]:

# ✅ Requiere instalación previa:
# pip install librosa tensorflow tensorflow_hub scikit-learn soundfile

import os
import numpy as np
import librosa
import tensorflow_hub as hub
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# --- Configuración ---
DATASET_PATH = 'Dataset_TFM_recortado'  # Estructura: dataset/Alegria/, dataset/Enojo/, etc.
EMOTIONS = ['Alegria', 'Enojo', 'Miedo', 'Neutral', 'Sorpresa', 'Tristeza']
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'

# --- Cargar modelo YAMNet ---
print("Cargando modelo YAMNet...")
yamnet_model = hub.load(yamnet_model_handle)

# --- Función para extraer embeddings ---
def extract_yamnet_embedding(file_path):
    try:
        waveform, sr = librosa.load(file_path, sr=16000)
        scores, embeddings, spectrogram = yamnet_model(waveform)
        return np.mean(embeddings.numpy(), axis=0)
    except Exception as e:
        print(f"Error en {file_path}: {e}")
        return None

# --- Extraer características ---
X = []
y = []

print("Extrayendo embeddings de YAMNet...")
for emotion in EMOTIONS:
    emotion_path = os.path.join(DATASET_PATH, emotion)
    for fname in os.listdir(emotion_path):
        if fname.endswith(".wav"):
            file_path = os.path.join(emotion_path, fname)
            embedding = extract_yamnet_embedding(file_path)
            if embedding is not None:
                X.append(embedding)
                y.append(emotion)

X = np.array(X)
y = LabelEncoder().fit_transform(y)

# --- División entrenamiento/prueba ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Entrenamiento con Random Forest ---
print("Entrenando clasificador Random Forest...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# --- Evaluación ---
y_pred = clf.predict(X_test)
print("\n--- Reporte de clasificación ---")
print(classification_report(y_test, y_pred, target_names=EMOTIONS))

print("\n--- Matriz de confusión ---")
print(confusion_matrix(y_test, y_pred))

# --- Validación cruzada (opcional) ---
# scores = cross_val_score(clf, X, y, cv=5)
# print(f"Precisión media (cross-validation): {np.mean(scores):.2f}")

# --- Guardar modelo entrenado ---
joblib.dump(clf, "modelo_emociones_yamnet.pkl")
print("✅ Modelo guardado como modelo_emociones_yamnet.pkl")


Cargando modelo YAMNet...
Extrayendo embeddings de YAMNet...
Entrenando clasificador Random Forest...

--- Reporte de clasificación ---
              precision    recall  f1-score   support

     Alegria       0.43      0.43      0.43        46
       Enojo       0.39      0.40      0.39        48
       Miedo       0.35      0.34      0.35        50
     Neutral       0.22      0.26      0.24        50
    Sorpresa       0.50      0.38      0.43        48
    Tristeza       0.36      0.37      0.37        51

    accuracy                           0.36       293
   macro avg       0.37      0.36      0.37       293
weighted avg       0.37      0.36      0.37       293


--- Matriz de confusión ---
[[20 11  1 10  3  1]
 [ 6 19  7  4  8  4]
 [ 3  7 17  9  0 14]
 [ 8  4  7 13  5 13]
 [ 9  7  3  9 18  2]
 [ 1  1 13 15  2 19]]
✅ Modelo guardado como modelo_emociones_yamnet.pkl


### Modelo 1: CNN simple

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization

X_train_cnn = X_train[..., np.newaxis]
X_test_cnn = X_test[..., np.newaxis]

model_cnn = Sequential([
    Conv1D(64, 5, activation='relu', input_shape=(n_mfcc, 1)),
    MaxPooling1D(2),
    BatchNormalization(),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(emotion_labels), activation='softmax')
])

model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, validation_data=(X_test_cnn, y_test))

# --- Predicciones ---
y_pred_probs = model_cnn.predict(X_test_cnn)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Matriz de confusión ---
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=emotion_labels,
            yticklabels=emotion_labels)
plt.xlabel('Predicción')
plt.ylabel('Etiqueta real')
plt.title('Matriz de Confusión - CNN Simple')
plt.tight_layout()
plt.show()

# --- Reporte de clasificación ---
print("\n📋 Clasificación por clase:")
print(classification_report(y_test, y_pred, target_names=emotion_labels))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 896, but received input with shape (None, 32384)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 1024, 1), dtype=float32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [None]:
print(X_train.shape)        # debería ser (N, 40)
print(X_train_cnn.shape)    # debería ser (N, 40, 1)

(1169, 40)
(1169, 40, 1)


### Modelo 2: LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Reshape

X_train_lstm = X_train.reshape(-1, 10, 4)
X_test_lstm = X_test.reshape(-1, 10, 4)

model_lstm = Sequential([
    LSTM(64, return_sequences=False, input_shape=(10, 4)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(len(emotion_labels), activation='softmax')
])

model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=32, validation_data=(X_test_lstm, y_test))


Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.1747 - loss: 1.8138 - val_accuracy: 0.1706 - val_loss: 1.7857
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1932 - loss: 1.7802 - val_accuracy: 0.1775 - val_loss: 1.7827
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2160 - loss: 1.7771 - val_accuracy: 0.1809 - val_loss: 1.7853
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2222 - loss: 1.7534 - val_accuracy: 0.1775 - val_loss: 1.7763
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2529 - loss: 1.7330 - val_accuracy: 0.1809 - val_loss: 1.7687
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2870 - loss: 1.7213 - val_accuracy: 0.1775 - val_loss: 1.7693
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x21513d3db20>

### Modelo 3: CNN + LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout

X_train_cnn_lstm = X_train.reshape(-1, 10, 4)
X_test_cnn_lstm = X_test.reshape(-1, 10, 4)

model_cnn_lstm = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(10, 4)),
    MaxPooling1D(2),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(len(emotion_labels), activation='softmax')
])

model_cnn_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cnn_lstm.fit(X_train_cnn_lstm, y_train, epochs=50, batch_size=32, validation_data=(X_test_cnn_lstm, y_test))


Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.1617 - loss: 1.8324 - val_accuracy: 0.1911 - val_loss: 1.7987
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2062 - loss: 1.7800 - val_accuracy: 0.1843 - val_loss: 1.7995
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1777 - loss: 1.7822 - val_accuracy: 0.1536 - val_loss: 1.7891
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2482 - loss: 1.7361 - val_accuracy: 0.2014 - val_loss: 1.7836
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2634 - loss: 1.7127 - val_accuracy: 0.1809 - val_loss: 1.7841
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2578 - loss: 1.6972 - val_accuracy: 0.1945 - val_loss: 1.7801
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x215092aab10>

### Modelo 4: Transfer Learning con YAMNet (TensorFlow Hub)

In [None]:
import tensorflow_hub as hub

yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

def extract_yamnet_embeddings(file_path):
    waveform, sr = librosa.load(file_path, sr=16000)
    scores, embeddings, _ = yamnet_model(waveform)
    return np.mean(embeddings.numpy(), axis=0)

X_yam, y_yam = [], []
for label in emotion_labels:
    folder_path = os.path.join(dataset_path, label)
    for file in os.listdir(folder_path):
        if file.endswith('.wav'):
            emb = extract_yamnet_embeddings(os.path.join(folder_path, file))
            X_yam.append(emb)
            y_yam.append(label)

X_yam = np.array(X_yam)
y_yam = LabelEncoder().fit_transform(y_yam)
X_train_yam, X_test_yam, y_train_yam, y_test_yam = train_test_split(X_yam, y_yam, test_size=0.2)

# Clasificador final
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_yam, y_train_yam)
print("Precisión:", rf_model.score(X_test_yam, y_test_yam))


Precisión: 0.310580204778157


In [None]:
results = {
    "CNN": model_cnn.evaluate(X_test_cnn, y_test, verbose=0)[1],
    "LSTM": model_lstm.evaluate(X_test_lstm, y_test, verbose=0)[1],
    "CNN+LSTM": model_cl.evaluate(X_test_cl, y_test, verbose=0)[1],
    "YAMNet": rf.score(X_test_yam, y_test_yam)
}

for model, acc in results.items():
    print(f"{model}: {acc*100:.2f}%")


NameError: name 'model_cl' is not defined