In [211]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import os
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import joblib

In [296]:
SR = 22050
CLIP_DURATION_SEC = 2.0
TARGET_LENGTH_FRAMES = 87
N_MFCC = 40
N_CHROMA = 12
N_CONTRAST_BANDS = 6
N_TONNETZ = 6
TOTAL_FEATURES = N_MFCC + 1 + 1 + (N_CONTRAST_BANDS + 1) + N_CHROMA + N_TONNETZ # 40
print(f"Parâmetros Definidos: SR={SR}, TargetFrames={TARGET_LENGTH_FRAMES}, TotalFeatures={TOTAL_FEATURES}")


Parâmetros Definidos: SR=22050, TargetFrames=87, TotalFeatures=67


In [297]:
SCALER_PATH = 'final_scaler_3.save'
MODEL_GRU_PATH = 'best_model_gru_2.keras'
MODEL_LSTM_PATH = 'best_model_lstm_2.keras'

MODEL_GRU_TEST_PATH = 'saved_models/CNN_BiGRU/best_overall.keras'
MODEL_GRU_PATH = MODEL_GRU_TEST_PATH
MODEL_LSTM_TEST_PATH = 'saved_models/CNN_BiLSTM/best_overall.keras'
MODEL_LSTM_PATH = MODEL_LSTM_TEST_PATH

In [298]:

def pad_or_truncate_time(array_2d, target_length):
    current_length = array_2d.shape[0]
    n_features = array_2d.shape[1]
    if current_length == target_length: return array_2d
    if current_length < target_length:
        pad_width = target_length - current_length
        padded_array = np.pad(array_2d, ((0, pad_width), (0, 0)), mode='constant')
    else:
        padded_array = array_2d[:target_length, :]
    return padded_array

def load_and_process_features_for_clip(y_clip, sr=SR, n_mfcc=N_MFCC, n_chroma=N_CHROMA, n_contrast_bands=N_CONTRAST_BANDS, n_tonnetz=N_TONNETZ):
    try:
        if len(y_clip) < 44100:
            print(len(y_clip))
        if len(y_clip) < 2048: return None
        if len(y_clip) == 0: return None
        mfcc = librosa.feature.mfcc(y=y_clip, sr=sr, n_mfcc=n_mfcc)
        zcr = librosa.feature.zero_crossing_rate(y=y_clip)
        bw = librosa.feature.spectral_bandwidth(y=y_clip, sr=sr)
        contrast = librosa.feature.spectral_contrast(y=y_clip, sr=sr, n_bands=n_contrast_bands)
        chroma = librosa.feature.chroma_stft(y=y_clip, sr=sr, n_chroma=n_chroma)
        tonnetz = librosa.feature.tonnetz(y=y_clip, sr=sr)

        combined = np.hstack((mfcc.T, zcr.T, bw.T, contrast.T, chroma.T, tonnetz.T))
        return combined
    except Exception as e:
        print(f"Erro extraindo features do clipe: {e}")
        return None


In [299]:
print("\nCarregando scaler e modelos...")

if not os.path.exists(SCALER_PATH):
    raise FileNotFoundError(f"Arquivo do scaler não encontrado em: {SCALER_PATH}. Salve o scaler após o treino.")
try:
    scaler_x = joblib.load(SCALER_PATH)
except Exception as e:
    print(f"Erro ao carregar scaler: {e}")
    raise

if not os.path.exists(MODEL_GRU_PATH): raise FileNotFoundError(f"Modelo GRU não encontrado: {MODEL_GRU_PATH}")
if not os.path.exists(MODEL_LSTM_PATH): raise FileNotFoundError(f"Modelo LSTM não encontrado: {MODEL_LSTM_PATH}")
try:
    model_gru = keras.models.load_model(MODEL_GRU_PATH)
    print(f"Modelo GRU '{MODEL_GRU_PATH}' carregado.")
    model_lstm = keras.models.load_model(MODEL_LSTM_PATH)
    print(f"Modelo LSTM '{MODEL_LSTM_PATH}' carregado.")
except Exception as e:
    print(f"Erro ao carregar modelo Keras: {e}")
    raise

print("Scaler e modelos prontos.")



Carregando scaler e modelos...
Modelo GRU 'saved_models/CNN_BiGRU/best_overall.keras' carregado.
Modelo LSTM 'saved_models/CNN_BiLSTM/best_overall.keras' carregado.
Scaler e modelos prontos.


In [300]:

def process_audio_file(NEW_AUDIO_PATH):
    print("Carregando áudio...")
    try:
        y_full, sr_loaded = librosa.load(NEW_AUDIO_PATH, sr=SR)
        if len(y_full) < 2048: 
            return None # Ignora este arquivo
        # ---------------------------------------------

        if len(y_full) == 0: return None
        if sr_loaded != SR: print(f"Aviso: Áudio carregado com sr={sr_loaded}, esperado={SR}.")
        print(f"Áudio carregado: Duração={len(y_full)/SR:.2f} segundos")
    except Exception as e:
        print(f"Erro ao carregar áudio: {e}")
        exit()

    # Processamento por janela
    clip_duration_samples = int(CLIP_DURATION_SEC * SR)
    hop_samples = clip_duration_samples

    gru_predictions_over_time = []
    lstm_predictions_over_time = []
    timestamps = []
    num_segments = 0

    print(f"Analisando em segmentos de {CLIP_DURATION_SEC} segundos...")
    for start_sample in range(0, len(y_full) - clip_duration_samples + 1, hop_samples):
        end_sample = start_sample + clip_duration_samples
        y_clip = y_full[start_sample:end_sample]
        segment_start_time_sec = start_sample / SR

        try:
            combined_features = load_and_process_features_for_clip(y_clip)
            if combined_features is None: continue

            features_padded = pad_or_truncate_time(combined_features, TARGET_LENGTH_FRAMES)
            if features_padded.shape != (TARGET_LENGTH_FRAMES, TOTAL_FEATURES):
                 # print(f"Shape {features_padded.shape} != {(TARGET_LENGTH_FRAMES, TOTAL_FEATURES)} no seg {segment_start_time_sec:.1f}s. Pulando.")
                 continue

            features_reshaped = features_padded.reshape(-1, TOTAL_FEATURES)
            features_scaled = scaler_x.transform(features_reshaped)
            processed_clip = features_scaled.reshape(1, TARGET_LENGTH_FRAMES, TOTAL_FEATURES)
            processed_clip = processed_clip.astype(np.float32)
            pred_gru = model_gru.predict(processed_clip, verbose=0)[0]
            pred_lstm = model_lstm.predict(processed_clip, verbose=0)[0]
            gru_predictions_over_time.append(pred_gru)
            lstm_predictions_over_time.append(pred_lstm)
            timestamps.append(segment_start_time_sec)
            num_segments += 1

        except Exception as e:
            print(f"Erro processando segmento em {segment_start_time_sec:.1f}s: {e}")
            continue

    print(f"Processamento concluído. {num_segments} segmentos analisados.")
    return timestamps, gru_predictions_over_time, lstm_predictions_over_time


In [301]:
def plot_arousal_valence(timestamps, gru_predictions_over_time, lstm_predictions_over_time, NEW_AUDIO_PATH):
        gru_preds = np.array(gru_predictions_over_time)
        lstm_preds = np.array(lstm_predictions_over_time)
        timestamps_np = np.array(timestamps)
        median_arousal_gru = np.median(gru_preds[:, 0])
        median_valence_gru = np.median(gru_preds[:, 1])
        median_arousal_lstm = np.median(lstm_preds[:, 0])
        median_valence_lstm = np.median(lstm_preds[:, 1])

        print("\n--- Emoção Geral Média Estimada ---")
        print(f"GRU  (Escala [-1, 1]) -> Arousal: {median_arousal_gru:.4f}, Valence: {median_valence_gru:.4f}")
        print(f"LSTM (Escala [-1, 1]) -> Arousal: {median_arousal_lstm:.4f}, Valence: {median_valence_lstm:.4f}")
        print("-" * 20)

        # Plotar Gráfico Temporal [-1, 1]
        print("\nGerando gráfico temporal das previsões [-1, 1]...")
        plt.figure(figsize=(12, 7))
        plt.subplot(2, 1, 1)
        plt.plot(timestamps_np, gru_preds[:, 0], marker='.', linestyle='-', markersize=4, label='Arousal GRU', color='red', alpha=0.8)
        plt.plot(timestamps_np, lstm_preds[:, 0], marker='x', linestyle='--', markersize=4, label='Arousal LSTM', color='orange', alpha=0.8)
        plt.axhline(0, color='gray', linestyle='--', linewidth=0.8)
        plt.ylabel("Arousal [-1 a 1]")
        plt.title(f"Evolução Temporal de Arousal e Valence - {os.path.basename(NEW_AUDIO_PATH)}")
        plt.legend(loc='upper right')
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.ylim(-1.1, 1.1)
        plt.subplot(2, 1, 2)
        plt.plot(timestamps_np, gru_preds[:, 1], marker='.', linestyle='-', markersize=4, label='Valence GRU', color='blue', alpha=0.8)
        plt.plot(timestamps_np, lstm_preds[:, 1], marker='x', linestyle='--', markersize=4, label='Valence LSTM', color='cyan', alpha=0.8)
        plt.axhline(0, color='gray', linestyle='--', linewidth=0.8)
        plt.ylabel("Valence [-1 a 1]")
        plt.xlabel("Tempo (Início do Segmento em segundos)")
        plt.legend(loc='upper right')
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.ylim(-1.1, 1.1)
        plt.tight_layout(pad=2.0)
        plt.show()

        print("\nGerando gráfico da Emoção Geral Mediana Estimada [-1, 1]...")
        plt.figure(figsize=(7, 7))
        plt.axhline(0, color='gray', linestyle='-', linewidth=0.8)
        plt.axvline(0, color='gray', linestyle='-', linewidth=0.8)
        plt.scatter([median_valence_gru], [median_arousal_gru], color='green', marker='o', s=120, label=f'Média GRU\n(A:{median_arousal_gru:.2f}, V:{median_valence_gru:.2f})', zorder=5)
        plt.scatter([median_valence_lstm], [median_arousal_lstm], color='purple', marker='s', s=120, label=f'Média LSTM\n(A:{median_arousal_lstm:.2f}, V:{median_valence_lstm:.2f})', zorder=5)
        plt.text(0.5, 0.5, 'Alegria /\nExcitação', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
        plt.text(-0.5, 0.5, 'Raiva /\nAngústia', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
        plt.text(-0.5, -0.5, 'Tristeza /\nTédio', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
        plt.text(0.5, -0.5, 'Calma /\nContentamento', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
        plt.title(f"Mediana Estimada - {os.path.basename(NEW_AUDIO_PATH)}")
        plt.xlabel("Valência [0 (Neg) a 1 (Pos)]")
        plt.ylabel("Arousal [0 (Calmo) a 1 (Excitado)]")
        plt.xlim(-1, 1)
        plt.ylim(-1, 1)
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.gca().set_aspect('equal', adjustable='box')
        plt.legend(fontsize=9)
        plt.show()

In [302]:

def calculate_quadrant_percentage(arousal, valence):
    total_segments = len(arousal)
    if total_segments == 0:
        return [0, 0, 0, 0]

    count_alegre = np.sum((valence > 0) & (arousal > 0))
    count_calmo = np.sum((valence > 0) & (arousal <= 0))
    count_triste = np.sum((valence <= 0) & (arousal <= 0))
    count_raiva = np.sum((valence <= 0) & (arousal > 0))

    perc_alegre = count_alegre / total_segments
    perc_calmo = count_calmo / total_segments
    perc_triste = count_triste / total_segments
    perc_raiva = count_raiva / total_segments

    return [perc_raiva, perc_alegre, perc_triste, perc_calmo]

def plot_point_arousal_valence(arousal_gru, valence_gru, arousal_lstm, valence_lstm, music_name="Música"):
    # plotar todos os pontos e a mediana em destaque
    plt.figure(figsize=(7, 7))
    plt.axhline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.axvline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.scatter(valence_gru, arousal_gru, color='green', marker='o', s=30, alpha=0.5, label='GRU Pontos', zorder=3)
    plt.scatter(valence_lstm, arousal_lstm, color='purple', marker='s', s=30, alpha=0.5, label='LSTM Pontos', zorder=3)
    median_arousal_gru = np.median(arousal_gru)
    median_valence_gru = np.median(valence_gru)
    median_arousal_lstm = np.median(arousal_lstm)
    median_valence_lstm = np.median(valence_lstm)
    plt.scatter([median_valence_gru], [median_arousal_gru], color='darkgreen', marker='o', s=150, label=f'GRU Mediana\n(A:{median_arousal_gru:.2f}, V:{median_valence_gru:.2f})', zorder=5)
    plt.scatter([median_valence_lstm], [median_arousal_lstm], color='indigo', marker='s', s=150, label=f'LSTM Mediana\n(A:{median_arousal_lstm:.2f}, V:{median_valence_lstm:.2f})', zorder=5)
    plt.text(0.5, 0.5, 'Alegria /\nExcitação', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
    plt.text(-0.5, 0.5, 'Raiva /\nAngústia', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
    plt.text(-0.5, -0.5, 'Tristeza /\nTédio', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
    plt.text(0.5, -0.5, 'Calma /\nContentamento', horizontalalignment='center', verticalalignment='center', fontsize=11, alpha=0.7)
    plt.title(f"Distribuição de Arousal e Valence - {music_name}")
    plt.xlabel("Valência [-1 (Neg) a 1 (Pos)]")
    plt.ylabel("Arousal [-1 (Calmo) a 1 (Excitado)]")
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.legend(fontsize=9)
    plt.show()
    # plotar separadamente os modelos
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    plt.axhline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.axvline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.scatter(valence_gru, arousal_gru, color='green', marker='o', s=30, alpha=0.5, label='GRU Pontos', zorder=3)
    plt.scatter([median_valence_gru], [median_arousal_gru], color='darkgreen', marker='o', s=150, label=f'GRU Mediana\n(A:{median_arousal_gru:.2f}, V:{median_valence_gru:.2f})', zorder=5)
    plt.title(f"Distribuição de Arousal e Valence - GRU - {music_name}")
    plt.xlabel("Valência [-1 (Neg) a 1 (Pos)]")
    plt.ylabel("Arousal [-1 (Calmo) a 1 (Excitado)]")
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.legend(fontsize=9)
    plt.subplot(1, 2, 2)
    plt.axhline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.axvline(0, color='gray', linestyle='-', linewidth=0.8)
    plt.scatter(valence_lstm, arousal_lstm, color='purple', marker='s', s=30, alpha=0.5, label='LSTM Pontos', zorder=3)
    plt.scatter([median_valence_lstm], [median_arousal_lstm], color='indigo', marker='s', s=150, label=f'LSTM Mediana\n(A:{median_arousal_lstm:.2f}, V:{median_valence_lstm:.2f})', zorder=5)
    plt.title(f"Distribuição de Arousal e Valence - LSTM - {music_name}")
    plt.xlabel("Valência [-1 (Neg) a 1 (Pos)]")
    plt.ylabel("Arousal [-1 (Calmo) a 1 (Excitado)]")
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.legend(fontsize=9)
    plt.show()

def plot_combined_radar_by_percentage(arousal_gru, valence_gru, arousal_lstm, valence_lstm, music_name="Música"):
    valores_gru = calculate_quadrant_percentage(arousal_gru, valence_gru)
    valores_lstm = calculate_quadrant_percentage(arousal_lstm, valence_lstm)
    
    categorias = ["Raiva", "Alegre", "Triste", "Calmo"]
    N = len(categorias)
    
    valores_gru += valores_gru[:1]
    valores_lstm += valores_lstm[:1]

    angulos = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
    angulos += angulos[:1]
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    line_gru, = ax.plot(angulos, valores_gru, linewidth=2, linestyle="solid", color="green", label="GRU")
    ax.fill(angulos, valores_gru, alpha=0.25, color="green")

    # Plotar LSTM
    line_lstm, = ax.plot(angulos, valores_lstm, linewidth=2, linestyle="dashed", color="purple", label="LSTM")
    ax.fill(angulos, valores_lstm, alpha=0.25, color="purple")

    ax.set_xticks(angulos[:-1])
    ax.set_xticklabels(categorias, fontsize=12, fontweight="bold")

    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1) # Sentido horário

    ax.set_yticks(np.linspace(0, 1, 5))
    ax.set_yticklabels(["0%", "25%", "50%", "75%", "100%"])
    ax.set_rlabel_position(0)
    # Display percentage as legend

    legenda_gru = "GRU - " + ", ".join([
        f"{cat}: {valores_gru[i]*100:.1f}%" for i, cat in enumerate(categorias)
    ])
    legenda_lstm = "LSTM - " + ", ".join([
        f"{cat}: {valores_lstm[i]*100:.1f}%" for i, cat in enumerate(categorias)
    ])

    # ax.legend([legenda_gru, legenda_lstm], loc='upper right', bbox_to_anchor=(1.6, 1.1))
    ax.legend(
        [line_gru, line_lstm],
        [legenda_gru, legenda_lstm],
        loc='upper right',
        bbox_to_anchor=(1.6, 1.1),
        fontsize=10,
        frameon=True
    )
    ax.set_title(f"Distribuição de Tempo por Quadrante - {music_name}", fontsize=16, pad=20)

    plt.show()

In [None]:
SR = 22050
CLIP_DURATION_SEC = 2.0
TARGET_LENGTH_FRAMES = 87
N_MFCC = 40
N_CHROMA = 12
N_CONTRAST_BANDS = 6
N_TONNETZ = 6
TOTAL_FEATURES = N_MFCC + 1 + 1 + (N_CONTRAST_BANDS + 1) + N_CHROMA + N_TONNETZ # 40
print(f"Parâmetros Definidos: SR={SR}, TargetFrames={TARGET_LENGTH_FRAMES}, TotalFeatures={TOTAL_FEATURES}")
settings = {
    '2S': {
        'SR': 22050,
        'CLIP_DURATION_SEC': 2.0,
        'TARGET_LENGTH_FRAMES': 87,
        'N_MFCC': 40,
        'N_CHROMA': 12,
        'N_CONTRAST_BANDS': 6,
        'N_TONNETZ': 6,
        'TOTAL_FEATURES': 67,
        'SCALER_PATH': 'final_scaler_2.save',
        'MODEL_GRU_PATH': 'saved_models/CNN_BiGRU/best_overall.keras',
        'MODEL_LSTM_PATH': 'saved_models/CNN_BiLSTM/best_overall.keras'
    },
    '5S': {
        'SR': 22050,
        'CLIP_DURATION_SEC': 5.0,
        'TARGET_LENGTH_FRAMES': 217,
        'N_MFCC': 40,
        'N_CHROMA': 12,
        'N_CONTRAST_BANDS': 6,
        'N_TONNETZ': 6,
        'TOTAL_FEATURES': 67,
        'SCALER_PATH': 'final_scaler_3.save',
        'MODEL_GRU_PATH': 'saved_models_5s/CNN_BiGRU/best_overall.keras',
        'MODEL_LSTM_PATH': 'saved_models_5s/CNN_BiLSTM/best_overall.keras'
    }
}
setting_used = '2S'  # Alterar para '5S' se necessário
SR = settings[setting_used]['SR']
CLIP_DURATION_SEC = settings[setting_used]['CLIP_DURATION_SEC']
TARGET_LENGTH_FRAMES = settings[setting_used]['TARGET_LENGTH_FRAMES']
N_MFCC = settings[setting_used]['N_MFCC']
N_CHROMA = settings[setting_used]['N_CHROMA']
N_CONTRAST_BANDS = settings[setting_used]['N_CONTRAST_BANDS']
N_TONNETZ = settings[setting_used]['N_TONNETZ']
TOTAL_FEATURES = settings[setting_used]['TOTAL_FEATURES']

print(f"Parâmetros Definidos: SR={SR}, TargetFrames={TARGET_LENGTH_FRAMES}, TotalFeatures={TOTAL_FEATURES}")
SCALER_PATH = settings[setting_used]['SCALER_PATH']
MODEL_GRU_PATH = settings[setting_used]['MODEL_GRU_PATH']
MODEL_LSTM_PATH = settings[setting_used]['MODEL_LSTM_PATH']


Parâmetros Definidos: SR=22050, TargetFrames=87, TotalFeatures=67
Parâmetros Definidos: SR=22050, TargetFrames=217, TotalFeatures=67


In [None]:
NEW_AUDIO_PATH = 'test/goodbye_my_lover.mp3'
MUSIC_NAME = NEW_AUDIO_PATH.split('/')[-1].rsplit('.', 1)[0]
timestamps, gru_predictions_over_time, lstm_predictions_over_time = process_audio_file(NEW_AUDIO_PATH)
if not timestamps:
        print("Nenhuma previsão foi gerada para a música.")
else:
        # plot_arousal_valence(timestamps, gru_predictions_over_time, lstm_predictions_over_time, NEW_AUDIO_PATH)
        gru_preds = np.array(gru_predictions_over_time)
        lstm_preds = np.array(lstm_predictions_over_time)
        arousal_gru = gru_preds[:, 0]
        valence_gru = gru_preds[:, 1]

        arousal_lstm = lstm_preds[:, 0]
        valence_lstm = lstm_preds[:, 1]
        plot_arousal_valence(timestamps, gru_predictions_over_time, lstm_predictions_over_time, NEW_AUDIO_PATH)
        plot_point_arousal_valence(arousal_gru, valence_gru, arousal_lstm, valence_lstm, music_name=MUSIC_NAME)
        plot_combined_radar_by_percentage(arousal_gru, valence_gru, arousal_lstm, valence_lstm, music_name=MUSIC_NAME)

Carregando áudio...
Áudio carregado: Duração=234.77 segundos
Analisando em segmentos de 5.0 segundos...
