In [4]:
import os
import numpy as np
import librosa
import tensorflow as tf

from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import Sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras import callbacks, models

In [None]:
from google.colab import drive

drive.mount('/content/drive')

zip_drive_path = '/content/drive/MyDrive/mixed_up_data_talk_segmented.zip'

zip_local_path = '/content/mixed_up_data_talk_segmented.zip'

destination_folder = '/content/datasets/'
import os
os.makedirs(destination_folder, exist_ok=True)

import zipfile
with zipfile.ZipFile(zip_local_path, 'r') as zip_ref:
    zip_ref.extractall(destination_folder)

In [None]:
DATA_DIR = "/content/datasets/mixed_up_data_talk_segmented"

SR = 16000                # Frequenza di campionamento (Hz)
DURATION = 3.0            # Durata di ciascun clip in secondi
N_MELS = 128              # Numero di bande Mel
N_FFT = 512               # Dimensione finestra FFT
HOP_LENGTH = 160          # Hop di 10 ms (160 campioni a 16 kHz)
WIN_LENGTH = 400          # Window di 25 ms (400 campioni a 16 kHz)


TARGET_FRAMES = int(np.ceil((DURATION * SR - WIN_LENGTH) / HOP_LENGTH)) + 1

BATCH_SIZE = 32
EPOCHS = 50

def load_and_normalize(path, sr=SR, duration=DURATION):
    
    y, _ = librosa.load(path, sr=sr, duration=duration)
    required_length = int(sr * duration)
    if len(y) < required_length:
        y = np.pad(y, (0, required_length - len(y)))
    else:
        y = y[:required_length]
    max_val = np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1.0
    y = y / max_val
    return y

def compute_log_mel_spectrogram(
    y,
    sr=SR,
    n_mels=N_MELS,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    win_length=WIN_LENGTH
):

    S = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        power=2.0
    )

    log_mel_S = librosa.power_to_db(S, ref=np.max)

    if log_mel_S.shape[1] < TARGET_FRAMES:
        pad_width = TARGET_FRAMES - log_mel_S.shape[1]
        log_mel_S = np.pad(
            log_mel_S,
            ((0, 0), (0, pad_width)),
            mode='constant',
            constant_values=log_mel_S.min()
        )
    else:
        log_mel_S = log_mel_S[:, :TARGET_FRAMES]

    return log_mel_S

class AudioDataset(Sequence):
    def __init__(self, file_paths, labels, batch_size=BATCH_SIZE, is_training=True):
        """
        file_paths: lista di percorsi ai file audio
        labels: lista di label intere (0 o 1) corrispondenti ai file
        """
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size
        self.is_training = is_training
        self.indices = np.arange(len(file_paths))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def on_epoch_end(self):
        if self.is_training:
            np.random.shuffle(self.indices)

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        X_batch = []
        y_batch = []

        for i in batch_indices:
            file_path = self.file_paths[i]
            lbl = self.labels[i]
            y = load_and_normalize(file_path)

            logmel = compute_log_mel_spectrogram(y)

            mean = np.mean(logmel)
            std = np.std(logmel) if np.std(logmel) > 0 else 1.0
            logmel = (logmel - mean) / std

            logmel = logmel[..., np.newaxis]

            X_batch.append(logmel)
            y_batch.append(lbl)

        X_batch = np.array(X_batch, dtype=np.float32)
        y_batch = np.array(y_batch, dtype=np.int32)
        return X_batch, y_batch

def gather_file_paths_and_labels(base_dir):
    
    file_paths = []
    labels = []
    for label_dir, label_value in [("noisy", 0), ("music", 1)]:
        dir_path = os.path.join(base_dir, label_dir)
        if not os.path.isdir(dir_path):
            continue
        for fname in os.listdir(dir_path):
            if fname.lower().endswith((".wav", ".mp3", ".flac")):
                full_path = os.path.join(dir_path, fname)
                file_paths.append(full_path)
                labels.append(label_value)
    return file_paths, labels

all_files, all_labels = gather_file_paths_and_labels(DATA_DIR)

if len(all_files) == 0:
    raise ValueError(f"Nessun file trovato in {DATA_DIR}/noisy e {DATA_DIR}/music")

files_tmp, test_files, labels_tmp, test_labels = train_test_split(
    all_files, all_labels,
    test_size=0.15,
    stratify=all_labels,
    random_state=42
)

train_files, valid_files, train_labels, valid_labels = train_test_split(
    files_tmp, labels_tmp,
    test_size=0.15/0.85,
    stratify=labels_tmp,
    random_state=42
)

train_dataset = AudioDataset(train_files, train_labels, batch_size=BATCH_SIZE, is_training=True)
valid_dataset = AudioDataset(valid_files, valid_labels, batch_size=BATCH_SIZE, is_training=False)
test_dataset  = AudioDataset(test_files, test_labels, batch_size=BATCH_SIZE, is_training=False)

def build_custom_cnn_2d(input_shape=(N_MELS, TARGET_FRAMES, 1), num_classes=2):
    inputs = layers.Input(shape=input_shape)

    x = layers.Conv2D(32, (3, 3), padding='same', activation=None)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool2D((2, 2))(x)

    x = layers.Conv2D(64, (3, 3), padding='same', activation=None)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool2D((2, 2))(x)

    x = layers.Conv2D(128, (3, 3), padding='same', activation=None)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool2D((2, 2))(x)

    x = layers.Conv2D(256, (3, 3), padding='same', activation=None)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool2D((2, 2))(x)

    x = layers.GlobalAveragePooling2D(name="global_average_pooling2d")(x)

    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
model = build_custom_cnn_2d()
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6
)
checkpoint = callbacks.ModelCheckpoint(
    "best_audio_classifier.h5",
    save_best_only=True,
    monitor='val_loss'
)

history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    callbacks=[early_stopping, reduce_lr, checkpoint]
)

test_loss, test_acc = model.evaluate(test_dataset)
print(f"Test accuracy: {test_acc:.4f}")

feature_extractor = models.Model(
    inputs=model.input,
    outputs=model.get_layer("global_average_pooling2d").output
)

all_test_features = feature_extractor.predict(test_dataset)

all_test_labels = np.concatenate(
    [y_batch.numpy() for _, y_batch in test_dataset],
    axis=0
)

np.save("test_features.npy", all_test_features)
np.save("test_labels.npy", all_test_labels)

print("Estrazione feature completata. File salvati: 'test_features.npy' e 'test_labels.npy'.")

In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import load_model

MODEL_PATH = "cnn_network.h5"       # modello salvato
TEST_DIR   = "mixed_up_data_talk_segmented"            # cartella con il test set
IMG_SIZE   = (224, 224)             # dimensione input della rete
BATCH_SIZE = 32                     # batch di test

model = load_model(MODEL_PATH)
print(f"âœ… Modello caricato da Â«{MODEL_PATH}Â»")

# 2) Costruisci il dataset di test
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    TEST_DIR,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False            # niente shuffle: ci serve lâ€™ordine per le metriche
)
class_names = test_ds.class_names
num_classes = len(class_names)
print(f"ðŸ”Ž Classi rilevate: {class_names}")

loss, acc = model.evaluate(test_ds, verbose=0)
print(f"\nðŸ“Š Risultati Test â€” Loss: {loss:.4f}  |  Accuracy: {acc:.2%}")

y_true = np.concatenate([y for _, y in test_ds])
y_pred_proba = model.predict(test_ds, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

print("\n=== Classification report ===")
print(classification_report(y_true, y_pred, target_names=class_names))



âœ… Modello caricato da Â«cnn_network.h5Â»


NameError: name 'tf' is not defined

In [8]:
"""
Valutazione del modello cnn_network.h5 su un test set audio
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
âœ“ Usa le stesse identiche funzioni di preprocessing viste in fase di training
âœ“ Calcola loss, accuracy, classification report e matrice di confusione
"""

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ IMPORT â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import Sequence
from sklearn.metrics import classification_report, confusion_matrix
# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ COSTANTI â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
MODEL_PATH = "cnn_network.h5"                          # modello salvato
TEST_DIR   = "mixed_up_data_talk_segmented"  # root con noisy/ e music/
SR          = 16000      # Hz
DURATION    = 3.0        # s
N_MELS      = 128
N_FFT       = 512
HOP_LENGTH  = 160        # 10 ms
WIN_LENGTH  = 400        # 25 ms
TARGET_FRAMES = int(np.ceil((DURATION * SR - WIN_LENGTH) / HOP_LENGTH)) + 1
BATCH_SIZE  = 32
CLASS_MAP   = {"noisy": 0, "music": 1}                 # ordine esplicito

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ FUNZIONI DI PREPROCESSING â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
def load_and_normalize(path, sr=SR, duration=DURATION):
    y, _ = librosa.load(path, sr=sr, duration=duration)
    required = int(sr * duration)
    y = np.pad(y, (0, max(0, required - len(y))))[:required]
    max_amp = np.max(np.abs(y)) or 1.0
    return y / max_amp

def compute_log_mel_spectrogram(y):
    S = librosa.feature.melspectrogram(
        y=y, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH,
        win_length=WIN_LENGTH, n_mels=N_MELS, power=2.0
    )
    log_S = librosa.power_to_db(S, ref=np.max)
    # pad / tronca a TARGET_FRAMES
    if log_S.shape[1] < TARGET_FRAMES:
        pad = TARGET_FRAMES - log_S.shape[1]
        log_S = np.pad(log_S, ((0, 0), (0, pad)), mode='constant',
                       constant_values=log_S.min())
    else:
        log_S = log_S[:, :TARGET_FRAMES]
    return log_S

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ SEQUENCE KERAS â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
class AudioDataset(Sequence):
    def __init__(self, file_paths, labels, batch_size=BATCH_SIZE):
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __getitem__(self, idx):
        idxs = range(idx * self.batch_size,
                     min(len(self.file_paths), (idx + 1) * self.batch_size))
        X, y = [], []
        for i in idxs:
            audio = load_and_normalize(self.file_paths[i])
            logmel = compute_log_mel_spectrogram(audio)
            # z-score clip-wise
            mu, sigma = logmel.mean(), logmel.std() or 1.0
            logmel = (logmel - mu) / sigma
            X.append(logmel[..., np.newaxis])           # â†’ (H, W, 1)
            y.append(self.labels[i])
        return np.array(X, dtype=np.float32), np.array(y, dtype=np.int32)

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ RACCOLTA PERCORSI & LABELS â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
def gather_files_and_labels(base_dir):
    paths, lbls = [], []
    for cls_name, cls_idx in CLASS_MAP.items():
        cls_dir = os.path.join(base_dir, cls_name)
        if not os.path.isdir(cls_dir):
            continue
        for f in os.listdir(cls_dir):
            if f.lower().endswith((".wav", ".mp3", ".flac")):
                paths.append(os.path.join(cls_dir, f))
                lbls.append(cls_idx)
    return paths, lbls

test_files, test_labels = gather_files_and_labels(TEST_DIR)
test_dataset = AudioDataset(test_files, test_labels, batch_size=BATCH_SIZE)

print(f"ðŸ”Ž File di test trovati: {len(test_files)}")

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ CARICA & VALUTA â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
model = load_model(MODEL_PATH)
print(f"âœ… Modello caricato da Â«{MODEL_PATH}Â»")

loss, acc = model.evaluate(test_dataset, verbose=0)
print(f"\nðŸ“Š Test â€” Loss: {loss:.4f}  |  Accuracy: {acc:.2%}")

# â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ METRICHE DETTAGLIATE â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
y_true = np.concatenate([y for _, y in test_dataset])
y_pred_prob = model.predict(test_dataset, verbose=1)
y_pred = np.argmax(y_pred_prob, axis=1)

class_names = list(CLASS_MAP.keys())
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, target_names=class_names, digits=4))



ðŸ”Ž File di test trovati: 3739
âœ… Modello caricato da Â«cnn_network.h5Â»

ðŸ“Š Test â€” Loss: 0.0063  |  Accuracy: 99.71%
