In [15]:
import os
import glob
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import pickle

SAMPLE_RATE = 44100
HOP_LENGTH = 256         
N_FFT = 1024          
N_MFCC = 13                 

def extract_mfcc(y) -> np.ndarray:
    
    y, _ = librosa.load(y, sr=SAMPLE_RATE)
    
    mfcc = librosa.feature.mfcc(
        y=y,
        sr=SAMPLE_RATE,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        center=True)
    delta = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    feats = np.vstack([mfcc, delta, delta2])
    return aggregate_stats(feats)

def aggregate_stats(feats: np.ndarray) -> np.ndarray:
    out = []
    for row in feats:
        vals = np.asarray(row, dtype=np.float32)
        out.extend([
            np.mean(vals),
            np.std(vals),
            np.median(vals),
            np.max(vals) - np.min(vals)
        ])
    return np.asarray(out, dtype=np.float32)


In [2]:
from sklearn.model_selection import train_test_split

in_path_music = os.path.abspath("mixed_up_data_talk_segmented/music")

music_files = [f for f in os.listdir(in_path_music)]

df_music = pd.DataFrame({
    "mfcc_coeff": [extract_mfcc(os.path.join(in_path_music, f)) for f in tqdm(music_files, desc="Estrazione MFCC da music_files")],
    "label":      0
})

Estrazione MFCC da music_files:   0%|          | 0/932 [00:00<?, ?it/s]

In [3]:
in_path_noise = os.path.abspath("mixed_up_data_talk_segmented/noisy")
noise_files = [f for f in os.listdir(os.path.abspath("mixed_up_data_talk_segmented/noisy"))]

df_noisy = pd.DataFrame({
    "mfcc_coeff": [extract_mfcc(os.path.join(in_path_noise, f)) for f in tqdm(noise_files, desc="Estrazione MFCC da noise_files")],
    "label":      1
})

train = pd.concat([df_music, df_noisy], ignore_index=True)

Estrazione MFCC da noise_files:   0%|          | 0/1032 [00:00<?, ?it/s]

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

model = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=1000)

X = np.vstack(train["mfcc_coeff"].values)   # matrice sample - coefficient
y = train["label"].values                   # array 1D di etichette

X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

model.fit(X_train, y_train)
print("Fine dell'addestramento")

Fine dell'addestramento


In [13]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       186
           1       0.98      1.00      0.99       207

    accuracy                           0.99       393
   macro avg       0.99      0.99      0.99       393
weighted avg       0.99      0.99      0.99       393



In [8]:
def predict_noisy_probability(wav_path):
    emb = extract_mfcc(wav_path).reshape(1, -1)
    emb = scaler.transform(emb)
    probs = model.predict_proba(emb)[0]   # array di lunghezza 2
    return probs

test_files = [
    "audio_test/music_pure.wav",
    "audio_test/noise_pure.wav",
    "audio_test/voice_base_music.wav",
    "audio_test/voice_base_pure.wav",
    "audio_test/voice_base_noise.wav"
]
for test_file in test_files:
    if os.path.exists(test_file):
        p_clean, p_noisy = predict_noisy_probability(test_file)
        print(f"{test_file} → music: {p_clean:.3f}, noisy: {p_noisy:.3f}")
    else:
        print(f"File di test non trovato: {test_file}")

audio_test/music_pure.wav → music: 1.000, noisy: 0.000
audio_test/noise_pure.wav → music: 0.421, noisy: 0.579
audio_test/voice_base_music.wav → music: 1.000, noisy: 0.000
audio_test/voice_base_pure.wav → music: 1.000, noisy: 0.000
audio_test/voice_base_noise.wav → music: 0.907, noisy: 0.093


# Breve Descrizione
I ragionamenti sono simili al caso di mfcc_no_talk, anzi, sono equipollenti: in questo caso entrambe le fonti audio hanno componenti vocali, rispettivamente con musica e noise in background, MFCC aiuta a capire che la componente vocale ci sia ma non è robusto a distinguere se il background sia noisy o musica per discorsi analoghi a quelli già fatti

In [26]:
import numpy as np
import librosa
from sklearn.linear_model import LogisticRegression
from collections import Counter
from typing import List

WINDOW_SECONDS = 3

def aggregate_stats(feats: np.ndarray) -> np.ndarray:
    
    out = []
    for row in feats:
        vals = row.astype(np.float32)
        out.extend([
            np.mean(vals),
            np.std(vals),
            np.median(vals),
            np.max(vals) - np.min(vals)
        ])
    return np.asarray(out, dtype=np.float32)

def extract_mfcc_from_signal(y: np.ndarray) -> np.ndarray:
    
    mfcc    = librosa.feature.mfcc(y=y, sr=SAMPLE_RATE,
                                   n_mfcc=N_MFCC,
                                   n_fft=N_FFT,
                                   hop_length=HOP_LENGTH,
                                   center=True)
    delta1  = librosa.feature.delta(mfcc, order=1)
    delta2  = librosa.feature.delta(mfcc, order=2)
    feats   = np.vstack([mfcc, delta1, delta2])
    return aggregate_stats(feats)

def segment_audio(path: str, window_sec: float = WINDOW_SECONDS) -> List[np.ndarray]:

    y, _ = librosa.load(path, sr=SAMPLE_RATE)
    win_len = int(window_sec * SAMPLE_RATE)
    n_segs  = int(np.ceil(len(y) / win_len))
    segments = []
    for i in range(n_segs):
        start = i * win_len
        end   = start + win_len
        seg   = y[start:end]
        if len(seg) < win_len:
            seg = np.pad(seg, (0, win_len - len(seg)), mode='constant')
        segments.append(seg)
    return segments

def extract_features_per_segment(path: str) -> np.ndarray:

    segments = segment_audio(path)
    feats = [extract_mfcc_from_signal(seg) for seg in segments]
    return np.vstack(feats)

def classify_segments(path: str, model: LogisticRegression) -> List[int]:
 
    X = extract_features_per_segment(path)
    return model.predict(X).tolist()

def majority_vote(preds: List[int]) -> int:

    cnt = Counter(preds)
    return cnt.most_common(1)[0][0]

def global_decision_majority(path: str, model: LogisticRegression) -> int:
 
    seg_preds = classify_segments(path, model)
    return majority_vote(seg_preds)

if __name__ == "__main__":

    test_files = [
    "audio_test/music_pure.wav",
    "audio_test/noise_pure.wav",
    "audio_test/voice_base_music.wav",
    "audio_test/voice_base_pure.wav",
    "audio_test/voice_base_noise.wav"
    ]

    for audio_file in test_files:
        segment_preds = classify_segments(audio_file, model)

        base, _ = os.path.splitext(audio_file)
        global_pred = global_decision_majority(audio_file, model)
        print(f"\nDecisione per {base} (majority vote): {global_pred}")


Decisione per audio_test/music_pure (majority vote): 0

Decisione per audio_test/noise_pure (majority vote): 0

Decisione per audio_test/voice_base_music (majority vote): 0

Decisione per audio_test/voice_base_pure (majority vote): 0

Decisione per audio_test/voice_base_noise (majority vote): 0
