In [8]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import librosa
import sklearn
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
plt.style.use("seaborn-v0_8")
import os
import soundfile as sf

In [9]:
# Load the key file that maps each audio filename to its scene label.
# The path uses forward slashes: the previous literal contained a backslash
# followed by "d", which is an invalid string escape (a warning on recent
# Python versions) and only resolved as a path separator on Windows.
# Forward slashes are accepted on every platform and match audio_path below.
fn_csv = 'scenes_stereo/dcase2013_task1_filenamekey.csv'
df = pd.read_csv(fn_csv)

# Column 'decodedname' holds the file names, column 'label' the class labels.
files = df['decodedname']
labels = df['label']
print(len(files))
print(len(labels))

# Folder prefix that is prepended to every file name when loading audio.
audio_path = 'scenes_stereo/scenes_stereo/'

200
200


In [10]:
# Hold out 50 clips for testing. Fixing random_state yields the same split on
# every run (reproducibility).
files_train, files_test, labels_train, labels_test = train_test_split(
    files,
    labels,
    test_size=50,
    random_state=42,
    shuffle=True,
)

# Expected sizes, in order: 150, 50, 150, 50
for subset in (files_train, files_test, labels_train, labels_test):
    print(len(subset))

150
50
150
50


In [94]:
import librosa

# Every entry of augmented_data is a (signal, label, filename) triple.
augmented_data = []
fast_files = []
slow_files = []

# File names of the time-stretched variants are tracked separately.
stretch_registry = {'stretch_fast': fast_files, 'stretch_slow': slow_files}

for file, label in zip(files_train, labels_train):
    signal, sr = librosa.load(audio_path + file)
    base_name = os.path.splitext(file)[0]  # e.g. 'bus12.wav' -> 'bus12'

    # Keep the untouched signal first ...
    augmented_data.append((signal, label, f"{base_name}_orig.wav"))

    # ... followed by four augmented versions of it.
    versions = {
        'stretch_fast': librosa.effects.time_stretch(signal, rate=1.2),
        'stretch_slow': librosa.effects.time_stretch(signal, rate=0.8),
        'pitch_up': librosa.effects.pitch_shift(signal, sr=sr, n_steps=2),
        'pitch_down': librosa.effects.pitch_shift(signal, sr=sr, n_steps=-2)
    }

    for suffix, aug_signal in versions.items():
        filename = f"{base_name}_{suffix}.wav"
        augmented_data.append((aug_signal, label, filename))
        if suffix in stretch_registry:
            stretch_registry[suffix].append(filename)

In [95]:
# One fast and one slow variant is expected per training clip.
print(len(fast_files), len(slow_files), sep='\n')

150
150


In [110]:
fs = 22050 # just to make sure other functions do not change it

In [98]:
import os
import soundfile as sf

augmented_path = 'scenes_stereo_augmented/'
test_path = 'scenes_stereo_test/'
for folder in (augmented_path, test_path):
    os.makedirs(folder, exist_ok=True)

# Resampling factor for each time-stretched file. Slow entries are inserted
# first so that a name present in both lists would be treated as "fast",
# mirroring an if/elif that tests fast_files before slow_files.
resample_factor = dict.fromkeys(slow_files, 0.8)
resample_factor.update(dict.fromkeys(fast_files, 1.2))

files_train_aug = []
labels_train_aug = []

# The stretched signals are resampled so that they end up with the same number
# of samples as the other signals.
# NOTE(review): augmented_data is updated in place, so running this cell a
# second time resamples the already-resampled signals again.
for i, (signal, label, filename) in enumerate(augmented_data):
    filepath = os.path.join(augmented_path, filename)
    factor = resample_factor.get(filename)

    if factor is None:
        # Original and pitch-shifted signals are written unchanged at fs.
        sf.write(filepath, signal, fs)
    else:
        target_sr = int(fs * factor)
        signal_resampled = librosa.resample(signal, orig_sr=fs, target_sr=target_sr)
        augmented_data[i] = (signal_resampled, label, filename)
        sf.write(filepath, signal_resampled, target_sr)

    files_train_aug.append(filename)
    labels_train_aug.append(label)

In [99]:
# Sizes of the augmented training set and of the untouched test set.
for collection in (files_train_aug, files_test, labels_train_aug, labels_test):
    print(len(collection))

750
50
750
50


In [100]:
# librosa.load returns a (signal, sampling_rate) pair for every test file;
# only the signal is kept in X_test.
signals_fs = [librosa.load(audio_path + file) for file in files_test]
X_test = [signal for signal, _ in signals_fs]
y_test = labels_test

print(len(X_test))
print(len(y_test))

# Test set is ready: X_test (signals) and y_test (labels)

50
50


In [101]:
# Split the (signal, label, filename) triples into parallel lists.
X_train = [entry[0] for entry in augmented_data]
y_train = [entry[1] for entry in augmented_data]

In [102]:
print(len(X_train), len(y_train), sep='\n')

# Training set is ready: X_train (signals) and y_train (labels)

750
750


In [103]:
# Convert signals and labels to NumPy arrays for the steps that follow.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

In [104]:
# Expected shapes, in order:
#   (750, 661500)
#   (750,)
#   (50, 661500)
#   (50,)
for array in (X_train, y_train, X_test, y_test):
    print(array.shape)

(750, 661500)
(750,)
(50, 661500)
(50,)


In [89]:
fs = 22050  # re-declared here; it never hurts

# Time axis in seconds for one clip. This previously read `signals[0]`, but
# `signals` is not defined in any visible cell, so the line raised NameError
# on a fresh kernel. X_test holds the loaded test signals and is what the
# plotting cell below draws against this axis.
time_axis = np.arange(len(X_test[0])) / fs

In [None]:
# Disabled cell: the code is wrapped in a string literal so it does not run.
# When enabled it plots each of the 50 test waveforms against time_axis and
# prints the corresponding label.
'''

for i in range(50):
    plt.figure(figsize = (10, 5))
    plt.xlabel('time (s)')
    plt.plot(time_axis, X_test[i])
    plt.show()
    print(y_test[i])

'''

In [91]:
# Analysis parameters handed to feature_extraction:
#   n -> FFT size, h -> hop length (both in samples).
# w is presumably the analysis window length; n is set equal to it.
w, h = 512, 256
n = w

In [105]:
# Standardise the raw signals: statistics are learned on the training set only
# and then re-applied to the test set.
# NOTE(review): StandardScaler works column by column, so here every sample
# position is scaled across clips rather than each clip or each extracted
# feature -- confirm this is the intended normalisation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [150]:
def feature_extraction(x, fs, n, h):
    """Summarise one audio signal as a 12-element feature vector.

    Six descriptors are computed and each is reduced to two numbers: the mean
    and the standard deviation taken over all of its values at once. The
    output order is: zero-crossing rate, log-mel spectrogram (40 bands), delta
    of the log-mel spectrogram, delta of that delta, spectral flux (librosa
    onset strength on the dB magnitude spectrogram), MFCC.

    Parameters
    ----------
    x : np.ndarray
        Audio signal (the caller passes one row of a 2-D signal matrix).
    fs : int
        Sampling rate of ``x`` in Hz.
    n : int
        FFT size used for the spectrogram-based descriptors.
    h : int
        Hop length in samples.

    Returns
    -------
    np.ndarray
        ``[mean, std]`` pairs for the six descriptors, concatenated (length 12).
    """
    # Zero-crossing rate with hop h (frame length left at librosa's default).
    zcr = librosa.feature.zero_crossing_rate(x, hop_length=h)

    # Log-power mel spectrogram. sr=fs is forwarded explicitly: it used to be
    # omitted, so the mel filter bank was always built for librosa's default
    # sampling rate regardless of the fs argument.
    melspec = librosa.power_to_db(
        librosa.feature.melspectrogram(y=x, sr=fs, n_fft=n, hop_length=h, n_mels=40)
    )
    delta_melspec = librosa.feature.delta(melspec)
    delta2_melspec = librosa.feature.delta(delta_melspec)

    # Spectral flux from the dB-scaled STFT magnitude.
    C = np.abs(librosa.stft(x, n_fft=n, hop_length=h))
    S = librosa.amplitude_to_db(C, ref=np.max)
    spectral_flux = librosa.onset.onset_strength(S=S, sr=fs)

    mfcc = librosa.feature.mfcc(y=x, sr=fs, n_fft=n, hop_length=h)

    # Order matters: it defines the column layout of the feature matrix.
    # Remove `mfcc` from this tuple to drop the MFCC statistics (the caller's
    # feature count then becomes 10).
    descriptors = (zcr, melspec, delta_melspec, delta2_melspec, spectral_flux, mfcc)
    f_vector = np.concatenate([[np.mean(d), np.std(d)] for d in descriptors])
    return f_vector

In [151]:
# Build the feature matrices: one row per signal, one column per feature.
# The rows are stacked directly from what feature_extraction returns, so the
# number of columns follows the function automatically. There is no longer a
# hard-coded width to update when features are added or removed.
f_vector_train = np.array(
    [feature_extraction(signal, fs=fs, n=n, h=h) for signal in X_train],
    dtype=float,
)
f_vector_test = np.array(
    [feature_extraction(signal, fs=fs, n=n, h=h) for signal in X_test],
    dtype=float,
)

In [None]:
# Disabled cell: the code is wrapped in a string literal so it does not run.
# When enabled it prints the shape of the training feature matrix, two of its
# rows, and plots the first row.
'''

print(f_vector_train.shape)
print(f_vector_train[0, :])
print(f_vector_train[100, :])
plt.plot(f_vector_train[0, :])

'''

In [153]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Linear SVM trained on the summary features; accuracy is reported on the
# training set first and on the held-out test set second.
mod = SVC(kernel='linear', C=1000).fit(f_vector_train, y_train)

train_pred = mod.predict(f_vector_train)
accuracy = accuracy_score(y_train, train_pred)
print('train: ', accuracy)

test_pred = mod.predict(f_vector_test)
accuracy = accuracy_score(y_test, test_pred)
print('test: ', accuracy)

# Author's note: the model overfits! Removing the MFCC statistics from the
# feature vector improves things; possibly the problem is that there is too
# much training data, because results were better with only the 150 original
# clips.

train:  0.8333333333333334
test:  0.64
