# Lab09 - Classification Challenge

In [None]:
!wget -O free-spoken-digit.zip "https://github.com/dbdmg/data-science-lab/raw/master/datasets/free-spoken-digit.zip"
!unzip -o free-spoken-digit.zip

In [None]:
import os
print("Contenuti root directory:")
for item in os.listdir('.'):
    print(f"  {item}")

print("\nContenuti cartella dev:")
for item in os.listdir('dev'):
    print(f"  dev/{item}")

print("\nContenuti cartella eval:")
for item in os.listdir('eval'):
    print(f"  eval/{item}")

In [3]:
import os
import numpy as np
import pandas as pd
from scipy.io import wavfile

def load_folder(folder, with_label):
    rows, lengths = [], []
    for fname in os.listdir(folder):
        parts = fname[:-4].split('_')
        file_id = int(parts[0])
        label = int(parts[1]) if with_label else None

        sr, x = wavfile.read(os.path.join(folder, fname))
        x = x.astype(np.float32)
        if np.max(np.abs(x)) > 0:
            x = x / np.max(np.abs(x))

        rows.append((file_id, label, x, len(x)))
        lengths.append(len(x))

    max_len = max(lengths)

    padded_signals = []
    for file_id, label, x, length in rows:
        padded_x = np.pad(x, (0, max_len - length)) 
        padded_signals.append(padded_x)
        
    X = np.stack(padded_signals)
    ids = [r[0] for r in rows]
    labels = [r[1] for r in rows] if with_label else None

    return ids, labels, X


dev_ids, dev_labels, dev_X = load_folder('dev', with_label=True)
eval_ids, _, eval_X = load_folder('eval', with_label=False)

dev_df = pd.DataFrame(dev_X)
dev_df['label'] = dev_labels
dev_df.insert(0, 'file_id', dev_ids)

eval_df = pd.DataFrame(eval_X)
eval_df.insert(0, 'file_id', eval_ids)

print("dev_df:", dev_df.shape, "eval_df:", eval_df.shape)

dev_df: (1500, 17569) eval_df: (500, 18263)


In [4]:
sr = 8000  

def spectral_features_bands(x, sr):
    N = len(x)
    X = np.fft.rfft(x)
    mag = np.abs(X) + 1e-12
    freqs = np.fft.rfftfreq(N, 1/sr)
    w = mag / mag.sum()

    energy = np.sum(mag**2)
    peak_idx = np.argmax(mag)
    peak_freq = freqs[peak_idx]
    peak_mag = mag[peak_idx]
    mag_mean = mag.mean()
    mag_std  = mag.std()
    centroid = np.sum(freqs * w)
    spread   = np.sqrt(np.sum(((freqs - centroid)**2) * w))
    flatness = np.exp(np.mean(np.log(mag))) / (mag_mean + 1e-12)

    cum = np.cumsum(mag**2) / np.sum(mag**2)
    rolloff = freqs[np.searchsorted(cum, 0.85)]

    b0 = mag[freqs <= 300]
    b1 = mag[(freqs > 300) & (freqs <= 800)]
    b2 = mag[(freqs > 800) & (freqs <= 1500)]
    b3 = mag[freqs > 1500]

    b0_e = np.sum(b0**2)
    b1_e = np.sum(b1**2)
    b2_e = np.sum(b2**2)
    b3_e = np.sum(b3**2) + 1e-12

    b0_ratio = b0_e / (energy + 1e-12)
    b1_ratio = b1_e / (energy + 1e-12)
    b2_ratio = b2_e / (energy + 1e-12)
    b3_ratio = b3_e / (energy + 1e-12)

    return np.array([
        energy, peak_freq, peak_mag, mag_mean, mag_std,
        centroid, spread, flatness, rolloff,
        b0_ratio, b1_ratio, b2_ratio, b3_ratio
    ])

feature_names = [
    'spec_energy','spec_peak_freq','spec_peak_mag','spec_mag_mean','spec_mag_std',
    'spec_centroid','spec_spread','spec_flatness','spec_rolloff',
    'band0_ratio','band1_ratio','band2_ratio','band3_ratio'
]

dev_features  = np.vstack([spectral_features_bands(x, sr) for x in dev_X])
eval_features = np.vstack([spectral_features_bands(x, sr) for x in eval_X])

dev_df_freq = pd.DataFrame(dev_features, columns=feature_names)
dev_df_freq.insert(0, 'file_id', dev_ids)
dev_df_freq['label'] = dev_labels

eval_df_freq = pd.DataFrame(eval_features, columns=feature_names)
eval_df_freq.insert(0, 'file_id', eval_ids)

print(dev_df_freq.shape, eval_df_freq.shape)

(1500, 15) (500, 14)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import pandas as pd

X_train = dev_df_freq.drop(columns=['file_id', 'label']).values
y_train = dev_df_freq['label'].values
X_test  = eval_df_freq.drop(columns=['file_id']).values
test_ids = eval_df_freq['file_id'].values

kf = KFold(n_splits=5, shuffle=True, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid = {
    'rf__n_estimators':      [200, 400],
    'rf__max_depth':         [8, 12, 16],
    'rf__min_samples_leaf':  [3, 5, 8],
    'rf__min_samples_split': [5, 10, 20],
    'rf__max_features':      ['sqrt', 0.3],
}


grid = GridSearchCV(
    pipe,
    param_grid,
    cv=kf,
    scoring='f1_macro',
    n_jobs=-1
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
cv_f1 = grid.best_score_
train_f1 = f1_score(y_train, best_model.predict(X_train), average='macro')

print("Best params:", grid.best_params_)
print("CV F1_macro:", cv_f1)
print("Train F1_macro:", train_f1)

y_test_pred = best_model.predict(X_test)

submission_df = (
    pd.DataFrame({'Id': test_ids, 'Predicted': y_test_pred})
      .sort_values('Id')
      .reset_index(drop=True)
)

print(submission_df.head())

Best params: {'rf__max_depth': 16, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
CV F1_macro: 0.780796128518815
Train F1_macro: 0.9759716664769087
   Id  Predicted
0   0          0
1   1          9
2   2          3
3   3          9
4   4          3


In [6]:
submission_df.to_csv('submission.csv', index=False)

print("submission.csv pronto")
print(submission_df.head())
print(f"Shape: {submission_df.shape}")

submission.csv pronto
   Id  Predicted
0   0          0
1   1          9
2   2          3
3   3          9
4   4          3
Shape: (500, 2)
