# Lab09 - Classification Challenge

In [1]:
!wget -O free-spoken-digit.zip "https://github.com/dbdmg/data-science-lab/raw/master/datasets/free-spoken-digit.zip"
!unzip -o free-spoken-digit.zip

--2025-12-04 13:23:51--  https://github.com/dbdmg/data-science-lab/raw/master/datasets/free-spoken-digit.zip
Risoluzione di github.com (github.com)... 140.82.121.4
Connessione a github.com (github.com)|140.82.121.4|:443... connesso.
Richiesta HTTP inviata, in attesa di risposta... 302 Found
Posizione: https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/free-spoken-digit.zip [segue]
--2025-12-04 13:23:52--  https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/free-spoken-digit.zip
Risoluzione di raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connessione a raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connesso.
Richiesta HTTP inviata, in attesa di risposta... 200 OK
Lunghezza: 9315920 (8,9M) [application/zip]
Salvataggio in: «free-spoken-digit.zip»


2025-12-04 13:23:56 (3,84 MB/s) - «free-spoken-digit.zip» salvato [9315920/9315920]

Archive:  free-spo

In [2]:
import os
print("Contenuti root directory:")
for item in os.listdir('.'):
    print(f"  {item}")

print("\nContenuti cartella dev:")
for item in os.listdir('dev'):
    print(f"  dev/{item}")

print("\nContenuti cartella eval:")
for item in os.listdir('eval'):
    print(f"  eval/{item}")

Contenuti root directory:
  free-spoken-digit.zip
  submission.csv
  sample_eval_submission.csv
  dev
  .ipynb_checkpoints
  eval
  Lab09.ipynb

Contenuti cartella dev:
  dev/481_8.wav
  dev/863_1.wav
  dev/845_2.wav
  dev/1146_3.wav
  dev/307_2.wav
  dev/1281_5.wav
  dev/223_6.wav
  dev/1468_4.wav
  dev/793_2.wav
  dev/939_3.wav
  dev/657_3.wav
  dev/1300_4.wav
  dev/1138_5.wav
  dev/755_4.wav
  dev/318_5.wav
  dev/872_9.wav
  dev/435_5.wav
  dev/554_1.wav
  dev/413_6.wav
  dev/242_7.wav
  dev/734_5.wav
  dev/625_8.wav
  dev/549_4.wav
  dev/458_9.wav
  dev/1200_1.wav
  dev/1235_8.wav
  dev/1389_9.wav
  dev/135_8.wav
  dev/450_0.wav
  dev/819_3.wav
  dev/360_5.wav
  dev/271_8.wav
  dev/675_4.wav
  dev/1367_3.wav
  dev/1374_9.wav
  dev/616_7.wav
  dev/922_0.wav
  dev/820_7.wav
  dev/576_6.wav
  dev/474_1.wav
  dev/76_0.wav
  dev/301_4.wav
  dev/919_6.wav
  dev/1385_4.wav
  dev/327_7.wav
  dev/280_5.wav
  dev/1222_6.wav
  dev/730_1.wav
  dev/1343_3.wav
  dev/1320_0.wav
  dev/1287_2.wav
 

In [3]:
import pandas as pd
import numpy as np
from scipy.io import wavfile
import os

def load_simple_df(folder_path, is_dev):
    data = []
    for fname in os.listdir(folder_path):
        if fname.endswith('.wav'):
            fs, signal = wavfile.read(os.path.join(folder_path, fname))
            
            if is_dev:
                label = fname.split('_')[1].split('.')[0]
                data.append({'signal': signal, 'label': label})
            else:
                data.append({'signal': signal})
    
    return pd.DataFrame(data)

dev_df = load_simple_df('dev', True)
eval_df = load_simple_df('eval', False)

print(f"Dev Shape: {dev_df.shape}")
print(f"Eval Shape: {eval_df.shape}")

Dev Shape: (1500, 2)
Eval Shape: (500, 1)


In [4]:
from scipy.stats import skew, kurtosis

def extract_time_features_robust(signal):
    signal_f = signal.astype(np.float64)
    n_chunks = 4
    chunk_size = max(1, len(signal_f) // n_chunks)
    features = {}
    
    for i in range(n_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, len(signal_f))
        chunk = signal_f[start:end]

        features[f'Mean{i}'] = np.mean(chunk)
        features[f'Std{i}'] = np.std(chunk)
        features[f'Skew{i}'] = skew(chunk)
        features[f'Kurt{i}'] = kurtosis(chunk)
    
    return features

def prepare_features_df_robust(df):
    feats = [extract_time_features_robust(sig) for sig in df['signal']]
    X_df = pd.DataFrame(feats)
    if 'label' in df.columns:
        X_df['label'] = df['label'].astype(int)
    return X_df

X_dev_df = prepare_features_df_robust(dev_df)
X_eval_df = prepare_features_df_robust(eval_df)
print(f"Dev shape: {X_dev_df.shape}")
print(f"Eval shape: {X_eval_df.shape}")

Dev shape: (1500, 17)
Eval shape: (500, 16)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler  

X_train = X_dev_df.drop('label', axis=1).values  
y_train = X_dev_df['label'].values
X_eval = X_eval_df.values

scaler = StandardScaler()  
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled = scaler.transform(X_eval)

param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [12, 18, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1]
}

rf = RandomForestClassifier(random_state=42, class_weight='balanced')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(rf, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"F1-score: {grid_search.best_score_:.4f}")  

y_eval_pred = grid_search.predict(X_eval_scaled)
submission_df = pd.DataFrame({'Id': range(len(y_eval_pred)), 'Predicted': y_eval_pred})

F1-score: 0.7139


In [6]:
submission_df.to_csv('submission.csv', index=False)

print("\n✓ submission.csv pronto")
print(submission_df.head())
print(f"Shape: {submission_df.shape}")


✓ submission.csv pronto
   Id  Predicted
0   0          5
1   1          2
2   2          7
3   3          8
4   4          5
Shape: (500, 2)
