In [2]:
import sys 
import os
import pandas as pd
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))
from utils.utils import save_experiment, evaluate_model_on_parquet, grid_search_model
from configs.config_local import DATASET_PATH, ITW_DATASET_PATH, MODELS_PATH
import joblib


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# RBF-kernel SVM behind a StandardScaler. Keeping both steps in one Pipeline
# means the scaler travels with the estimator that is tuned and saved.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC()),
])

# 1 kernel x 5 C x 3 gamma x 2 class_weight = 30 candidates (150 fits at cv=5).
# Keys use the "<step>__<param>" form so they reach the "svm" pipeline step.
param_grid = {
    "svm__kernel": ["rbf"],
    "svm__C": [0.1, 1, 10, 100, 1000],  # maybe stop at 100 for RBF
    "svm__gamma": ["scale", 0.01, 0.001],
    "svm__class_weight": [None, "balanced"],
}

train_data_path = os.path.join(DATASET_PATH, "training_features_40_512_256_128.parquet")
test_data_path = os.path.join(DATASET_PATH, "testing_features_40_512_256_128.parquet")

# Project helper; presumably tunes on the training file and reports metrics on
# the test file — confirm against utils.utils.grid_search_model.
best_model, metrics, best_params_, metadata, feature_names = grid_search_model(
    model,
    param_grid,
    train_data_path,
    test_data_path,
    scoring="f1_macro",
    cv=5,
    n_jobs=1,
    verbose=2,
)

print(best_params_)
print(metrics)

# Derive the project root explicitly (same expression the import cell puts on
# sys.path) instead of reading sys.path[0], which silently changes as soon as
# any other code inserts an entry at the front of sys.path.
project_root = str(Path.cwd().parent)

save_experiment(
    model=best_model,
    metrics=metrics,
    experiment_dir=os.path.join(project_root, "notebooks", "experiments", "rbf_svm"),
    model_params=best_params_,
    feature_names=feature_names,
    metadata_extra=metadata,
)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END svm__C=0.1, svm__class_weight=None, svm__gamma=scale, svm__kernel=rbf; total time=   1.2s
[CV] END svm__C=0.1, svm__class_weight=None, svm__gamma=scale, svm__kernel=rbf; total time=   0.5s
[CV] END svm__C=0.1, svm__class_weight=None, svm__gamma=scale, svm__kernel=rbf; total time=   0.5s
[CV] END svm__C=0.1, svm__class_weight=None, svm__gamma=scale, svm__kernel=rbf; total time=   0.5s
[CV] END svm__C=0.1, svm__class_weight=None, svm__gamma=scale, svm__kernel=rbf; total time=   0.4s


Test the best model on the ITW dataset

In [None]:
# Score the grid-search winner (`best_model`) on the ITW feature file.
# NOTE: this rebinds `metrics`, replacing the value produced by the training cell.
# NOTE(review): the ITW file suffix (40_2048_512_128) differs from the training
# file suffix (40_512_256_128) — confirm both feature sets are compatible.
itw_features_dir = os.path.join(ITW_DATASET_PATH, 'normalized_features')
itw_test_data_path = os.path.join(itw_features_dir, "itw_features_40_2048_512_128_trimmed.parquet")

metrics, metadata_extra = evaluate_model_on_parquet(best_model, itw_test_data_path)
print(metrics)

{'accuracy': 0.6232578812025779, 'precision': 0.4947026246087166, 'recall': 0.34884115799303844, 'f1': 0.5663124088573276}


In [4]:
# Evaluate a previously saved RBF-SVM experiment on the ITW feature file.
# NOTE(review): `find_best_trained_monel` is not among the names imported in the
# notebook's import cell, so this cell depends on kernel state from elsewhere —
# confirm where it is defined (and whether "monel" is a typo for "model").
itw_features_dir = os.path.join(ITW_DATASET_PATH, 'normalized_features')
itw_test_data_path = os.path.join(itw_features_dir, "itw_features_40_2048_512_128_trimmed.parquet")

rbf_models_path = os.path.join(MODELS_PATH, 'rbf_svm')
# Presumably returns the name of the experiment sub-directory with the best
# 'f1' — it is used below as a path component; verify against the helper.
exp = find_best_trained_monel(rbf_models_path, 'f1')

# joblib.load is pickle-based: only load model files from a trusted location.
best_f1_model_file = os.path.join(rbf_models_path, exp, 'model.joblib')
best_f1_model = joblib.load(best_f1_model_file)

metrics, metadata_extra = evaluate_model_on_parquet(best_f1_model, itw_test_data_path)
print(metrics)

Best model according to f1: exp_20260117_234410, f1=0.8467561929764884
{'accuracy': 0.5555414457601828, 'precision': 0.4398461621797302, 'recall': 0.6893624246540453, 'f1': 0.5548302591620256}
