# Hyperparameter Optimization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import os
sys.path.append(os.path.abspath('../modules'))
from optimizer import optimize_pipeline
from preproc import PhishingDatasetPreproc

In [2]:
np.random.seed(42)

In [3]:
prep = PhishingDatasetPreproc()
dataset, X, y = prep.basic_operations()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [4]:
best_trials = {}
model_names = ['KNN','DTR', 'SVM', 'RF', 'XGB', 'LGBM', 'MLP']
selectors = ['tree', 'pca', 'univariate', 'l1']

In [5]:
for model_name in model_names:
    print(model_name)
    best_trials[model_name] = optimize_pipeline(X_train, y_train, model_name, selectors)

[I 2024-11-21 03:02:15,267] A new study created in memory with name: optimization_KNN
[I 2024-11-21 03:02:15,451] Trial 0 finished with value: 0.8885544433222741 and parameters: {'fs_method': 'l1', 'selector__alpha': 0.19650156188774928, 'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 0 with value: 0.8885544433222741.


KNN


[I 2024-11-21 03:02:16,859] Trial 1 finished with value: 0.9290802589377967 and parameters: {'fs_method': 'tree', 'n_neighbors': 8, 'weights': 'distance', 'algorithm': 'auto', 'metric': 'chebyshev'}. Best is trial 1 with value: 0.9290802589377967.
[I 2024-11-21 03:02:18,875] Trial 2 finished with value: 0.9252214226454962 and parameters: {'fs_method': 'tree', 'n_neighbors': 6, 'weights': 'uniform', 'algorithm': 'brute', 'metric': 'euclidean'}. Best is trial 1 with value: 0.9290802589377967.
[I 2024-11-21 03:02:20,259] Trial 3 finished with value: 0.9276328045074896 and parameters: {'fs_method': 'tree', 'n_neighbors': 6, 'weights': 'distance', 'algorithm': 'kd_tree', 'metric': 'chebyshev'}. Best is trial 1 with value: 0.9290802589377967.
[I 2024-11-21 03:02:20,582] Trial 4 finished with value: 0.8739618166363826 and parameters: {'fs_method': 'pca', 'selector__n_components': 4, 'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 1 with 

Trial 19 failed: Found array with 0 feature(s) (shape=(6632, 0)) while a minimum of 1 is required by KNeighborsClassifier.


[I 2024-11-21 03:02:29,532] Trial 20 finished with value: 0.9605596116078472 and parameters: {'fs_method': 'univariate', 'selector__k': 30, 'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 20 with value: 0.9605596116078472.
[I 2024-11-21 03:02:30,395] Trial 21 finished with value: 0.9605596116078472 and parameters: {'fs_method': 'univariate', 'selector__k': 30, 'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 20 with value: 0.9605596116078472.
[I 2024-11-21 03:02:31,271] Trial 22 finished with value: 0.9600772479824563 and parameters: {'fs_method': 'univariate', 'selector__k': 30, 'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 20 with value: 0.9605596116078472.
[I 2024-11-21 03:02:32,113] Trial 23 finished with value: 0.9600772479824563 and parameters: {'fs_method': 'univariate', 'selector__k': 30, 'n_neighbors': 5, 'w

Trial 67 failed: Found array with 0 feature(s) (shape=(6632, 0)) while a minimum of 1 is required by KNeighborsClassifier.


[I 2024-11-21 03:03:07,393] Trial 68 finished with value: 0.9415023947310825 and parameters: {'fs_method': 'univariate', 'selector__k': 15, 'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'metric': 'manhattan'}. Best is trial 43 with value: 0.9609212752606501.
[I 2024-11-21 03:03:08,042] Trial 69 finished with value: 0.9579056664274482 and parameters: {'fs_method': 'univariate', 'selector__k': 23, 'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 43 with value: 0.9609212752606501.
[I 2024-11-21 03:03:09,403] Trial 70 finished with value: 0.9231698139548072 and parameters: {'fs_method': 'tree', 'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'}. Best is trial 43 with value: 0.9609212752606501.
[I 2024-11-21 03:03:10,227] Trial 71 finished with value: 0.9610419025224113 and parameters: {'fs_method': 'univariate', 'selector__k': 29, 'n_neighbors': 4, 'weights': 'distance', 'algorith

Best trial:
Value: 0.9610419025224113
Params:
  fs_method: univariate
  selector__k: 29
  n_neighbors: 4
  weights: distance
  algorithm: ball_tree
  metric: manhattan
DTR


[I 2024-11-21 03:03:32,854] Trial 0 finished with value: 0.880834225858733 and parameters: {'fs_method': 'pca', 'selector__n_components': 20, 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.880834225858733.
[I 2024-11-21 03:03:32,894] Trial 1 finished with value: 0.8002687392160756 and parameters: {'fs_method': 'univariate', 'selector__k': 30, 'criterion': 'gini', 'splitter': 'best', 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.880834225858733.
[I 2024-11-21 03:03:32,924] Trial 2 finished with value: 0.9282348501538925 and parameters: {'fs_method': 'univariate', 'selector__k': 6, 'criterion': 'gini', 'splitter': 'best', 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9282348501538925.
[I 2024-11-21 03:03:33,072] Trial 3 finished wit

Trial 14 failed: Found array with 0 feature(s) (shape=(6632, 0)) while a minimum of 1 is required by DecisionTreeClassifier.


[I 2024-11-21 03:03:37,631] Trial 19 finished with value: 0.9263059046281169 and parameters: {'fs_method': 'tree', 'criterion': 'gini', 'splitter': 'best', 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9282348501538925.
[I 2024-11-21 03:03:37,679] Trial 20 finished with value: 0.9161753232541585 and parameters: {'fs_method': 'univariate', 'selector__k': 18, 'criterion': 'gini', 'splitter': 'best', 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9282348501538925.
[I 2024-11-21 03:03:38,896] Trial 21 finished with value: 0.9310101497043215 and parameters: {'fs_method': 'tree', 'criterion': 'gini', 'splitter': 'best', 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 21 with value: 0.9310101497043215.
[I 2024-11-21 03:03:40,062] Trial 22 finished with value: 0.9346277314730997 and parameters: {'fs_

Trial 44 failed: Found array with 0 feature(s) (shape=(6632, 0)) while a minimum of 1 is required by DecisionTreeClassifier.


[I 2024-11-21 03:04:04,421] Trial 45 finished with value: 0.9355928222780158 and parameters: {'fs_method': 'tree', 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 45 with value: 0.9355928222780158.
[I 2024-11-21 03:04:04,542] Trial 46 finished with value: 0.7389925624095205 and parameters: {'fs_method': 'pca', 'selector__n_components': 30, 'criterion': 'entropy', 'splitter': 'random', 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 45 with value: 0.9355928222780158.
[I 2024-11-21 03:04:05,724] Trial 47 finished with value: 0.9346277314730995 and parameters: {'fs_method': 'tree', 'criterion': 'entropy', 'splitter': 'best', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 45 with value: 0.9355928222780158.
[I 2024-11-21 03:04:06,890] Trial 48 finished with value: 0.9349897586800366 an

KeyboardInterrupt: 

In [13]:
best_trials['KNN'].params

{'fs_method': 'univariate',
 'selector__k': 29,
 'n_neighbors': 4,
 'weights': 'distance',
 'algorithm': 'ball_tree',
 'metric': 'manhattan'}

In [14]:
for model_name in model_names:
    print(f'{model_name} had best trial with params: {best_trials[model_name].params} and accuracy {best_trials[model_name].value}')

KNN had best trial with params: {'fs_method': 'univariate', 'selector__k': 29, 'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree', 'metric': 'manhattan'} and accuracy 0.9610419025224113


KeyError: 'DTR'