In [70]:
"""
author: Dominik Cedro & Michał WIktor Dziak
date: 27th January 2025
description: Feature selection using boruta and cross validation grid search to validate the best classifier among chosen ones.
"""
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [71]:
df = pd.read_csv('data/table.csv')
df.head()

Unnamed: 0,mean_value_ML,mean_value_AP,mean_distance_ML,mean_distance_AP,mean_distance_Radius,maximal_distance_ML,maximal_distance_AP,maximal_distance_Radius,rms_ML,rms_AP,...,critical_displacement_Diffusion_ML,short_time_scaling_Diffusion_ML,long_time_scaling_Diffusion_ML,short_time_diffusion_Diffusion_AP,long_time_diffusion_Diffusion_AP,critical_time_Diffusion_AP,critical_displacement_Diffusion_AP,short_time_scaling_Diffusion_AP,long_time_scaling_Diffusion_AP,Label
0,0.046123,-1.461512,0.385651,0.543167,0.73073,1.990957,3.392738,3.563114,0.486396,0.713827,...,0.387522,0.827853,0.010303,1.509905,0.919846,0.731532,0.927016,0.780243,-0.012418,C
1,0.0425,-0.365777,0.532939,0.484075,0.806206,2.655908,1.882897,2.862772,0.692067,0.611752,...,0.941689,0.824806,-0.005626,1.397278,0.788905,0.703945,0.789882,0.812403,-0.001764,C
2,0.496358,-1.401023,0.364302,0.400104,0.606724,2.190242,1.755362,2.219504,0.481193,0.506953,...,0.279504,0.827185,0.073962,0.812678,0.352416,0.580691,0.334341,0.817032,0.048434,C
3,0.314393,-0.549541,0.573516,0.48697,0.825558,2.559821,1.802601,2.829481,0.721078,0.59938,...,1.037714,0.858537,0.00029,1.850512,0.334793,0.331613,0.268558,0.87433,0.099857,C
4,1.412529,0.186249,0.929037,1.09483,1.59927,3.859579,4.202153,4.317307,1.164596,1.377935,...,2.303989,0.840059,0.028184,9.677386,2.811869,0.427816,2.63117,0.766941,0.039114,C


In [72]:
df["Label"] = pd.factorize(df["Label"])[0] # C = 0     P = 1
df.tail() 

Unnamed: 0,mean_value_ML,mean_value_AP,mean_distance_ML,mean_distance_AP,mean_distance_Radius,maximal_distance_ML,maximal_distance_AP,maximal_distance_Radius,rms_ML,rms_AP,...,critical_displacement_Diffusion_ML,short_time_scaling_Diffusion_ML,long_time_scaling_Diffusion_ML,short_time_diffusion_Diffusion_AP,long_time_diffusion_Diffusion_AP,critical_time_Diffusion_AP,critical_displacement_Diffusion_AP,short_time_scaling_Diffusion_AP,long_time_scaling_Diffusion_AP,Label
100,2.605348,-2.030942,0.335179,0.524191,0.686425,1.416125,1.70297,1.717904,0.431635,0.636464,...,0.110204,0.766259,0.180948,0.727823,0.475327,0.743096,0.452061,0.801942,0.084507,1
101,0.776783,1.222163,0.541288,0.659628,0.945852,2.025473,2.542295,2.836388,0.673406,0.851786,...,0.568117,0.893255,0.05532,1.647572,0.497987,0.426641,0.359597,0.893434,0.191116,1
102,-0.708497,-0.800798,0.466417,0.711986,0.933087,2.423353,3.586807,3.60008,0.606632,0.907559,...,0.650107,0.825616,0.015828,1.722177,0.926328,0.612255,0.859127,0.708744,0.076754,1
103,0.268076,-1.346882,0.930665,0.949494,1.448217,4.199459,3.079528,4.447592,1.158516,1.114526,...,1.823896,0.869994,0.095945,1.884861,0.719936,0.464423,0.507631,0.85523,0.227788,1
104,3.524215,-1.546033,0.291584,0.58036,0.693267,1.135226,2.508233,2.520035,0.359408,0.73194,...,0.154795,0.815621,0.074844,0.756592,0.28068,0.444002,0.212372,0.782388,0.171737,1


In [73]:
forest = RandomForestRegressor(n_jobs = -1,max_depth = 10)

In [74]:
boruta = BorutaPy(estimator = forest, n_estimators = 'auto',max_iter = 50,)

In [75]:
X = df.drop(columns=["Label"])
y = df["Label"]

boruta.fit(X.values, y.values)

green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()
print('Selected Features:', green_area)
print('Blue area features:', blue_area)

Selected Features: ['zero_crossing_SPD_ML', 'zero_crossing_SPD_AP', 'mean_frequency_ML_AND_AP', 'power_frequency_95_Power_Spectrum_Density_ML']
Blue area features: ['frequency_dispersion_Power_Spectrum_Density_AP']


In [80]:
best_feature = green_area[0]

X_train, X_test, y_train, y_test = train_test_split(X[[best_feature]], y, test_size=0.3, random_state=41)

classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.7, 0.8]
    }
}

best_classifiers = {}
for clf_name in classifiers:
    clf = classifiers[clf_name]
    param_grid = param_grids[clf_name]
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_classifiers[clf_name] = grid_search.best_estimator_

scores = {}
for clf_name, clf in best_classifiers.items():
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    scores[clf_name] = score


In [81]:
for clf_name, score in scores.items():
    print(f'{clf_name} Accuracy: {np.mean(score):.4f} ± {np.std(score):.4f}')

RandomForest Accuracy: 0.7124 ± 0.0513
GradientBoosting Accuracy: 0.7267 ± 0.0570


RandomForest Accuracy: 0.7514 ± 0.0763
GradientBoosting Accuracy: 0.7390 ± 0.0527

In [82]:
best_feature = green_area[0]

X_train, X_test, y_train, y_test = train_test_split(X[[best_feature]], y, test_size=0.3, random_state=41)

classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200, 300, 400],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200, 300, 400],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.7, 0.8]
    }
}

best_classifiers = {}
for clf_name in classifiers:
    clf = classifiers[clf_name]
    param_grid = param_grids[clf_name]
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_classifiers[clf_name] = grid_search.best_estimator_

scores = {}
for clf_name, clf in best_classifiers.items():
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    scores[clf_name] = score

for clf_name, score in scores.items():
    print(f'{clf_name} Accuracy: {np.mean(score):.4f} ± {np.std(score):.4f}')


# this fit resulted in following values: 
# RandomForest Accuracy: 0.7514 ± 0.0763
# GradientBoosting Accuracy: 0.7390 ± 0.0527

RandomForest Accuracy: 0.7514 ± 0.0763
GradientBoosting Accuracy: 0.7390 ± 0.0527


In [None]:


# Define the classifiers and their parameter grids
classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVC': SVC(random_state=42)
}

param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200, 300, 400],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200, 300, 400],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.7, 0.8]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200, 300, 400],
        'learning_rate': [0.01, 0.1, 0.2, 0.3]
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'SVC': {
        'C': [0.01, 0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }
}

# Perform grid search for each classifier
best_classifiers = {}
for clf_name in classifiers:
    clf = classifiers[clf_name]
    param_grid = param_grids[clf_name]
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=True)
    grid_search.fit(X_train, y_train)
    best_classifiers[clf_name] = grid_search.best_estimator_

# Evaluate each classifier using cross_val_score
scores = {}
for clf_name, clf in tqdm(best_classifiers.items()):
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    scores[clf_name] = score

# Print the accuracy scores for each classifier
for clf_name, score in scores.items():
    print(f'{clf_name} Accuracy: {np.mean(score):.4f} ± {np.std(score):.4f}')

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
Fitting 5 folds for each of 80 candidates, totalling 400 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
