In [1]:
import os 
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
# NOTE(review): relative chdir — every re-run of this cell climbs one more directory.
# Presumably done so that the aco4ml package in the parent folder is importable — confirm.
os.chdir('..')
# Star import: the ACO names used further down (Antcoder, ACO_Pipeline, Planner, ...) are not
# imported explicitly anywhere in this notebook, so they presumably come from here.
from aco4ml import *
import plotly.express as px
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from time import perf_counter
import warnings


[SwarmPy] NumExpr defaulting to 8 threads.


In [2]:
# Load the digits dataset; its 'data', 'target' and 'images' entries are read by later cells.
digits = load_digits()

In [3]:

# Alternative search spaces kept for reference (not used below).
# params  = { 
#     'n_estimators': [200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini', 'entropy']
# }

#params  = {'penalty' : ['l1', 'none','l2', 'elasticnet'], 'C' : [i/10 for i in range(1,10)], 'max_iter' : [100, 200, 300]}

# Hyper-parameter search space for DecisionTreeClassifier.
# 'auto' has been dropped from 'max_features': for classification trees it was an alias of
# 'sqrt' (so it only duplicated a grid point) and recent scikit-learn releases reject it.
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': list(range(4, 10)),
    'min_samples_leaf': list(range(1, 5)),
    'max_features': ['log2', 'sqrt'],
}

# Synthetic alternative dataset, kept for reference (not used below).
# X = np.random.uniform(size=50000).reshape((1000,50))
# y = (X[:,12]>0.5).astype(int)


X = digits['data']
y = digits['target']

# Contiguous 50/50 split: first half for training, second half for testing.
# NOTE(review): rows are neither shuffled nor stratified here — confirm that is intended.
cut = int(len(y)*.5)
X_train, X_test = X[:cut, :], X[cut:, :]
y_train, y_test = y[:cut], y[cut:]

# Number of feature columns each method is allowed to keep.
n_features_kept = 8

In [4]:

# Antcoder turns the search space and training data into the structure `G` that the
# pipeline runs on, plus the `id2hp` / `hp_map` lookups consumed by the stages below.
G, id2hp, hp_map = Antcoder(params, X_train, y_train)

planner_settings = {
    "alpha": 1.0,
    "beta": 2.0,
    'gamma': 2.0,
    'n_features_kept': n_features_kept,
    'n_hp': len(params),
}

# Stages are instantiated in the same order in which the pipeline lists them.
planner_stage = Planner(planner_settings)
solution_stage = SolutionConstructor(hp_map=hp_map)
daemon_stage = DaemonActions()
updater_stage = BestSoFarPheromonesUpdater(
    X=X_train,
    y=y_train,
    estimator=DecisionTreeClassifier,
    bounds=[.1, 5],
    Q=.05,
    evaporation_rate=.149,
)

aco_pipeline = ACO_Pipeline(
    [
        ("Planner", planner_stage),
        ("Sol", solution_stage),
        ("DA", daemon_stage),
        ("Updater", updater_stage),
    ],
    n_iter=20,
    id2hp=id2hp,
)

In [5]:
# Time the ACO search and the decoding of its best solution; the final refit on the
# selected columns happens after the timer is read and is not part of time_aco.
start = perf_counter()
run_output = aco_pipeline.run(G)
solutions = run_output['solutions']

# Each entry is indexed as [0] = solution path, [1] = score; keep the highest-scoring path.
top_entry = max(solutions, key=lambda candidate: candidate[1])
best_sol = top_entry[0]

# The first n_features_kept ids are used as column indices; the remaining ids are
# looked up in id2hp, whose values are (name, value) pairs fed to dict().
best_cols = best_sol[:n_features_kept]
best_params = dict(id2hp[node_id] for node_id in best_sol[n_features_kept:])
time_aco = perf_counter() - start

lr_aco = DecisionTreeClassifier(**best_params).fit(X_train[:, best_cols], y_train)
print(time_aco)

SwarmPy | Score : [0.77173805]: 100%|██████████| 20/20 [00:10<00:00,  1.84it/s]

10.91176426099264





In [6]:
# Baseline: recursive feature elimination followed by an exhaustive grid search over
# the same `params` space, restricted to the columns RFE kept.
start = perf_counter()
selector = RFE(DecisionTreeClassifier(), n_features_to_select=n_features_kept)
rfe = selector.fit(X_train, y_train)
# Seconds spent on feature selection alone.
middle = perf_counter() - start

support = rfe.get_support()
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    refit=True,
).fit(X_train[:, support], y_train)

# Total elapsed time since `start`; the grid-search share is the difference with `middle`.
time_classic = perf_counter() - start
print(f'Temps FS : {middle} | temps HP : {time_classic-middle}')

Temps FS : 0.39325587498024106 | temps HP : 2.1332163640181534


In [7]:
# 10-fold cross-validation of both configurations on the held-out half, each restricted
# to the columns its own feature-selection step picked.
aco_score = cross_val_score(
    estimator=lr_aco,
    X=X_test[:, best_cols],
    y=y_test,
    cv=10,
)
not_aco_score = cross_val_score(
    estimator=clf.best_estimator_,
    X=X_test[:, support],
    y=y_test,
    cv=10,
)

# Report mean and standard deviation of the per-fold scores.
print(f' ACO : {np.mean(aco_score)=} |  {np.std(aco_score)=} ')
print(f' Not ACO : {np.mean(not_aco_score)=} |  {np.std(not_aco_score)=}')

 ACO : np.mean(aco_score)=0.7096504369538078 |  np.std(aco_score)=0.06796736556046644 
 Not ACO : np.mean(not_aco_score)=0.7186142322097377 |  np.std(not_aco_score)=0.0824414144683014


In [8]:
# Column indices kept by the ACO run (the first n_features_kept entries of its best solution).
best_cols

[20, 21, 26, 27, 36, 38, 43, 44]

In [9]:
# Features kept by RFE, for comparison with the ACO selection.
rfe.get_feature_names_out()

array(['x10', 'x18', 'x21', 'x27', 'x34', 'x36', 'x42', 'x61'],
      dtype=object)

In [10]:
# Hyper-parameters decoded from the best ACO solution.
best_params

{'criterion': 'entropy',
 'splitter': 'best',
 'min_samples_split': 6,
 'min_samples_leaf': 1,
 'max_features': 'log2'}

In [11]:
# Best score recorded by the grid search (fitted on the RFE-selected training columns).
clf.best_score_

0.7093296089385476

In [12]:
# Bar chart of the first-row values of G['v'] for the feature columns, sorted ascending.
# NOTE(review): G['v'][0, :n] presumably holds per-feature pheromone/visibility values — confirm.
feature_values = np.asarray(G['v'][0, :X.shape[1]])

# Stable argsort keeps ties in feature-index order, like the list sort it replaces.
sorted_idx = np.argsort(feature_values, kind='stable')
sorted_values = feature_values[sorted_idx]

# Integer indices become the category labels directly. Going through a float array first
# (as the previous version did for category_orders) produced labels such as '20.0', which
# never matched the '20' labels used on the x axis.
feature_labels = sorted_idx.astype(str)

# Kept for backward compatibility: (index, value) pairs sorted by value.
datapoints = np.column_stack((sorted_idx, sorted_values))

# category_orders must be a dict mapping a column name to the ordered list of its values;
# an array-like passed as `x` is referred to by the column name 'x'.
px.bar(
    x=feature_labels,
    y=sorted_values,
    category_orders={'x': feature_labels.tolist()},
    text_auto=True,
)

In [13]:
# Heat-map of the RFE ranking on the pixel grid, flipped and scaled by the largest rank
# so that lower (better) ranks map to higher values.
rfe_ranks = rfe.ranking_
largest_rank = max(rfe_ranks)
pixel_grid_shape = digits.images[0].shape
px.imshow((largest_rank - rfe_ranks.reshape(pixel_grid_shape)) / largest_rank)

In [14]:
# Heat-map of the first-row values of G['v'] for the feature columns, laid out on the
# same grid as a digit image.
n_feature_columns = X.shape[1]
first_row_values = G['v'][0, :n_feature_columns]
px.imshow(first_row_values.reshape(digits.images[0].shape))