In [1]:
import warnings

from deslib.tests.test_des_integration import voting

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

print("Processing HIGGS dataset...")
try:
    higgs = fetch_openml(name='HIGGS', version=1, as_frame=False)
except Exception as e:
    raise ValueError("Error downloading the HIGGS dataset. Check its availability on OpenML.") from e

X_higgs = higgs.data
y_higgs = pd.to_numeric(higgs.target, errors='coerce').astype(int)

X_higgs = X_higgs[:, ~pd.DataFrame(X_higgs).isna().any(axis=0)]

X_train, X_test, y_train, y_test = train_test_split(
    X_higgs, y_higgs, test_size=0.2, random_state=42
)

X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train,
                                                    test_size=0.2,
                                                    random_state=42)

Processing HIGGS dataset...


In [3]:
len(X_train), len(X_dsel), len(X_test), len(y_train), len(y_dsel), len(y_test)

(62752, 15688, 19610, 62752, 15688, 19610)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf, rf_params, cv=3, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
best_rf.fit(X_train, y_train)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn_params = {'n_neighbors': [3, 5, 7]}
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(knn, knn_params, cv=3, scoring='accuracy', n_jobs=-1)
grid_knn.fit(X_train, y_train)
best_knn = grid_knn.best_estimator_
best_knn.fit(X_train, y_train)

In [10]:
from sklearn.linear_model import LogisticRegression

lr_params = {'C': [0.1, 1, 10], 'max_iter': [100, 200]}
lr = LogisticRegression()
grid_lr = GridSearchCV(lr, lr_params, cv=3, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_
best_lr.fit(X_train, y_train)

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

et_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
et = ExtraTreesClassifier(random_state=42)
grid_et = GridSearchCV(et, et_params, cv=3, scoring='accuracy', n_jobs=-1)
grid_et.fit(X_train, y_train)
best_et = grid_et.best_estimator_
best_et.fit(X_train, y_train)

In [12]:
from sklearn.linear_model import Perceptron
from sklearn.calibration import CalibratedClassifierCV

perceptron = CalibratedClassifierCV(Perceptron(max_iter=100,
                                               random_state=42))
perceptron.fit(X_train, y_train)

In [13]:
from sklearn.naive_bayes import GaussianNB

bayes = GaussianNB()
bayes.fit(X_train, y_train)

In [15]:
clf_pool = [best_rf, best_knn, best_lr, best_et, perceptron, bayes]

In [35]:
from deslib.des import KNORAU, KNORAE, KNOP, DESP, DESKNN

des_methods = {
    "KNORA-E": KNORAE,
    "KNORA-U": KNORAU,
    "KNOP": KNOP,
    "DESP": DESP,
    "DESKNN": DESKNN
}

In [50]:
des_param_grid = {
    "KNOP": {
        "k": [5],
        "DFP": [True, False],
        "knn_classifier": ["knn", None],
        "voting": ['soft', 'hard'],
    },
    "KNORA-E": {
        "k": [5],
        "DFP": [True, False],
        "knn_metric": ["minkowski", "mahalanobis"],
    },
    "KNORA-U": {
        "k": [5],
        "DFP": [True, False],
        "knn_metric": ["minkowski", "mahalanobis"],
    },

    "DESP": {
        "k": [5],
        "DFP": [True, False],
        "knn_metric": ["minkowski", "mahalanobis"],
        "selection_threshold": [0.5, 0.6, 0.7, 0.8],
    },
    "DESKNN": {
        "k": [5],
        "DFP": [True, False],
        "knn_metric": ["minkowski", "mahalanobis"],
        "weights": ["uniform", "distance"],
    }
}

In [51]:
import itertools

best_models = {}

for model_name, param_grid in des_param_grid.items():
    best_acc = 0
    best_params = None
    best_model = None

    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    for params in param_combinations:

        print(model_name, params)

        des_model = des_methods[model_name](pool_classifiers=clf_pool, **params)

        des_model.fit(X_dsel, y_dsel)

        y_pred = des_model.predict(X_test)

        acc = (y_pred == y_test).mean()

        print(acc)

        if acc > best_acc:
            best_acc = acc
            best_params = params
            best_model = des_model

    best_models[model_name] = {
        "model": best_model,
        "params": best_params,
        "score": best_acc
    }

KNOP {'k': 5, 'DFP': True, 'knn_classifier': 'knn', 'voting': 'soft'}
0.614176440591535
KNOP {'k': 5, 'DFP': True, 'knn_classifier': 'knn', 'voting': 'hard'}


  predicted_proba = w_votes / w_votes.sum(axis=1)[:, None]


0.6146863844977053
KNOP {'k': 5, 'DFP': True, 'knn_classifier': None, 'voting': 'soft'}
0.614176440591535
KNOP {'k': 5, 'DFP': True, 'knn_classifier': None, 'voting': 'hard'}


  predicted_proba = w_votes / w_votes.sum(axis=1)[:, None]


0.6146863844977053
KNOP {'k': 5, 'DFP': False, 'knn_classifier': 'knn', 'voting': 'soft'}
0.622284548699643
KNOP {'k': 5, 'DFP': False, 'knn_classifier': 'knn', 'voting': 'hard'}


  predicted_proba = w_votes / w_votes.sum(axis=1)[:, None]


0.617440081591025
KNOP {'k': 5, 'DFP': False, 'knn_classifier': None, 'voting': 'soft'}
0.622284548699643
KNOP {'k': 5, 'DFP': False, 'knn_classifier': None, 'voting': 'hard'}


  predicted_proba = w_votes / w_votes.sum(axis=1)[:, None]


0.617440081591025
KNORA-E {'k': 5, 'DFP': True, 'knn_metric': 'minkowski'}
0.6057623661397247
KNORA-E {'k': 5, 'DFP': True, 'knn_metric': 'mahalanobis'}
0.6036715961244263
KNORA-E {'k': 5, 'DFP': False, 'knn_metric': 'minkowski'}
0.6026007139214686
KNORA-E {'k': 5, 'DFP': False, 'knn_metric': 'mahalanobis'}
0.5977562468128506
KNORA-U {'k': 5, 'DFP': True, 'knn_metric': 'minkowski'}
0.6108108108108108
KNORA-U {'k': 5, 'DFP': True, 'knn_metric': 'mahalanobis'}
0.6106068332483426
KNORA-U {'k': 5, 'DFP': False, 'knn_metric': 'minkowski'}
0.6151963284038756
KNORA-U {'k': 5, 'DFP': False, 'knn_metric': 'mahalanobis'}
0.6105558388577257
DESP {'k': 5, 'DFP': True, 'knn_metric': 'minkowski', 'selection_threshold': 0.5}


TypeError: DESP.__init__() got an unexpected keyword argument 'selection_threshold'

KNORA-E даёт скор E(y_pred==y_test) -

In [30]:
import numpy as np

knarae = KNORAE(random_state=42, k=5, DFP=True, knn_metric="minkowski")
knarae.fit(X_dsel, y_dsel)

y_pred = knarae.predict(X_test)
print((y_pred == y_test).mean())

0.5375318714941356
