In [5]:
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

In [6]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Download HIGGS from OpenML, clean it, and carve out the train / DSEL / test
# partitions used by the cells below.
print("Processing HIGGS dataset...")
try:
    higgs = fetch_openml(name='HIGGS', version=1, as_frame=False)
except Exception as err:
    # Surface any fetch failure as a ValueError, keeping the original cause chained.
    raise ValueError("Error downloading the HIGGS dataset. Check its availability on OpenML.") from err

# Labels are parsed as numbers and then cast to int.
# NOTE(review): errors='coerce' turns unparsable labels into NaN before the
# int cast — confirm the target never contains such values.
y_higgs = pd.to_numeric(higgs.target, errors='coerce').astype(int)

# Keep only the feature columns in which every entry is present.
# NOTE(review): a column is discarded even if a single row is missing in it;
# check how many rows are affected — dropping those rows might keep more features.
X_higgs = higgs.data
complete_columns = pd.DataFrame(X_higgs).notna().all(axis=0).to_numpy()
X_higgs = X_higgs[:, complete_columns]

# First split: 20% of the samples are held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_higgs,
    y_higgs,
    test_size=0.2,
    random_state=42,
)

# Second split: the remaining 80% is halved into the data the base classifiers
# are trained on (X_train) and the DSEL set the DES methods are fitted on.
X_train, X_dsel, y_train, y_dsel = train_test_split(
    X_train,
    y_train,
    test_size=0.5,
    random_state=42,
)

Processing HIGGS dataset...


In [7]:
# Sample counts per partition: the three feature sets first, then their labels.
tuple(
    len(part)
    for part in (X_train, X_dsel, X_test, y_train, y_dsel, y_test)
)

(39220, 39220, 19610, 39220, 39220, 19610)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest (2 x 3 x 2 = 12 candidates).
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search scored by accuracy, run on all CPU cores.
# refit=True (the default, spelled out here) retrains the winning configuration
# on the whole of X_train once the search is over.
grid_rf = GridSearchCV(rf, rf_params, cv=3, scoring='accuracy', n_jobs=-1,
                       refit=True)
grid_rf.fit(X_train, y_train)

# best_estimator_ is therefore already fitted on (X_train, y_train); calling
# fit on it again would only repeat the same training (random_state is fixed),
# so the redundant second fit has been dropped.
best_rf = grid_rf.best_estimator_
best_rf

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes tried for k-NN.
knn_params = {'n_neighbors': [3, 5, 7]}
knn = KNeighborsClassifier()

# 3-fold cross-validated grid search scored by accuracy, run on all CPU cores.
# refit=True (the default, spelled out here) retrains the winning configuration
# on the whole of X_train once the search is over.
grid_knn = GridSearchCV(knn, knn_params, cv=3, scoring='accuracy', n_jobs=-1,
                        refit=True)
grid_knn.fit(X_train, y_train)

# best_estimator_ is therefore already fitted on (X_train, y_train); the
# redundant second fit has been dropped.
best_knn = grid_knn.best_estimator_
best_knn

In [10]:
from sklearn.linear_model import LogisticRegression

# Regularisation strengths and iteration caps tried for logistic regression
# (3 x 2 = 6 candidates).
lr_params = {'C': [0.1, 1, 10], 'max_iter': [100, 200]}
lr = LogisticRegression()

# 3-fold cross-validated grid search scored by accuracy, run on all CPU cores.
# refit=True (the default, spelled out here) retrains the winning configuration
# on the whole of X_train once the search is over.
grid_lr = GridSearchCV(lr, lr_params, cv=3, scoring='accuracy', n_jobs=-1,
                       refit=True)
grid_lr.fit(X_train, y_train)

# best_estimator_ is therefore already fitted on (X_train, y_train); the
# redundant second fit has been dropped.
best_lr = grid_lr.best_estimator_
best_lr

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

# Hyper-parameter grid for the extra-trees ensemble (2 x 3 x 2 = 12 candidates).
et_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
et = ExtraTreesClassifier(random_state=42)

# 3-fold cross-validated grid search scored by accuracy, run on all CPU cores.
# refit=True (the default, spelled out here) retrains the winning configuration
# on the whole of X_train once the search is over.
grid_et = GridSearchCV(et, et_params, cv=3, scoring='accuracy', n_jobs=-1,
                       refit=True)
grid_et.fit(X_train, y_train)

# best_estimator_ is therefore already fitted on (X_train, y_train); calling
# fit on it again would only repeat the same training (random_state is fixed),
# so the redundant second fit has been dropped.
best_et = grid_et.best_estimator_
best_et

In [12]:
from sklearn.linear_model import Perceptron
from sklearn.calibration import CalibratedClassifierCV

# A seeded Perceptron capped at 100 iterations, wrapped in CalibratedClassifierCV.
# NOTE(review): presumably wrapped so this pool member can output calibrated
# probabilities for the DES methods — confirm.
perceptron_base = Perceptron(max_iter=100, random_state=42)
perceptron = CalibratedClassifierCV(perceptron_base)
perceptron.fit(X=X_train, y=y_train)

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the same split as the
# other pool members.
bayes = GaussianNB()
bayes.fit(X=X_train, y=y_train)

In [15]:
# Pool of already-fitted base classifiers handed to every DES/DCS method.
clf_pool = [
    best_rf,      # tuned random forest
    best_knn,     # tuned k-nearest neighbours
    best_lr,      # tuned logistic regression
    best_et,      # tuned extra trees
    perceptron,   # calibrated perceptron
    bayes,        # Gaussian naive Bayes
]

In [16]:
from deslib.des import KNORAU, KNORAE, METADES, DESP
from deslib.dcs import OLA, MCB

# One selector instance per method, keyed by its display name; every selector
# receives the same classifier pool. Insertion order drives the order of the
# fitting and evaluation loops.
des_methods = {
    label: selector(pool_classifiers=clf_pool)
    for label, selector in [
        ("KNORA-U", KNORAU),
        ("KNORA-E", KNORAE),
        ("META-DES", METADES),
        ("DESP", DESP),
        ("OLA", OLA),
        ("MCB", MCB),
    ]
}

In [17]:
# Fit every selection method on the DSEL partition (the method name is not
# needed here, so only the instances are iterated).
for des in des_methods.values():
    des.fit(X_dsel, y_dsel)

# Accuracy per method name; filled in by the evaluation loop.
results = {}

In [19]:
# Score each fitted method on the held-out test set and report its accuracy.
for name, des in des_methods.items():
    y_pred = des.predict(X_test)
    # Accuracy = fraction of test samples whose prediction equals the label.
    accuracy = results[name] = (y_pred == y_test).mean()
    print(f"{name}: {accuracy:.4f}")

KNORA-U: 0.6105
KNORA-E: 0.6015
META-DES: 0.6016
DESP: 0.6129
OLA: 0.6036
MCB: 0.5958
