In [1]:
import openml
import tpot2
import sklearn.metrics
import sklearn
from sklearn.metrics import (roc_auc_score, log_loss)
import traceback
import dill as pickle
import os
import time
import numpy as np
import sklearn.model_selection
import pandas as pd

from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
def load_task(task_id, preprocess=True):

    cached_data_path = f"data/{task_id}_{preprocess}.pkl"
    print(cached_data_path)
    if os.path.exists(cached_data_path):
        d = pickle.load(open(cached_data_path, "rb"))
        X_train, y_train, X_test, y_test = d['X_train'], d['y_train'], d['X_test'], d['y_test']
    else:
        task = openml.tasks.get_task(task_id)


        X, y = task.get_X_and_y(dataset_format="dataframe")
        train_indices, test_indices = task.get_train_test_split_indices()
        X_train = X.iloc[train_indices]
        y_train = y.iloc[train_indices]
        X_test = X.iloc[test_indices]
        y_test = y.iloc[test_indices]

        if preprocess:
            preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.001, handle_unknown="ignore"))
            X_train = preprocessing_pipeline.fit_transform(X_train)
            X_test = preprocessing_pipeline.transform(X_test)


            le = sklearn.preprocessing.LabelEncoder()
            y_train = le.fit_transform(y_train)
            y_test = le.transform(y_test)

            X_train = X_train.to_numpy()
            X_test = X_test.to_numpy()

            if task_id == 168795: #this task does not have enough instances of two classes for 10 fold CV. This function samples the data to make sure we have at least 10 instances of each class
                indices = [28535, 28535, 24187, 18736,  2781]
                y_train = np.append(y_train, y_train[indices])
                X_train = np.append(X_train, X_train[indices], axis=0)

            d = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}
            if not os.path.exists("data"):
                os.makedirs("data")
            with open(cached_data_path, "wb") as f:
                pickle.dump(d, f)

    return X_train, y_train, X_test, y_test

In [3]:
def create_stacking_clf(pareto_front):
    estimators = []
    highest_accuracy = 0
    for i in range(len(pareto_front)):
        fitted_pipeline = pareto_front.iloc[i, 10].fit(X_train, y_train)
        pipeline_accuracy = accuracy_score(y_test, fitted_pipeline.predict(X_test))

        if pipeline_accuracy > highest_accuracy:
            highest_accuracy = pipeline_accuracy
        
        fitted_pipeline_tuple = ((str(i), fitted_pipeline))
        estimators.append(fitted_pipeline_tuple)

    stacking_classifier = StackingClassifier(estimators=estimators, 
                                         final_estimator=VotingClassifier(estimators=estimators, voting='hard'), 
                                         cv="prefit",
                                         passthrough=False)
    
    stacking_classifier.fit(X_train, y_train) 

    return stacking_classifier, highest_accuracy

In [4]:
def create_stacking_clf_gridsearch(pareto_front):
    estimators = []
    highest_accuracy = 0
    for i in range(len(pareto_front)):
        fitted_pipeline = pareto_front.iloc[i, 10].fit(X_train, y_train)
        pipeline_accuracy = accuracy_score(y_test, fitted_pipeline.predict(X_test))

        if pipeline_accuracy > highest_accuracy:
            highest_accuracy = pipeline_accuracy
        
        fitted_pipeline_tuple = ((str(i), fitted_pipeline))
        estimators.append(fitted_pipeline_tuple)

    ensemble_model_params = {
        'passthrough': [True, False],  
        'final_estimator': [
            VotingClassifier(estimators=estimators, voting='soft'), 
            VotingClassifier(estimators=estimators, voting='hard'),
            #LogisticRegression()
        ]
    }

    # print all 4 ensemble options to the final table
    
    ensemble_grid_search = GridSearchCV(estimator=StackingClassifier(estimators=estimators, cv="prefit"), 
                                        param_grid=ensemble_model_params, 
                                        cv=5)


    ensemble_grid_search.fit(X_train, y_train)
    
    return ensemble_grid_search, highest_accuracy

## Without grid search

In [5]:
# task_ids = [167104, 167184, 167168, 167161, 189905]
# results = []

# for task_id in task_ids:
#     X_train, y_train, X_test, y_test = load_task(task_id, preprocess=True)
#     individual_highest_accuracy = 0
#     est = tpot2.TPOTEstimator(generations=5, population_size=5, cv=5, 
#                           random_state=42, verbose=2, classification=True, scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer], 
#                           scorers_weights=[1,-1])
#     est.fit(X_train, y_train)
#     fitted_ensemble, individual_highest_accuracy = create_stacking_clf(est.pareto_front)

#     ensemble_accuracy = accuracy_score(y_test, fitted_ensemble.predict(X_test))

#     results.append({"task id": task_id, "individual": individual_highest_accuracy, "ensemble": ensemble_accuracy})

# print("Without grid search")
# results_df = pd.DataFrame(results)
# results_df

## With grid search

In [8]:
task_ids = [167104, 167184, 167168, 167161, 189905]
results = []
num_runs = 3

for task_id in task_ids:
    for i in range(num_runs):
        X_train, y_train, X_test, y_test = load_task(task_id, preprocess=True)
        individual_highest_accuracy = 0
        est = tpot2.TPOTEstimator(generations=5, population_size=5, cv=5, 
                              random_state=30+i, verbose=2, classification=True, scorers=['roc_auc_ovr',tpot2.objectives.complexity_scorer], 
                              scorers_weights=[1,-1])
        est.fit(X_train, y_train)
        fitted_ensemble, individual_highest_accuracy = create_stacking_clf_gridsearch(est.pareto_front)
    
        ensemble_accuracy = accuracy_score(y_test, fitted_ensemble.predict(X_test))
    
        results.append({"task id": task_id, 
                        "individual": individual_highest_accuracy, 
                        "ensemble": ensemble_accuracy, 
                        "better": (ensemble_accuracy>individual_highest_accuracy),
                        "random seed": 30+i,
                        "run #": i
                       })

print("With grid search")
results_df = pd.DataFrame(results)
results_df

data/167104_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:21<00:00,  4.26s/it]
  f = msb / msw
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py", line 695, in predict
    y_pred = super().predict(X, **predict_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/l

data/167104_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:36<00:00,  7.32s/it]


data/167104_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [01:24<00:00, 16.89s/it]
10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py", line 672, in fit
    return super().fit(X, y_encoded, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *arg

data/167184_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:16<00:00,  3.21s/it]


data/167184_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:07<00:00,  1.54s/it]


data/167184_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:07<00:00,  1.44s/it]


data/167168_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:15<00:00,  3.08s/it]


data/167168_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:46<00:00,  9.26s/it]


data/167168_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [01:10<00:00, 14.19s/it]


data/167161_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:25<00:00,  5.07s/it]


data/167161_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:52<00:00, 10.41s/it]


data/167161_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [02:14<00:00, 26.86s/it]
10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py", line 672, in fit
    return super().fit(X, y_encoded, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *arg

data/189905_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:26<00:00,  5.31s/it]


data/189905_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:30<00:00,  6.06s/it]


data/189905_True.pkl


Generation: 100%|█████████████████████████████████| 5/5 [00:47<00:00,  9.57s/it]


With grid search


Unnamed: 0,task id,individual,ensemble,better,random seed,run #
0,167104,0.845815,0.845815,False,30,0
1,167104,0.85022,0.837004,False,31,1
2,167104,0.837004,0.845815,True,32,2
3,167184,0.800813,0.800813,False,30,0
4,167184,0.845528,0.841463,False,31,1
5,167184,0.784553,0.747967,False,32,2
6,167168,0.72043,0.774194,True,30,0
7,167168,0.81362,0.716846,False,31,1
8,167168,0.802867,0.810036,True,32,2
9,167161,0.7,0.678788,False,30,0


In [9]:
num_better = results_df['better'].sum()

print("Percent that are better", num_better / results_df['better'].count())

Percent that are better 0.3333333333333333
