In [1]:
from sklearn.utils import all_estimators
from sklearn.base import ClassifierMixin
from sklearn.linear_model import LogisticRegression
from utilities import *
from sklearn.ensemble import RandomForestClassifier
from inspect import signature
import csv
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix

In [2]:
# Load the train/test feature matrices and labels, plus two pass-through
# objects, from the named data sub-directory. `prepare_data` comes from the
# star-import of `utilities`.
# NOTE(review): the pass-through values are presumably pandas frames carrying
# columns that are not used as features -- confirm against `utilities`.
(
    X_train,
    X_test,
    y_train,
    y_test,
    pd_passthrough_train,
    pd_passthrough_test,
) = prepare_data(
    data_subdir="ZMUMU_EGZ_extended_reduced_ET",
    format_mode="reduced_SuperCell_ET",
)

found preprepared data in ..\data\ZMUMU_EGZ_extended_reduced_ET


In [3]:
# Enumerate every classifier scikit-learn exposes and keep the classes that
# can be instantiated and provide a `predict` method.
all_estimators_list = all_estimators(type_filter="classifier")

binary_classifiers = {}
default_base_estimator = LogisticRegression()


def _wrap_default_estimator(cls):
    """Build a meta-estimator that takes one wrapped model via `estimator=`."""
    return cls(estimator=default_base_estimator)


def _wrap_single_lr_ensemble(cls):
    """Build an ensemble meta-estimator from a one-element `estimators` list."""
    return cls(estimators=[("lr", LogisticRegression())])


# Meta-estimators cannot be constructed without arguments; map each one to a
# callable that supplies the argument it requires. Any classifier not listed
# here is instantiated with its defaults.
meta_estimator_builders = {
    "ClassifierChain": lambda cls: cls(base_estimator=default_base_estimator),
    "StackingClassifier": _wrap_single_lr_ensemble,
    "VotingClassifier": _wrap_single_lr_ensemble,
}
meta_estimator_builders.update(
    dict.fromkeys(
        (
            "MultiOutputClassifier",
            "OneVsOneClassifier",
            "OneVsRestClassifier",
            "OutputCodeClassifier",
            "FixedThresholdClassifier",
            "TunedThresholdClassifierCV",
        ),
        _wrap_default_estimator,
    )
)

for name, Classifier in all_estimators_list:
    try:
        build = meta_estimator_builders.get(name)
        clf = Classifier() if build is None else build(Classifier)

        # Only the class is stored; the instance is a throw-away probe.
        if hasattr(clf, "predict"):
            binary_classifiers[name] = Classifier
            # Also expose the class under its own name in the notebook namespace.
            globals()[name] = Classifier
    except Exception as e:
        print(f"Could not import {name}: {e}")

# Summary of what was collected.
print(f"Imported {len(binary_classifiers)} binary classifiers:")
print(list(binary_classifiers.keys()))


Imported 42 binary classifiers:
['AdaBoostClassifier', 'BaggingClassifier', 'BernoulliNB', 'CalibratedClassifierCV', 'CategoricalNB', 'ClassifierChain', 'ComplementNB', 'DecisionTreeClassifier', 'DummyClassifier', 'ExtraTreeClassifier', 'ExtraTreesClassifier', 'FixedThresholdClassifier', 'GaussianNB', 'GaussianProcessClassifier', 'GradientBoostingClassifier', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LabelPropagation', 'LabelSpreading', 'LinearDiscriminantAnalysis', 'LinearSVC', 'LogisticRegression', 'LogisticRegressionCV', 'MLPClassifier', 'MultiOutputClassifier', 'MultinomialNB', 'NearestCentroid', 'NuSVC', 'OneVsOneClassifier', 'OneVsRestClassifier', 'OutputCodeClassifier', 'PassiveAggressiveClassifier', 'Perceptron', 'QuadraticDiscriminantAnalysis', 'RadiusNeighborsClassifier', 'RandomForestClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'SVC', 'TunedThresholdClassifierCV', 'VotingClassifier']


In [4]:

# Seed shared by every estimator that accepts `random_state`, so that a
# Restart & Run All reproduces the same table.
RANDOM_STATE = 42

# Metric columns recorded per classifier; they hold "NULL" when evaluation fails.
METRIC_COLUMNS = ("Accuracy", "MSE", "TN", "FP", "FN", "TP")

# Meta-estimators that wrap a single model passed as `estimator=`.
SINGLE_ESTIMATOR_WRAPPERS = frozenset({
    "MultiOutputClassifier",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "FixedThresholdClassifier",
    "TunedThresholdClassifierCV",
})
# Meta-estimators that combine a list of named models passed as `estimators=`.
ENSEMBLE_WRAPPERS = frozenset({"StackingClassifier", "VotingClassifier"})


def build_classifier(name, Classifier, seed=RANDOM_STATE):
    """Instantiate ``Classifier`` with the arguments it needs to be usable.

    Parameters
    ----------
    name : str
        Class name, used to recognise meta-estimators that require a wrapped
        model.
    Classifier : type
        The estimator class to instantiate.
    seed : int, default RANDOM_STATE
        Passed as ``random_state`` to the estimator whenever its constructor
        accepts that parameter, and to the random forest used inside the
        ensemble meta-estimators.

    Returns
    -------
    An unfitted estimator instance.

    Raises
    ------
    Whatever the estimator constructor or ``inspect.signature`` raises.
    """
    kwargs = {}
    if name == "ClassifierChain":
        kwargs["base_estimator"] = LogisticRegression()
    elif name in SINGLE_ESTIMATOR_WRAPPERS:
        kwargs["estimator"] = LogisticRegression()
    elif name in ENSEMBLE_WRAPPERS:
        kwargs["estimators"] = [
            ("lr", LogisticRegression()),
            # Seeded so the ensemble rows are reproducible like the others.
            ("rf", RandomForestClassifier(random_state=seed)),
        ]

    # Seed plain classifiers and meta-estimators alike; previously only the
    # plain ones were seeded.
    if "random_state" in signature(Classifier).parameters:
        kwargs["random_state"] = seed

    return Classifier(**kwargs)


# NOTE(review): the recorded output shows ClassifierChain and
# MultiOutputClassifier failing on this target (the latter reports that y must
# have at least two dimensions), so they end up as NULL rows. The repeated
# ConvergenceWarning comes from LogisticRegression reaching its iteration limit.
results = []
for name, Classifier in binary_classifiers.items():
    # Start from an all-NULL row and fill it in on success, so each classifier
    # contributes exactly one row even if a late step (e.g. reporting) raises.
    row = {"Classifier": name, **dict.fromkeys(METRIC_COLUMNS, "NULL")}
    try:
        clf = build_classifier(name, Classifier)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Compute every metric before touching `row`, so a failure in any of
        # them leaves the row fully NULL rather than partially filled.
        accuracy = accuracy_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        row.update({
            "Accuracy": accuracy,
            "MSE": mse,
            "TN": tn,
            "FP": fp,
            "FN": fn,
            "TP": tp,
        })

        evaluate_sklearn_model(y_test, y_pred, show_CR=False, show_MSE=True, model_name=f'{name}')
    except Exception as e:
        # Best-effort benchmark: report the failure and move on.
        print(f"Could not evaluate {name}: {e}\n")
    results.append(row)




Evaluation of AdaBoostClassifier
Accuracy: 0.9219
Confusion Matrix:
 [[26915  1695]
 [ 1912 15660]]
Mean Squared Error: 0.0781

Evaluation of BaggingClassifier
Accuracy: 0.9559
Confusion Matrix:
 [[27488  1122]
 [  915 16657]]
Mean Squared Error: 0.0441

Evaluation of BernoulliNB
Accuracy: 0.7006
Confusion Matrix:
 [[16392 12218]
 [ 1610 15962]]
Mean Squared Error: 0.2994

Evaluation of CalibratedClassifierCV
Accuracy: 0.9334
Confusion Matrix:
 [[26930  1680]
 [ 1394 16178]]
Mean Squared Error: 0.0666

Could not evaluate CategoricalNB: index 114 is out of bounds for axis 1 with size 56

Could not evaluate ClassifierChain: tuple index out of range

Evaluation of ComplementNB
Accuracy: 0.7639
Confusion Matrix:
 [[18520 10090]
 [  814 16758]]
Mean Squared Error: 0.2361

Evaluation of DecisionTreeClassifier
Accuracy: 0.9340
Confusion Matrix:
 [[26978  1632]
 [ 1416 16156]]
Mean Squared Error: 0.0660

Evaluation of DummyClassifier
Accuracy: 0.6195
Confusion Matrix:
 [[28610     0]
 [17572  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of FixedThresholdClassifier
Accuracy: 0.9348
Confusion Matrix:
 [[26953  1657]
 [ 1352 16220]]
Mean Squared Error: 0.0652

Evaluation of GaussianNB
Accuracy: 0.6982
Confusion Matrix:
 [[15500 13110]
 [  829 16743]]
Mean Squared Error: 0.3018

Could not evaluate GaussianProcessClassifier: Unable to allocate 254. GiB for an array with shape (184728, 184728) and data type float64

Evaluation of GradientBoostingClassifier
Accuracy: 0.9385
Confusion Matrix:
 [[27006  1604]
 [ 1234 16338]]
Mean Squared Error: 0.0615

Evaluation of HistGradientBoostingClassifier
Accuracy: 0.9651
Confusion Matrix:
 [[27453  1157]
 [  455 17117]]
Mean Squared Error: 0.0349

Evaluation of KNeighborsClassifier
Accuracy: 0.9375
Confusion Matrix:
 [[26079  2531]
 [  355 17217]]
Mean Squared Error: 0.0625

Could not evaluate LabelPropagation: Unable to allocate 254. GiB for an array with shape (184728, 184728) and data type float64

Could not evaluate LabelSpreading: Unable to allocate 254. GiB for an arr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of LogisticRegression
Accuracy: 0.9348
Confusion Matrix:
 [[26953  1657]
 [ 1352 16220]]
Mean Squared Error: 0.0652



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of LogisticRegressionCV
Accuracy: 0.9347
Confusion Matrix:
 [[26940  1670]
 [ 1346 16226]]
Mean Squared Error: 0.0653

Evaluation of MLPClassifier
Accuracy: 0.9682
Confusion Matrix:
 [[27693   917]
 [  552 17020]]
Mean Squared Error: 0.0318

Could not evaluate MultiOutputClassifier: y must have at least two dimensions for multi-output regression but has only one.

Evaluation of MultinomialNB
Accuracy: 0.7795
Confusion Matrix:
 [[19332  9278]
 [  903 16669]]
Mean Squared Error: 0.2205

Evaluation of NearestCentroid
Accuracy: 0.8401
Confusion Matrix:
 [[24574  4036]
 [ 3350 14222]]
Mean Squared Error: 0.1599

Evaluation of NuSVC
Accuracy: 0.8969
Confusion Matrix:
 [[26097  2513]
 [ 2247 15325]]
Mean Squared Error: 0.1031



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of OneVsOneClassifier
Accuracy: 0.9348
Confusion Matrix:
 [[26953  1657]
 [ 1352 16220]]
Mean Squared Error: 0.0652



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of OneVsRestClassifier
Accuracy: 0.9348
Confusion Matrix:
 [[26953  1657]
 [ 1352 16220]]
Mean Squared Error: 0.0652



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of OutputCodeClassifier
Accuracy: 0.9348
Confusion Matrix:
 [[26953  1657]
 [ 1352 16220]]
Mean Squared Error: 0.0652

Evaluation of PassiveAggressiveClassifier
Accuracy: 0.7843
Confusion Matrix:
 [[27817   793]
 [ 9167  8405]]
Mean Squared Error: 0.2157

Evaluation of Perceptron
Accuracy: 0.7738
Confusion Matrix:
 [[27482  1128]
 [ 9320  8252]]
Mean Squared Error: 0.2262

Evaluation of QuadraticDiscriminantAnalysis
Accuracy: 0.7063
Confusion Matrix:
 [[15867 12743]
 [  820 16752]]
Mean Squared Error: 0.2937

Could not evaluate RadiusNeighborsClassifier: No neighbors found for test samples array([    2,     3,     4, ..., 46179, 46180, 46181]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.

Evaluation of RandomForestClassifier
Accuracy: 0.9672
Confusion Matrix:
 [[27542  1068]
 [  446 17126]]
Mean Squared Error: 0.0328

Evaluation of RidgeClassifier
Accuracy: 0.8728
Confusion Matrix:
 [[26391  2219]
 [ 3657 1391

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of TunedThresholdClassifierCV
Accuracy: 0.9357
Confusion Matrix:
 [[26383  2227]
 [  742 16830]]
Mean Squared Error: 0.0643



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of VotingClassifier
Accuracy: 0.9497
Confusion Matrix:
 [[27806   804]
 [ 1518 16054]]
Mean Squared Error: 0.0503



In [5]:
# Write one CSV row per evaluated classifier, with a header line first.
output_file = "sklearn_all_results_reduced.csv"
csv_columns = ["Classifier", "Accuracy", "MSE", "TN", "FP", "FN", "TP"]

# newline="" is passed as the csv module expects for files it writes to.
with open(output_file, mode="w", newline="") as csv_handle:
    table_writer = csv.DictWriter(csv_handle, fieldnames=csv_columns)
    table_writer.writeheader()
    for result_row in results:
        table_writer.writerow(result_row)

print(f"Results saved to {output_file}")

Results saved to sklearn_all_results_reduced.csv
