In [1]:
from sklearn.utils import all_estimators
from sklearn.base import ClassifierMixin
from sklearn.linear_model import LogisticRegression
from utilities import *
from xgboost import XGBClassifier
from inspect import signature
import csv
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
X_train, X_test, y_train, y_test, pd_passthrough_train, pd_passthrough_test = prepare_data()

found preprepared data in ..\data\ZMUMU_EGZ_extended_np_pd


In [3]:
all_estimators_list = all_estimators(type_filter="classifier")

binary_classifiers = {}
default_base_estimator = LogisticRegression()

for name, Classifier in all_estimators_list:
    try:
        # Handle meta-estimators requiring base estimators or parameters
        if name in [
            "ClassifierChain",
            "FixedThresholdClassifier",
            "MultiOutputClassifier",
            "OneVsOneClassifier",
            "OneVsRestClassifier",
            "OutputCodeClassifier",
            "StackingClassifier",
            "TunedThresholdClassifierCV",
            "VotingClassifier",
        ]:
            if "ClassifierChain" == name:
                clf = Classifier(base_estimator=default_base_estimator)
            elif name in [
                "MultiOutputClassifier",
                "OneVsOneClassifier",
                "OneVsRestClassifier",
                "OutputCodeClassifier",
                "FixedThresholdClassifier",
                "TunedThresholdClassifierCV",
            ]:
                clf = Classifier(estimator=default_base_estimator)
            elif name == "StackingClassifier":
                clf = Classifier(estimators=[("lr", LogisticRegression())])
            elif name == "VotingClassifier":
                clf = Classifier(estimators=[("lr", LogisticRegression())])
        else:
            # Instantiate directly for non-meta classifiers
            clf = Classifier()

        if hasattr(clf, "predict"):  # Check for predict method
            binary_classifiers[name] = Classifier
            globals()[name] = Classifier  # Dynamically add to global namespace
    except Exception as e:
        print(f"Could not import {name}: {e}")

# Verify imported classifiers
print(f"Imported {len(binary_classifiers)} binary classifiers:")
print(list(binary_classifiers.keys()))


Imported 42 binary classifiers:
['AdaBoostClassifier', 'BaggingClassifier', 'BernoulliNB', 'CalibratedClassifierCV', 'CategoricalNB', 'ClassifierChain', 'ComplementNB', 'DecisionTreeClassifier', 'DummyClassifier', 'ExtraTreeClassifier', 'ExtraTreesClassifier', 'FixedThresholdClassifier', 'GaussianNB', 'GaussianProcessClassifier', 'GradientBoostingClassifier', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LabelPropagation', 'LabelSpreading', 'LinearDiscriminantAnalysis', 'LinearSVC', 'LogisticRegression', 'LogisticRegressionCV', 'MLPClassifier', 'MultiOutputClassifier', 'MultinomialNB', 'NearestCentroid', 'NuSVC', 'OneVsOneClassifier', 'OneVsRestClassifier', 'OutputCodeClassifier', 'PassiveAggressiveClassifier', 'Perceptron', 'QuadraticDiscriminantAnalysis', 'RadiusNeighborsClassifier', 'RandomForestClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'SVC', 'TunedThresholdClassifierCV', 'VotingClassifier']


In [6]:

# binary_classifiers_short = dict(list(binary_classifiers.items())[:10])
results = []
for name, Classifier in binary_classifiers.items():
    try:
        if name in [
            "ClassifierChain",
            "FixedThresholdClassifier",
            "MultiOutputClassifier",
            "OneVsOneClassifier",
            "OneVsRestClassifier",
            "OutputCodeClassifier",
            "StackingClassifier",
            "TunedThresholdClassifierCV",
            "VotingClassifier",
        ]:
            if name == "ClassifierChain":
                clf = Classifier(base_estimator=LogisticRegression())
            elif name in [
                "MultiOutputClassifier",
                "OneVsOneClassifier",
                "OneVsRestClassifier",
                "OutputCodeClassifier",
                "FixedThresholdClassifier",
                "TunedThresholdClassifierCV",
            ]:
                clf = Classifier(estimator=LogisticRegression())
            elif name == "StackingClassifier":
                clf = Classifier(estimators=[("lr", LogisticRegression()), ("rf", RandomForestClassifier())])
            elif name == "VotingClassifier":
                clf = Classifier(estimators=[("lr", LogisticRegression()), ("rf", RandomForestClassifier())])
        else:
            # Check if random_state is a parameter and set it if available
            params = signature(Classifier).parameters
            if "random_state" in params:
                clf = Classifier(random_state=42)
            else:
                clf = Classifier()

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Save results
        results.append({
            "Classifier": name,
            "Accuracy": accuracy,
            "MSE": mse,
            "TN": tn,
            "FP": fp,
            "FN": fn,
            "TP": tp,
        })

        evaluate_sklearn_model(y_test, y_pred, show_CR=False, show_MSE=True, model_name=f'{name}')
    except Exception as e:
        # On error, save classifier name with NULL values
        print(f"Could not evaluate {name}: {e}\n")
        results.append({
            "Classifier": name,
            "Accuracy": "NULL",
            "MSE": "NULL",
            "TN": "NULL",
            "FP": "NULL",
            "FN": "NULL",
            "TP": "NULL",
            })




Evaluation of AdaBoostClassifier
Accuracy: 0.9277
Confusion Matrix:
 [[27143  1467]
 [ 1873 15699]]
Mean Squared Error: 0.0723

Evaluation of BaggingClassifier
Accuracy: 0.9565
Confusion Matrix:
 [[27458  1152]
 [  859 16713]]
Mean Squared Error: 0.0435

Evaluation of BernoulliNB
Accuracy: 0.7146
Confusion Matrix:
 [[16632 11978]
 [ 1204 16368]]
Mean Squared Error: 0.2854

Evaluation of CalibratedClassifierCV
Accuracy: 0.9410
Confusion Matrix:
 [[26967  1643]
 [ 1082 16490]]
Mean Squared Error: 0.0590

Could not evaluate CategoricalNB: index 92 is out of bounds for axis 1 with size 30

Could not evaluate ClassifierChain: tuple index out of range

Evaluation of ComplementNB
Accuracy: 0.7709
Confusion Matrix:
 [[18389 10221]
 [  358 17214]]
Mean Squared Error: 0.2291

Evaluation of DecisionTreeClassifier
Accuracy: 0.9345
Confusion Matrix:
 [[27002  1608]
 [ 1419 16153]]
Mean Squared Error: 0.0655

Evaluation of DummyClassifier
Accuracy: 0.6195
Confusion Matrix:
 [[28610     0]
 [17572   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of FixedThresholdClassifier
Accuracy: 0.9413
Confusion Matrix:
 [[26979  1631]
 [ 1078 16494]]
Mean Squared Error: 0.0587

Evaluation of GaussianNB
Accuracy: 0.7167
Confusion Matrix:
 [[16231 12379]
 [  705 16867]]
Mean Squared Error: 0.2833

Could not evaluate GaussianProcessClassifier: Unable to allocate 254. GiB for an array with shape (184728, 184728) and data type float64

Evaluation of GradientBoostingClassifier
Accuracy: 0.9427
Confusion Matrix:
 [[27073  1537]
 [ 1109 16463]]
Mean Squared Error: 0.0573

Evaluation of HistGradientBoostingClassifier
Accuracy: 0.9672
Confusion Matrix:
 [[27499  1111]
 [  402 17170]]
Mean Squared Error: 0.0328

Evaluation of KNeighborsClassifier
Accuracy: 0.9379
Confusion Matrix:
 [[26070  2540]
 [  328 17244]]
Mean Squared Error: 0.0621

Could not evaluate LabelPropagation: Unable to allocate 254. GiB for an array with shape (184728, 184728) and data type float64

Could not evaluate LabelSpreading: Unable to allocate 254. GiB for an arr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of LogisticRegression
Accuracy: 0.9413
Confusion Matrix:
 [[26979  1631]
 [ 1078 16494]]
Mean Squared Error: 0.0587



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of LogisticRegressionCV
Accuracy: 0.9418
Confusion Matrix:
 [[26982  1628]
 [ 1059 16513]]
Mean Squared Error: 0.0582

Evaluation of MLPClassifier
Accuracy: 0.9683
Confusion Matrix:
 [[27617   993]
 [  472 17100]]
Mean Squared Error: 0.0317

Could not evaluate MultiOutputClassifier: y must have at least two dimensions for multi-output regression but has only one.

Evaluation of MultinomialNB
Accuracy: 0.7840
Confusion Matrix:
 [[19027  9583]
 [  392 17180]]
Mean Squared Error: 0.2160

Evaluation of NearestCentroid
Accuracy: 0.8518
Confusion Matrix:
 [[24885  3725]
 [ 3118 14454]]
Mean Squared Error: 0.1482

Evaluation of NuSVC
Accuracy: 0.9017
Confusion Matrix:
 [[26004  2606]
 [ 1934 15638]]
Mean Squared Error: 0.0983



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of OneVsOneClassifier
Accuracy: 0.9413
Confusion Matrix:
 [[26979  1631]
 [ 1078 16494]]
Mean Squared Error: 0.0587



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of OneVsRestClassifier
Accuracy: 0.9413
Confusion Matrix:
 [[26979  1631]
 [ 1078 16494]]
Mean Squared Error: 0.0587



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of OutputCodeClassifier
Accuracy: 0.9413
Confusion Matrix:
 [[26979  1631]
 [ 1078 16494]]
Mean Squared Error: 0.0587

Evaluation of PassiveAggressiveClassifier
Accuracy: 0.8791
Confusion Matrix:
 [[27468  1142]
 [ 4440 13132]]
Mean Squared Error: 0.1209

Evaluation of Perceptron
Accuracy: 0.9182
Confusion Matrix:
 [[26234  2376]
 [ 1402 16170]]
Mean Squared Error: 0.0818

Evaluation of QuadraticDiscriminantAnalysis
Accuracy: 0.7250
Confusion Matrix:
 [[16751 11859]
 [  842 16730]]
Mean Squared Error: 0.2750

Could not evaluate RadiusNeighborsClassifier: No neighbors found for test samples array([    2,     3,     4, ..., 46179, 46180, 46181]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.

Evaluation of RandomForestClassifier
Accuracy: 0.9682
Confusion Matrix:
 [[27532  1078]
 [  392 17180]]
Mean Squared Error: 0.0318

Evaluation of RidgeClassifier
Accuracy: 0.8778
Confusion Matrix:
 [[26261  2349]
 [ 3295 1427

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluation of TunedThresholdClassifierCV
Accuracy: 0.9410
Confusion Matrix:
 [[26474  2136]
 [  589 16983]]
Mean Squared Error: 0.0590



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation of VotingClassifier
Accuracy: 0.9566
Confusion Matrix:
 [[27814   796]
 [ 1207 16365]]
Mean Squared Error: 0.0434



In [7]:
output_file = "sklearn_all_results.csv"
with open(output_file, mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["Classifier", "Accuracy", "MSE", "TN", "FP", "FN", "TP"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_file}")

Results saved to sklearn_all_results.csv
