In [25]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler



from sklearn.utils import all_estimators


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime

import sys
sys.path.append("/Users/derekdewald/Documents/Python/Github_Repo/d_py_functions")

from DataSets import MNIST_SKLEARN
df,z = MNIST_SKLEARN(normalize=False,flatten=True,return_value='df')


Downloading MNIST from OpenML...


In [None]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import mlflow
import mlflow.sklearn


import mlflow
from sklearn.model_selection import train_test_split
from MLPipeline import apply_scaling,SKLearnModelList

def MLPipeline(df, 
               project_name,
               scaler,
               ml_model_type='regressor',
               target_column='Target',
               test_size=0.2):
    """
    Runs multiple scikit-learn estimators with MLflow tracking.

    Args:
        df (DataFrame): Input dataset.
        project_name (str): MLflow experiment name.
        scaler (str): One of None, 'normal', or 'standard'.
        ml_model_type (str): 'classifier', 'regressor', 'cluster', 'transformer'
        target_column (str): Name of the target column.
        test_size (float): Proportion of data used for testing.

    Returns:
        pd.DataFrame: Summary of model performance.
    """

    # Set MLflow experiment
    mlflow.set_experiment(project_name)

    # Prepare data
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Apply scaler
    X_train, X_test, _ = apply_scaling(X_train, X_test, scaler=scaler)

    # Get model list (you must have this function already working)
    model_list = SKLearnModelList()

    # Filter by model type
    model_list = model_list[model_list['Estimator Type'].str.contains(ml_model_type)]

    results = []

    for _, row in model_list.iterrows():
        name = row['Model Name']
        estimator_class = row['Estimator Class']
        print(f'Generating Predicition for {name}, started processing {datetime.datetime.now()}')
        try:
            start_time = time.time()
            model = estimator_class()

            with mlflow.start_run(run_name=name):
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                if ml_model_type == "classifier":
                    metric = accuracy_score(y_test, y_pred)
                    mlflow.log_metric("Accuracy", metric)
                else:
                    metric = mean_squared_error(y_test, y_pred) ** 0.5
                    mlflow.log_metric("RMSE", metric)

                mlflow.sklearn.log_model(model, name)
                mlflow.log_param("Model", name)
                mlflow.log_param("Training Time", round(time.time() - start_time, 2))

                results.append({
                    "Model": name,
                    "Metric": metric,
                    "Time (s)": round(time.time() - start_time, 2)
                })

        except Exception as e:
            print(f"{name} failed: {str(e)}")

    return pd.DataFrame(results)


results_df = MLPipeline(df,
                        project_name='MNIST_ML_Comparison',
                        scaler='normal',
                        ml_model_type='classifier',
                        target_column='Target',
                        test_size=0.2)

results_df

Generating Predicition for AdaBoostClassifier, started processing 2025-05-05 23:44:37.675229




Generating Predicition for BaggingClassifier, started processing 2025-05-05 23:45:04.065514




Generating Predicition for BernoulliNB, started processing 2025-05-05 23:46:11.727411




Generating Predicition for CalibratedClassifierCV, started processing 2025-05-05 23:46:13.522742




Generating Predicition for CategoricalNB, started processing 2025-05-05 23:47:53.362855
CategoricalNB failed: index 64 is out of bounds for axis 1 with size 1
Generating Predicition for ClassifierChain, started processing 2025-05-05 23:47:54.556771
ClassifierChain failed: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Generating Predicition for ComplementNB, started processing 2025-05-05 23:47:54.556816




Generating Predicition for DecisionTreeClassifier, started processing 2025-05-05 23:47:55.882979




Generating Predicition for DummyClassifier, started processing 2025-05-05 23:48:07.977726




Generating Predicition for ExtraTreeClassifier, started processing 2025-05-05 23:48:09.214000




Generating Predicition for ExtraTreesClassifier, started processing 2025-05-05 23:48:10.876368




Generating Predicition for FixedThresholdClassifier, started processing 2025-05-05 23:48:30.780123
FixedThresholdClassifier failed: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Generating Predicition for GaussianNB, started processing 2025-05-05 23:48:30.780246




Generating Predicition for GaussianProcessClassifier, started processing 2025-05-05 23:48:32.638037


In [7]:

def MLPipeline(df, 
               project_name,
               scaler,
               ml_model_type='regressor',
               target_column='Target',
               test_size=0.2):
    """
    Runs multiple ML algorithms, tracks results with MLflow, and saves models.

    Args:
        df (dataframe)
        project_name (str):
        scaler (str): None, normal,standard
        ml_model_type (str): Option to pronpt all_estimators as to what model type requested.
        classifier, regressor, cluster, transformer

    Returns:
        None (Results are logged in MLflow)
    """
    # Set up MLflow experiment
    mlflow.set_experiment(project_name)

    # Prepare data
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Apply Scaler as Necessary
    X_train, X_test, scaler = apply_scaling(X_train, X_test, scaler=scaler)
    
    # Get all available models
    model_list = SKLearnModelList(ml_model_type)

    results = []

    for idx, row in model_list.iterrows():
        try:
            name = row['Model Name']
            model = row['Estimator Class']
            start_time = time.time()

             with mlflow.start_run(run_name=m_name):  # Start MLflow run
                 model = model()
                 model.fit(X_train, y_train)  # Train model
                 y_pred = model.predict(X_test)  # Predict

                # Evaluate performance
                if ml_model_type == "classifier":
                    metric = accuracy_score(y_test, y_pred)
                    mlflow.log_metric("Accuracy", metric)
                else:
                    metric = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE manually computed
                    mlflow.log_metric("RMSE", metric)

                # Log model
                mlflow.sklearn.log_model(model, name)

                # Log metadata
                mlflow.log_param("Model", name)
                mlflow.log_param("Training Time", round(time.time() - start_time, 2))

                # Append results
                results.append({
                    "Model": name,
                    "Metric": metric,
                    "Time (s)": round(time.time() - start_time, 2)
                })

        except Exception as e:
            print(f"{name} failed: {str(e)}")  # Handle errors but continue

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df#.sort_values(by="Metric", ascending=(task_type == "regression"))

    return results_df
    


Unnamed: 0,Model Name,Estimator Class,Full Class Path,Estimator Type,Part_1,Part_2,Part_3,Part_4,Part_5
1,AdaBoostClassifier,<class 'sklearn.ensemble._weight_boosting.AdaB...,sklearn.ensemble._weight_boosting.AdaBoostClas...,classifier,sklearn,ensemble,_weight_boosting,AdaBoostClassifier,
6,BaggingClassifier,<class 'sklearn.ensemble._bagging.BaggingClass...,sklearn.ensemble._bagging.BaggingClassifier,classifier,sklearn,ensemble,_bagging,BaggingClassifier,
10,BernoulliNB,<class 'sklearn.naive_bayes.BernoulliNB'>,sklearn.naive_bayes.BernoulliNB,classifier,sklearn,naive_bayes,BernoulliNB,,
16,CalibratedClassifierCV,<class 'sklearn.calibration.CalibratedClassifi...,sklearn.calibration.CalibratedClassifierCV,classifier,sklearn,calibration,CalibratedClassifierCV,,
17,CategoricalNB,<class 'sklearn.naive_bayes.CategoricalNB'>,sklearn.naive_bayes.CategoricalNB,classifier,sklearn,naive_bayes,CategoricalNB,,
18,ClassifierChain,<class 'sklearn.multioutput.ClassifierChain'>,sklearn.multioutput.ClassifierChain,classifier,sklearn,multioutput,ClassifierChain,,
20,ComplementNB,<class 'sklearn.naive_bayes.ComplementNB'>,sklearn.naive_bayes.ComplementNB,classifier,sklearn,naive_bayes,ComplementNB,,
23,DecisionTreeClassifier,<class 'sklearn.tree._classes.DecisionTreeClas...,sklearn.tree._classes.DecisionTreeClassifier,classifier,sklearn,tree,_classes,DecisionTreeClassifier,
27,DummyClassifier,<class 'sklearn.dummy.DummyClassifier'>,sklearn.dummy.DummyClassifier,classifier,sklearn,dummy,DummyClassifier,,
33,ExtraTreeClassifier,<class 'sklearn.tree._classes.ExtraTreeClassif...,sklearn.tree._classes.ExtraTreeClassifier,classifier,sklearn,tree,_classes,ExtraTreeClassifier,


In [10]:
c_models = SKLearnModelList('classifier')


  

AdaBoostClassifier <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
BaggingClassifier <class 'sklearn.ensemble._bagging.BaggingClassifier'>
BernoulliNB <class 'sklearn.naive_bayes.BernoulliNB'>
CalibratedClassifierCV <class 'sklearn.calibration.CalibratedClassifierCV'>
CategoricalNB <class 'sklearn.naive_bayes.CategoricalNB'>
ClassifierChain <class 'sklearn.multioutput.ClassifierChain'>
ComplementNB <class 'sklearn.naive_bayes.ComplementNB'>
DecisionTreeClassifier <class 'sklearn.tree._classes.DecisionTreeClassifier'>
DummyClassifier <class 'sklearn.dummy.DummyClassifier'>
ExtraTreeClassifier <class 'sklearn.tree._classes.ExtraTreeClassifier'>
ExtraTreesClassifier <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>
FixedThresholdClassifier <class 'sklearn.model_selection._classification_threshold.FixedThresholdClassifier'>
GaussianNB <class 'sklearn.naive_bayes.GaussianNB'>
GaussianProcessClassifier <class 'sklearn.gaussian_process._gpc.GaussianProcessClassifier'>
Gra

In [8]:
import inspect

model_params = inspect.signature('AdaBoostClassifier').parameters

TypeError: 'AdaBoostClassifier' is not a callable object