In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

import mlflow
from mlflow.models import infer_signature

import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

import kagglehub
from kagglehub import KaggleDatasetAdapter
import matplotlib.pyplot as plt

# Name of the CSV file inside the Kaggle dataset that should be loaded.
file_path = "wine_quality_classification.csv"

# Read the CSV from the Kaggle dataset straight into a pandas DataFrame.
# `dataset_load` replaces the deprecated `load_dataset` helper (same arguments),
# which emitted a warning every time this cell ran.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "sahideseker/wine-quality-classification",
    file_path,
)




  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(


## Data Prep

In [2]:
# Ordered target classes: the position in this list is the encoded value
# (low -> 0, medium -> 1, high -> 2).
quality_order = ["low", "medium", "high"]

# Any label that is not listed in `quality_order` is encoded as -1 instead of raising.
encoder = OrdinalEncoder(
    categories=[quality_order],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# The encoder expects 2-D input, so pass a one-column frame and flatten the result.
target_frame = df[["quality_label"]]
y_encoded = encoder.fit_transform(target_frame).ravel()

# Features are every column except the target label.
X = df.drop(columns=["quality_label"])
y = y_encoded

# Hold out 20% of the rows for testing; stratify so both splits keep the class mix.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Models

In [3]:

# Hyperparameters live in a dict so the same values can be passed to the
# estimator here and logged to MLflow later.
params_lr = dict(
    solver="lbfgs",
    max_iter=10000,
    random_state=8888,
    class_weight="balanced",
    penalty="l2",
    C=0.1,
)

# `fit` returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(**params_lr).fit(X_train, y_train)

# Hard predictions and class probabilities on the held-out split.
y_pred_lr = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)

# Overall accuracy plus the per-class report (as a nested dict).
accuracy = accuracy_score(y_test, y_pred_lr)
report_dict_lr = classification_report(y_test, y_pred_lr, output_dict=True)


In [4]:

# Hyperparameters for the random forest; kept in a dict so they can be both
# passed to the estimator and logged to MLflow.
params_rf = {
    "n_estimators": 30,
    "max_depth": 3,
    # Fixed seed: without it the forest (and every metric derived from it)
    # changes on each Restart & Run All, making logged runs non-reproducible.
    "random_state": 8888,
}
rf_clf = RandomForestClassifier(**params_rf)
rf_clf.fit(X_train, y_train)

# Evaluate on the held-out split; the report is a nested dict keyed by class label.
y_pred_rf = rf_clf.predict(X_test)
report_dict_rf = classification_report(y_test, y_pred_rf, output_dict=True)

# MLflow structure

In [5]:
# Lookup tables keyed by model name; the logging loops iterate over `models`
# and use the same key to fetch the matching params and classification report.
models = dict(LogisticRegression=lr, RandomForest=rf_clf)
params = dict(LogisticRegression=params_lr, RandomForest=params_rf)
report_dict = dict(LogisticRegression=report_dict_lr, RandomForest=report_dict_rf)

# Feature column names, in training order (used for prediction tables and plots).
wine_feature_names = X_train.columns.tolist()

## Single Experiment with multiple runs

## Approach 1: Single Experiment with Multiple Runs
- **Description**: One experiment (`1-MLflow Wine`) with separate runs for each model (LogisticRegression, RandomForest).

### Pros
- **Simplicity**: Easy to set up and manage within a single experiment.
- **Comparison**: Facilitates direct comparison of models in the MLflow UI (e.g., metrics like accuracy, f1-score).
- **Organization**: All runs are grouped under one experiment, ideal for related models.
- **Scalability**: Works well for adding more models by creating additional runs.

### Cons
- **Clutter**: Can become crowded with many models or runs.
- **Less Isolation**: Model runs are mixed, which may confuse unrelated experiments.
- **Limited Hierarchy**: No parent-child structure for grouping related runs (unlike nested runs).

In [6]:
# Approach 1: one experiment, one top-level run per model.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("1-MLflow Wine")

# The input example only needs to illustrate the expected input; a few rows
# are enough. Passing all of X_train would store the whole training set with
# every logged model.
input_example = X_train.head(5)

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        report = report_dict[model_name]

        mlflow.log_params(params[model_name])

        # Class keys are "0.0"/"1.0"/"2.0" because the encoded labels are floats.
        mlflow.log_metrics({
            "accuracy": report["accuracy"],
            "recall_class_0": report["0.0"]["recall"],
            "recall_class_1": report["1.0"]["recall"],
            "recall_class_2": report["2.0"]["recall"],
            "f1-score": report["macro avg"]["f1-score"]
        })

        mlflow.set_tag("Single Experiment/ Multiple Runs Training info", f"{model_name} model for wine")

        # Input/output schema inferred from the training features and predictions.
        signature = infer_signature(X_train, model.predict(X_train))

        # Log the fitted model and register it as a new version in the model registry.
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=f"{model_name.lower()}_model",
            signature=signature,
            input_example=input_example,
            registered_model_name=f"tracking-wine-{model_name.lower()}"
        )

        # Save test-set features with actual and predicted classes as a CSV artifact.
        predictions = model.predict(X_test)
        result = pd.DataFrame(X_test, columns=wine_feature_names)
        result["actual_class"] = y_test
        result["predicted_class"] = predictions
        result.to_csv(f"{model_name}_predictions.csv")
        mlflow.log_artifact(f"{model_name}_predictions.csv")

        # One bar chart per model: coefficients for the linear model,
        # feature importances for the forest.
        plt.figure()
        if model_name == "LogisticRegression":
            # Only the first row of coef_ is plotted (coefficients of the first class).
            coef = model.coef_[0]
            plt.bar(wine_feature_names, coef)
            plt.title("LogisticRegression Coefficients")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig("logistic_coefficients.png")
            mlflow.log_artifact("logistic_coefficients.png")
        elif model_name == "RandomForest":
            importances = model.feature_importances_
            plt.bar(wine_feature_names, importances)
            plt.title("RandomForest Feature Importance")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png")
        # Close the figure so figures do not accumulate across loop iterations.
        plt.close()

2025/04/29 17:11:50 INFO mlflow.tracking.fluent: Experiment with name '1-MLflow Wine' does not exist. Creating a new experiment.
Registered model 'tracking-wine-logisticregression' already exists. Creating a new version of this model...
2025/04/29 17:11:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-logisticregression, version 6
Created version '6' of model 'tracking-wine-logisticregression'.


🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/205086602077169667/runs/02601672baf948c987a6626711e0ffa1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/205086602077169667


Registered model 'tracking-wine-randomforest' already exists. Creating a new version of this model...
2025/04/29 17:12:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-randomforest, version 4
Created version '4' of model 'tracking-wine-randomforest'.


🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/205086602077169667/runs/116034f80d6a461b8bf5e29b20350a3f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/205086602077169667


## Nested Runs

## Approach 2: Nested Runs
- **Description**: One experiment (`2-MLflow_Wine_Nested`) with a parent run (`Wine_Model_Comparison`) and nested runs for each model.

### Pros
- **Hierarchy**: Nested runs provide a clear parent-child structure, grouping related models under one experiment.
- **Context**: Parent run can log shared metrics or artifacts (e.g., dataset info).
- **Comparison**: Nested runs are still comparable in the MLflow UI within the parent context.
- **Flexibility**: Ideal for experiments with model variants or hyperparameter tuning.

### Cons
- **Complexity**: Slightly more complex to set up due to nested run management.
- **UI Navigation**: Nested runs may be less intuitive to navigate in the MLflow UI for some users.
- **Overhead**: Parent run adds minor overhead if not used for shared logging.

In [7]:
# Approach 2: one parent run with a nested child run per model.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("2-MLflow_Wine_Nested")

# The input example only needs to illustrate the expected input; a few rows
# are enough. Passing all of X_train would store the whole training set with
# every logged model.
input_example = X_train.head(5)

with mlflow.start_run(run_name="Wine_Model_Comparison"):
    for model_name, model in models.items():
        # Everything for this model, including the plot artifacts, must be
        # logged while the nested run is active; otherwise it is attached to
        # the parent run instead of the model's own child run.
        with mlflow.start_run(run_name=model_name, nested=True):
            report = report_dict[model_name]

            mlflow.log_params(params[model_name])

            # Class keys are "0.0"/"1.0"/"2.0" because the encoded labels are floats.
            mlflow.log_metrics({
                "accuracy": report["accuracy"],
                "recall_class_0": report["0.0"]["recall"],
                "recall_class_1": report["1.0"]["recall"],
                "recall_class_2": report["2.0"]["recall"],
                "f1-score": report["macro avg"]["f1-score"]
            })
            mlflow.set_tag("Training Info", f"{model_name} model for Wine")

            # Input/output schema inferred from the training features and predictions.
            signature = infer_signature(X_train, model.predict(X_train))

            # Log the fitted model and register it as a new version in the model registry.
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path=f"{model_name.lower()}_model",
                signature=signature,
                input_example=input_example,
                registered_model_name=f"tracking-wine-{model_name.lower()}"
            )

            # Save test-set features with actual and predicted classes as a CSV artifact.
            predictions = model.predict(X_test)
            result = pd.DataFrame(X_test, columns=wine_feature_names).drop(columns="quality_label", errors="ignore")
            result["actual_class"] = y_test
            result["predicted_class"] = predictions
            result.to_csv(f"{model_name}_predictions.csv")
            mlflow.log_artifact(f"{model_name}_predictions.csv")

            # One bar chart per model: coefficients for the linear model,
            # feature importances for the forest.
            plt.figure()
            if model_name == "LogisticRegression":
                # Only the first row of coef_ is plotted (coefficients of the first class).
                coef = model.coef_[0]
                plt.bar(wine_feature_names, coef)
                plt.title("LogisticRegression Coefficients")
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig("logistic_coefficients.png")
                mlflow.log_artifact("logistic_coefficients.png")
            elif model_name == "RandomForest":
                importances = model.feature_importances_
                plt.bar(wine_feature_names, importances)
                plt.title("RandomForest Feature Importance")
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig("feature_importance.png")
                mlflow.log_artifact("feature_importance.png")
            # Close the figure so figures do not accumulate across loop iterations.
            plt.close()

2025/04/29 17:12:00 INFO mlflow.tracking.fluent: Experiment with name '2-MLflow_Wine_Nested' does not exist. Creating a new experiment.
Registered model 'tracking-wine-logisticregression' already exists. Creating a new version of this model...
2025/04/29 17:12:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-logisticregression, version 7
Created version '7' of model 'tracking-wine-logisticregression'.


🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/462244724701110143/runs/cd3662e765fd418fbd6cfe4900a26c47
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/462244724701110143


Registered model 'tracking-wine-randomforest' already exists. Creating a new version of this model...
2025/04/29 17:12:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-randomforest, version 5
Created version '5' of model 'tracking-wine-randomforest'.


🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/462244724701110143/runs/42c52b0322fd43339696754bb01f2f82
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/462244724701110143
🏃 View run Wine_Model_Comparison at: http://127.0.0.1:5000/#/experiments/462244724701110143/runs/0d027e908d9c460b89a3408cc0085e60
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/462244724701110143


## Separate Experiments for Each Model

## Approach 3: Separate Experiments for Each Model
- **Description**: Separate experiments (`3-Separate Experiments per ModelLogisticRegression`, `3-Separate Experiments per ModelRandomForest`) with one run per model.

### Pros
- **Isolation**: Each model’s logs are completely separate, reducing clutter.
- **Clarity**: Ideal for unrelated models or distinct datasets, as experiments are clearly demarcated.
- **Focus**: Simplifies tracking for projects where models are developed independently.

### Cons
- **Comparison Difficulty**: Comparing models across experiments is less straightforward in the MLflow UI (requires manual selection).
- **Management Overhead**: More experiments to create and track, especially with many models.
- **Redundancy**: Shared settings (e.g., dataset info) must be logged in each experiment.

In [8]:
# Approach 3: a dedicated experiment (with a single run) per model.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# The input example only needs to illustrate the expected input; a few rows
# are enough. Passing all of X_train would store the whole training set with
# every logged model.
input_example = X_train.head(5)

for model_name, model in models.items():
    # The experiment name is the fixed prefix immediately followed by the
    # model name (no separator); kept as-is so existing experiments are reused.
    mlflow.set_experiment(f"3-Separate Experiments per Model{model_name}")
    with mlflow.start_run(run_name=f"{model_name}_Run"):
        report = report_dict[model_name]

        mlflow.log_params(params[model_name])

        # Class keys are "0.0"/"1.0"/"2.0" because the encoded labels are floats.
        mlflow.log_metrics({
            "accuracy": report["accuracy"],
            "recall_class_0": report["0.0"]["recall"],
            "recall_class_1": report["1.0"]["recall"],
            "recall_class_2": report["2.0"]["recall"],
            "f1-score": report["macro avg"]["f1-score"]
        })
        mlflow.set_tag("Training Info", f"{model_name} model for Wine")

        # Input/output schema inferred from the training features and predictions.
        signature = infer_signature(X_train, model.predict(X_train))

        # Log the fitted model and register it as a new version in the model registry.
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=f"{model_name.lower()}_model",
            signature=signature,
            input_example=input_example,
            registered_model_name=f"tracking-wine-{model_name.lower()}"
        )

        # Save test-set features with actual and predicted classes as a CSV artifact.
        predictions = model.predict(X_test)
        result = pd.DataFrame(X_test, columns=wine_feature_names).drop(columns="quality_label", errors="ignore")
        result["actual_class"] = y_test
        result["predicted_class"] = predictions
        result.to_csv(f"{model_name}_predictions.csv")
        mlflow.log_artifact(f"{model_name}_predictions.csv")

        # One bar chart per model: coefficients for the linear model,
        # feature importances for the forest.
        plt.figure()
        if model_name == "LogisticRegression":
            # Only the first row of coef_ is plotted (coefficients of the first class).
            coef = model.coef_[0]
            plt.bar(wine_feature_names, coef)
            plt.title("LogisticRegression Coefficients")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig("logistic_coefficients.png")
            mlflow.log_artifact("logistic_coefficients.png")
        elif model_name == "RandomForest":
            importances = model.feature_importances_
            plt.bar(wine_feature_names, importances)
            plt.title("RandomForest Feature Importance")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png")
        # Close the figure so figures do not accumulate across loop iterations.
        plt.close()

2025/04/29 17:12:10 INFO mlflow.tracking.fluent: Experiment with name '3-Separate Experiments per ModelLogisticRegression' does not exist. Creating a new experiment.
Registered model 'tracking-wine-logisticregression' already exists. Creating a new version of this model...
2025/04/29 17:12:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-logisticregression, version 8
Created version '8' of model 'tracking-wine-logisticregression'.
2025/04/29 17:12:15 INFO mlflow.tracking.fluent: Experiment with name '3-Separate Experiments per ModelRandomForest' does not exist. Creating a new experiment.


🏃 View run LogisticRegression_Run at: http://127.0.0.1:5000/#/experiments/129440219345249374/runs/a7120a7edda64f46b409e867d92f8f1b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/129440219345249374


Registered model 'tracking-wine-randomforest' already exists. Creating a new version of this model...
2025/04/29 17:12:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-wine-randomforest, version 6
Created version '6' of model 'tracking-wine-randomforest'.


🏃 View run RandomForest_Run at: http://127.0.0.1:5000/#/experiments/564859271321168209/runs/df5d66c44b0f4e719a40dfcea7d6a13f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/564859271321168209
