In [1]:
import awswrangler as wr

import mlflow

# For this to work, every one of our scripts must export the following environment variables.
# NOTE(review): these look like local MinIO development credentials — confirm, and never
# commit real secrets to a notebook.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


# Entrenamos el modelo con DTC

Probaremos un clasificador de árbol de decisión (DTC, *Decision Tree Classifier*), con búsqueda de hiperparámetros mediante Optuna.

In [2]:
# Base S3 prefix under which the train/test CSV files are read.
s3_base_path = "s3://data/chicago/crimes/2024"

# Route every MLflow tracking call made by this notebook to this server.
mlflow_server = "http://localhost:5001"
mlflow.set_tracking_uri(mlflow_server)

In [3]:
# Load the train/test splits used in this study from "<s3_base_path>/final/".
# The files are read in the order listed and unpacked into the matching names.
X_train, y_train, X_test, y_test = (
    wr.s3.read_csv(f"{s3_base_path}/final/{split_name}.csv")
    for split_name in ("X_train", "y_train", "X_test", "y_test")
)

## Arrancamos a experimentar

In [4]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Creamos una función para mostrar métricas
def metrics(model, X_test, y_test):
    """Evaluate a fitted classifier and build its confusion-matrix figure.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels; must support ``y_test["fbi_code"]`` because the
            confusion-matrix labels are taken from that column.

    Returns:
        A ``(report, figure)`` tuple: the ``classification_report`` as a dict
        (``output_dict=True``, ``zero_division=0``) and the matplotlib figure
        holding the confusion-matrix plot.
    """
    predictions = model.predict(X_test)
    report = classification_report(
        y_test, predictions, output_dict=True, zero_division=0
    )

    # Distinct values of the target column, used both to index the matrix
    # and to label its axes.
    labels = y_test["fbi_code"].astype("category").cat.categories

    figure, axes = plt.subplots(figsize=(15, 15))
    axes.grid(False)
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, predictions, labels=labels),
        display_labels=labels,
    ).plot(ax=axes)
    # The figure is closed before being returned (presumably so the notebook
    # does not render it inline); the caller still receives the Figure object.
    plt.close(figure)

    return report, figure

Antes de poder realizar experimentos, vamos a crear el experimento en MLflow. Para evitar desorden, usamos una función que primero se fija si el experimento ya existe: si es así, devuelve su ID; en caso contrario, lo crea.

Además creamos el nombre del run padre con el que vamos a ir registrando las ejecuciones.

In [5]:
import datetime
from mlflow_aux import get_or_create_experiment

# Get the ID of the "Chicago Crimes 2024" experiment through the helper, which
# (per the notebook text) reuses the experiment when it already exists.
experiment_id = get_or_create_experiment("Chicago Crimes 2024")
print(experiment_id)

# Parent run name: fixed prefix followed by the current timestamp.
# The format string previously ended with a stray '"' that leaked into every
# run name (e.g. 'dtc_test2025/08/03-20:51:38"'); it is removed here.
run_name_parent = "dtc_test" + datetime.datetime.today().strftime('%Y/%m/%d-%H:%M:%S')

1


In [6]:
from mlflow.models import infer_signature
import optuna
from optuna_aux import champion_callback, objective
from sklearn.tree import DecisionTreeClassifier

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Optuna is fairly verbose; only let it show error-level logs.
    optuna.logging.set_verbosity(optuna.logging.ERROR)

    # Seed the sampler so the search proposes the same hyperparameters on every
    # Restart & Run All. TPESampler is the sampler create_study uses by default
    # for a single-objective study, so only the seeding changes.
    # NOTE(review): full reproducibility also assumes `objective` is
    # deterministic — confirm in optuna_aux.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )

    # Run the hyperparameter-optimization trials. Each trial is executed as a
    # separate run nested under this parent run.
    # `champion_callback` controls which progress messages are shown; see the
    # documentation of `objective` and `champion_callback` in optuna_aux.
    study.optimize(
        lambda trial: objective(trial, X_train, y_train["fbi_code"], experiment_id),
        n_trials=30,
        callbacks=[champion_callback],
    )

    # Once the search is done, store the best parameters and score on the parent run.
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_train_f1_weighted", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Chicago Crimes 2024",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Build the tree with the best hyperparameters found and train it on the
    # full training set.
    dec_tree_class_best = DecisionTreeClassifier(**study.best_params, random_state=42)
    dec_tree_class_best.fit(X_train, y_train["fbi_code"])

    # Evaluate on the test split: `cr` is the classification report dict and
    # `cm` the confusion-matrix figure.
    cr, cm = metrics(dec_tree_class_best, X_test, y_test)

    # Log every metric in the classification report. "accuracy" is a scalar,
    # so it is popped first; the remaining entries are per-class / per-average dicts.
    mlflow.log_metric("test_accuracy", cr.pop("accuracy"))
    for class_or_avg, metrics_dict in cr.items():
        for metric, value in metrics_dict.items():
            mlflow.log_metric("test_" + class_or_avg + '_' + metric, value)
    mlflow.log_figure(figure=cm, artifact_file="fda_cm.png")

    # Store the model artifact.
    artifact_path = "model"

    signature = infer_signature(X_train, dec_tree_class_best.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=dec_tree_class_best,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="dtc_model_dev",
        metadata={"model_data_version": 1}
    )

    # Keep the location of the logged model in MLflow; the registration cell
    # below reads `model_uri`.
    model_uri = mlflow.get_artifact_uri(artifact_path)




🏃 View run Trial: 0 at: http://localhost:5001/#/experiments/1/runs/82e33f8eac9d4ed6ad2628d37e119c90
🧪 View experiment at: http://localhost:5001/#/experiments/1
Initial trial 0 achieved value: 0.9990706655881241




🏃 View run Trial: 1 at: http://localhost:5001/#/experiments/1/runs/cf4651860a4f4faf9b15aef2f261205d
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 1 achieved value: 0.9997559307893031 with  0.0685% improvement




🏃 View run Trial: 2 at: http://localhost:5001/#/experiments/1/runs/68d6fc6d2c43420f93b68f2e991a1199
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 2 achieved value: 0.9998022298170177 with  0.0046% improvement




🏃 View run Trial: 3 at: http://localhost:5001/#/experiments/1/runs/b73ed2dd1685462b80788b45421dcd01
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 4 at: http://localhost:5001/#/experiments/1/runs/1d1078d8071e4970b39debd2a2d6542a
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 5 at: http://localhost:5001/#/experiments/1/runs/dc1c79efd79e4d76860a5be5ffdc5cdb
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 6 at: http://localhost:5001/#/experiments/1/runs/2b9112100bc94263a5f75fd3fb718b44
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 7 at: http://localhost:5001/#/experiments/1/runs/a7eadb20e0f3450bb0cfdb2aec871dfd
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 8 at: http://localhost:5001/#/experiments/1/runs/390e21c5968b45dc95d8b292fd8fa742
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 9 at: http://localhost:5001/#/experiments/1/runs/774c0d6e8fb64090809176416322b23b
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 10 at: http://localhost:5001/#/experiments/1/runs/120d7ce066644f65bcc995798c4dd911
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 10 with no changes.




🏃 View run Trial: 11 at: http://localhost:5001/#/experiments/1/runs/5d8a82b04377449c9f867edd2462a564
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 12 at: http://localhost:5001/#/experiments/1/runs/56fcc7ed1dd945eeada8cdd8a7d264af
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 13 at: http://localhost:5001/#/experiments/1/runs/17129b68eaeb43fa926573f841397ec7
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 14 at: http://localhost:5001/#/experiments/1/runs/a8412fb747334048a33bf87620d8427d
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 15 at: http://localhost:5001/#/experiments/1/runs/c4dfb1d1f4714904800285c5e77106c3
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 16 at: http://localhost:5001/#/experiments/1/runs/28e21b0540ec450680f3fa6c8352fe17
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 17 at: http://localhost:5001/#/experiments/1/runs/6cff8facbef54ea6957d35daf8920fed
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 18 at: http://localhost:5001/#/experiments/1/runs/d628ed257b9d49cf98b61d066750b2ba
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 19 at: http://localhost:5001/#/experiments/1/runs/b340978e8924457aaa9cbba1875052f5
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 20 at: http://localhost:5001/#/experiments/1/runs/ac8bc8f50759440cb45bf7726d275622
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 20 achieved value: 0.9998242155369628 with  0.0022% improvement




🏃 View run Trial: 21 at: http://localhost:5001/#/experiments/1/runs/06c23eb83bb8400aa8643f2200607301
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 22 at: http://localhost:5001/#/experiments/1/runs/7822d928862044aaad16efe972a9f194
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 23 at: http://localhost:5001/#/experiments/1/runs/0fa76dcc7a974e38a9a886b2337de0a0
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 24 at: http://localhost:5001/#/experiments/1/runs/0412adc3f3614802b793376171e4bc7b
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 24 achieved value: 0.9998518880893125 with  0.0028% improvement




🏃 View run Trial: 25 at: http://localhost:5001/#/experiments/1/runs/3adbe6a2aad848f29eb0645552059fd1
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 25 achieved value: 0.9998855971384681 with  0.0034% improvement




🏃 View run Trial: 26 at: http://localhost:5001/#/experiments/1/runs/a1c49bc150c24852b3a86285ee35edb3
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 27 at: http://localhost:5001/#/experiments/1/runs/f12610dc497f44d995e812ae4e18c7a0
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 28 at: http://localhost:5001/#/experiments/1/runs/0bdc9e80a22f47e5b24beb53d4d3cf0d
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 29 at: http://localhost:5001/#/experiments/1/runs/2ae6332ede4c4018ad35a6e250f7ec19
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 29 achieved value: 0.9998967859831958 with  0.0011% improvement


Registered model 'dtc_model_dev' already exists. Creating a new version of this model...
2025/08/03 20:52:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dtc_model_dev, version 4
Created version '4' of model 'dtc_model_dev'.


🏃 View run dtc_test2025/08/03-20:51:38" at: http://localhost:5001/#/experiments/1/runs/84a3e2d821c94ea39a072bf4b64077b7
🧪 View experiment at: http://localhost:5001/#/experiments/1


## Registramos el modelo 

Realizamos el registro del modelo en MLflow. En este registro se pone el modelo productivo que luego se usará para servir en formato on-line.

In [7]:
from mlflow import MlflowClient
from mlflow.exceptions import MlflowException

client = MlflowClient()
name = "dtc_model_prod"
desc = "This classifier show FBI code based on Chicago Police Report"

# Create the production registered model, or refresh its description when it
# already exists. `get_registered_model` raises MlflowException for an unknown
# name instead of returning a falsy value, so a plain `if` check would crash on
# the very first run and never reach the create branch.
try:
    client.get_registered_model(name)
except MlflowException as exc:
    # Only "model not found" means we should create it; re-raise anything else
    # (connection problems, permission errors, ...).
    if exc.error_code != "RESOURCE_DOES_NOT_EXIST":
        raise
    client.create_registered_model(name=name, description=desc)
else:
    client.update_registered_model(name=name, description=desc)

# Store the model's hyperparameters, its class name and the weighted F1 score
# as tags on the model version.
tags = dec_tree_class_best.get_params()
tags["model"] = type(dec_tree_class_best).__name__
tags["f1-score"] = cr["weighted avg"]["f1-score"]

# Register a new version pointing at the logged model artifact.
# NOTE(review): the run ID is parsed out of the artifact URI, which assumes the
# URI ends in "<run_id>/artifacts/model" — confirm for the configured artifact store.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Give this version the "champion" alias so the online serving process can
# load it by alias.
client.set_registered_model_alias(name, "champion", result.version)

2025/08/03 20:52:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dtc_model_prod, version 3
