In [1]:
import awswrangler as wr

import mlflow

# For this to work, every one of our scripts must export the following environment variables.
# Both S3 endpoint variables point at the same local address (http://localhost:9000);
# presumably a local MinIO instance -- confirm against the deployment.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


# Entrenamos el modelo con Decision Tree Classifier

Entrenaremos un Decision Tree Classifier (DTC) y buscaremos sus hiperparámetros con Optuna.

In [2]:
# Root S3 prefix under which the dataset files used below are stored.
s3_base_path = "s3://data/chicago/crimes/2024"

# Point every subsequent MLflow call at the tracking server.
mlflow_server = "http://localhost:5001"
mlflow.set_tracking_uri(mlflow_server)

In [3]:
# Load the data for our study: the four CSV splits stored under "<base>/final".
# They are read in the order X_train, y_train, X_test, y_test.
final_prefix = f"{s3_base_path}/final"
X_train, y_train, X_test, y_test = (
    wr.s3.read_csv(f"{final_prefix}/{split}.csv")
    for split in ("X_train", "y_train", "X_test", "y_test")
)

## Arrancamos a experimentar

In [4]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Helper that computes the evaluation metrics of a fitted model.
def metrics(model, X_test, y_test):
    """Evaluate ``model`` on a test split.

    Parameters
    ----------
    model
        Object exposing ``predict(X_test)``.
    X_test
        Features passed as-is to ``model.predict``.
    y_test
        True labels; must be indexable with ``"fbi_code"`` (the column whose
        categories become the confusion-matrix labels).

    Returns
    -------
    tuple
        ``(report, figure)`` where ``report`` is the dictionary produced by
        ``classification_report(..., output_dict=True)`` and ``figure`` is the
        matplotlib figure holding the confusion-matrix plot. The figure has
        already been closed with ``plt.close`` (presumably so the notebook
        does not render it inline -- confirm).
    """
    predictions = model.predict(X_test)

    # Scores as a nested dict; zero_division=0 is forwarded to scikit-learn.
    report = classification_report(
        y_test,
        predictions,
        output_dict=True,
        zero_division=0,
    )

    # Confusion matrix restricted to (and ordered by) the categories of the true labels.
    labels = y_test["fbi_code"].astype("category").cat.categories
    matrix = confusion_matrix(y_test, predictions, labels=labels)

    # Draw the matrix on a dedicated figure with the grid switched off.
    figure, axes = plt.subplots(figsize=(15, 15))
    axes.grid(False)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(ax=axes)
    plt.close(figure)

    return report, figure

Antes de poder realizar experimentos, vamos a crear el experimento en MLflow. Para evitar desorden, usamos una función que primero verifica si el experimento ya existe: si es así, devuelve su ID; en caso contrario, lo crea.

Además creamos el nombre del run padre con el que vamos a ir registrando las ejecuciones.

In [5]:
import datetime
from mlflow_aux import get_or_create_experiment
# Create the experiment; the helper first checks whether it already exists
# and, if so, returns the ID of the existing one.
experiment_id = get_or_create_experiment("Chicago Crimes 2024")
print(experiment_id)

# Name of the parent run under which the executions are recorded:
# "dtc_test_" followed by the current local date and time.
# The format string must end right after %S: any extra character in it is
# copied verbatim into the run name.
run_name_parent = "dtc_test_" + datetime.datetime.today().strftime("%Y/%m/%d-%H:%M:%S")

1


In [6]:
from mlflow.models import infer_signature
import optuna 
from optuna_aux import champion_callback, objective
from sklearn.tree import DecisionTreeClassifier

# Parent run: hyper-parameter search, final training, evaluation and model logging
# all happen inside this single MLflow run.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Create an Optuna study.
    # Optuna is rather verbose, so only let it show error-level logs.
    optuna.logging.set_verbosity(optuna.logging.ERROR)

    study = optuna.create_study(direction="maximize")
    # Run the hyper-parameter optimisation trials. Each trial is executed in its own run,
    # nested under the parent run (behaviour implemented in optuna_aux.objective -- see there).
    # Note the `champion_callback`, used to control which messages are shown.
    # For details see the documentation of objective and champion_callback in optuna_aux.
    study.optimize(lambda trial: objective(trial, X_train, y_train["fbi_code"], experiment_id), n_trials=30, callbacks=[champion_callback])

    # Once the search is done, store the best parameters and best objective value in the parent run.
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_train_f1_weighted", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Chicago Crimes 2024",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Build the tree with the best parameters found (fixed seed for repeatability).
    dec_tree_class_best = DecisionTreeClassifier(**study.best_params, random_state=42)
    # Train on the full training split.
    dec_tree_class_best.fit(X_train, y_train["fbi_code"])

    # Evaluate on the test split. `cr` is the classification-report dict and
    # `cm` is the confusion-matrix *figure* returned by metrics().
    cr, cm = metrics(dec_tree_class_best, X_test, y_test)

    # Log the model artifact.
    artifact_path = "model"

    # Signature inferred from the training features and the model's predictions on them.
    signature = infer_signature(X_train, dec_tree_class_best.predict(X_train))

    model_info = mlflow.sklearn.log_model(
        sk_model=dec_tree_class_best,
        name=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="dtc_model_dev",
        metadata={"model_data_version": 1}
    )

    # Log every metric of the classification report, linked to the logged model.
    # "accuracy" is popped first (it is iterated differently from the remaining entries,
    # which are dicts of metric -> value); note this removes the key from `cr` for later cells.
    mlflow.log_metric("test_accuracy", cr.pop("accuracy"), model_id=model_info.model_id)
    for class_or_avg, metrics_dict in cr.items():
        for metric, value in metrics_dict.items():
            mlflow.log_metric("test_" + class_or_avg + '_' + metric,value, model_id=model_info.model_id)
    mlflow.log_figure(figure=cm, artifact_file="fda_cm.png")

    # Get the artifact URI of `artifact_path` for the active run; a later cell uses it as the
    # model-version source and parses the run ID out of it.
    # NOTE(review): the model is logged with `name=` / `model_id` -- confirm that this run
    # artifact URI is where the model files are actually stored for the MLflow version in use.
    model_uri = mlflow.get_artifact_uri(artifact_path)




🏃 View run Trial: 0 at: http://localhost:5001/#/experiments/1/runs/05fe783a315f4c12b4b7fa4117ba61ad
🧪 View experiment at: http://localhost:5001/#/experiments/1
Initial trial 0 achieved value: 0.9987661488142277




🏃 View run Trial: 1 at: http://localhost:5001/#/experiments/1/runs/d01c391ceb0c4c2382d4e59f63c0a50a
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 1 achieved value: 0.9992597586803841 with  0.0494% improvement




🏃 View run Trial: 2 at: http://localhost:5001/#/experiments/1/runs/41737a23f9c8451490c563e804659cb9
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 3 at: http://localhost:5001/#/experiments/1/runs/20d53a22f93e41128e8291fc3be519b5
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 3 achieved value: 0.9994773616464219 with  0.0218% improvement




🏃 View run Trial: 4 at: http://localhost:5001/#/experiments/1/runs/e8441bfb3c614cee89c01bff137fc409
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 4 achieved value: 0.999756341142044 with  0.0279% improvement




🏃 View run Trial: 5 at: http://localhost:5001/#/experiments/1/runs/97c16c1f08bd44c481b12bbadea67b22
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 6 at: http://localhost:5001/#/experiments/1/runs/ff947bae23974788bc2bd644988a9494
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 7 at: http://localhost:5001/#/experiments/1/runs/451818a0ed454aab877a3d7500089550
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 8 at: http://localhost:5001/#/experiments/1/runs/9a9dbdc685914a899af4925250401948
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 9 at: http://localhost:5001/#/experiments/1/runs/5ec72a89dfec4e7b8e88591cfe05475e
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 10 at: http://localhost:5001/#/experiments/1/runs/3b8ce006c11c414cac2c8baca91eec3c
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 10 achieved value: 0.9997982076837062 with  0.0042% improvement




🏃 View run Trial: 11 at: http://localhost:5001/#/experiments/1/runs/370d27187464423fb1c439861f955b55
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 12 at: http://localhost:5001/#/experiments/1/runs/7b12cc85bab74d7bb8c1f9c163134114
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 12 achieved value: 0.9998102933298189 with  0.0012% improvement




🏃 View run Trial: 13 at: http://localhost:5001/#/experiments/1/runs/d1799bfcadd340b091c56247cd7332f5
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 14 at: http://localhost:5001/#/experiments/1/runs/07df1815092d40c78e56d1940e2019f1
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 15 at: http://localhost:5001/#/experiments/1/runs/7382b6ce25af4939845ab68117f83e79
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 16 at: http://localhost:5001/#/experiments/1/runs/93c7d909c456421c8d3ab4650c4a86f8
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 17 at: http://localhost:5001/#/experiments/1/runs/8996732634204085b57b01d65784d877
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 18 at: http://localhost:5001/#/experiments/1/runs/99a3bd9ade00410f83e359dc143a42ed
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 19 at: http://localhost:5001/#/experiments/1/runs/9617026f9a074505ac52f80565ff23e3
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 20 at: http://localhost:5001/#/experiments/1/runs/a11481ce4c2b477c9571c55d0009a902
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 20 with no changes.




🏃 View run Trial: 21 at: http://localhost:5001/#/experiments/1/runs/6364b0f5f9e14b1cb43a6fea748234e1
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 22 at: http://localhost:5001/#/experiments/1/runs/5d60d1e473dc4cc0b5966f6a81690839
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 23 at: http://localhost:5001/#/experiments/1/runs/edc3f8c172c6406dbc78c01794707ddd
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 24 at: http://localhost:5001/#/experiments/1/runs/96f46b7635014e11a82a822999369701
🧪 View experiment at: http://localhost:5001/#/experiments/1
Trial 24 achieved value: 0.9998462778464834 with  0.0036% improvement




🏃 View run Trial: 25 at: http://localhost:5001/#/experiments/1/runs/449a0f9e93c840948a1c2f37e61a8c0c
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 26 at: http://localhost:5001/#/experiments/1/runs/4b5d0630869542dc8d09c6b59377ebfa
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 27 at: http://localhost:5001/#/experiments/1/runs/890fb8fdfb904752b84516c751b5ac9a
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 28 at: http://localhost:5001/#/experiments/1/runs/06d43e102aea4c5b9601387d26a716de
🧪 View experiment at: http://localhost:5001/#/experiments/1




🏃 View run Trial: 29 at: http://localhost:5001/#/experiments/1/runs/d777d3d2a8f14ef5a3b6e601e62f4b4c
🧪 View experiment at: http://localhost:5001/#/experiments/1


Registered model 'dtc_model_dev' already exists. Creating a new version of this model...
2025/08/18 15:49:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dtc_model_dev, version 4
Created version '4' of model 'dtc_model_dev'.


🏃 View run dtc_test_2025/08/18-15:47:47" at: http://localhost:5001/#/experiments/1/runs/4b950cd1958c43c7b252e9f98041be5d
🧪 View experiment at: http://localhost:5001/#/experiments/1


## Registramos el modelo 

Realizamos el registro del modelo en MLflow. En este registro se pone el modelo productivo que luego se usará para servir en formato on-line.

In [7]:
from mlflow import MlflowClient
from mlflow.exceptions import MlflowException

client = MlflowClient()
name = "dtc_model_prod"
desc = "This classifier show FBI code based on Chicago Police Report"

# Create the production registered model; if MLflow rejects the creation
# (expected when the model already exists), update its description instead.
# Only MLflow errors are handled here, so interrupts and unrelated failures
# propagate instead of being silently swallowed.
try:
    client.create_registered_model(name=name, description=desc)
except MlflowException:
    client.update_registered_model(name=name, description=desc)

# Store the model's hyper-parameters, its class name and the weighted-average
# F1 score from the test classification report as tags of the model version.
tags = dec_tree_class_best.get_params()
tags["model"] = type(dec_tree_class_best).__name__
tags["f1-score"] = cr["weighted avg"]["f1-score"]

# Create the model version.
# The run ID is taken as the third-from-last path segment of `model_uri`;
# this assumes a ".../<run_id>/artifacts/model" layout -- TODO confirm.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Point the "champion" alias at the new version so that the on-line serving
# process can load it by alias.
client.set_registered_model_alias(name, "champion", result.version)

2025/08/18 15:49:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: dtc_model_prod, version 4
