In [1]:
import awswrangler as wr

import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature
import optuna

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from mlflow_aux import get_or_create_experiment
from optuna_aux import champion_callback, objective
from plots import plot_correlation_with_target, plot_information_gain_with_target, plot_feature_correlation

from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

import random
import datetime
import warnings

# Suppress warnings
# NOTE(review): 'ignore' silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
# General configuration: show every DataFrame column when displaying frames.
pd.set_option('display.max_columns', None)

# Credentials and endpoints for the S3-compatible object store expected at localhost:9000.
# NOTE(review): these look like local development defaults; credentials set inline end up
# in the saved notebook (and in its output), so never put real secrets here.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

# Number of training rows randomly sampled before the hyperparameter search.
# A value <= 0 skips the subsampling step and keeps the full training set.
n_samples_train = 1000

  from .autonotebook import tqdm as notebook_tqdm


env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [2]:
# Address of the MLflow tracking server; every run logged by this notebook goes there.
mlflow_server = "http://localhost:5000"
mlflow.set_tracking_uri(uri=mlflow_server)

In [3]:
# Training split: features and labels are stored as separate CSV files in the bucket.
X_train = wr.s3.read_csv(path="s3://data/final/train/weather_X_train.csv")
y_train = wr.s3.read_csv(path="s3://data/final/train/weather_y_train.csv")

# Test split, stored with the same layout.
X_test = wr.s3.read_csv(path="s3://data/final/test/weather_X_test.csv")
y_test = wr.s3.read_csv(path="s3://data/final/test/weather_y_test.csv")

In [4]:
# Column names of the training features, as a plain Python list (displayed as the cell output).
features_list = X_train.columns.tolist()
features_list

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday',
 'Year',
 'Month',
 'Day',
 'Latitude',
 'Longitude',
 'WindGustDir_sin',
 'WindGustDir_cos',
 'WindDir9am_sin',
 'WindDir9am_cos',
 'WindDir3pm_sin',
 'WindDir3pm_cos']

In [5]:
# Build the exploratory figures up front; they are logged as artifacts of the parent
# MLflow run once the hyperparameter search has finished.
# NOTE(review): the helpers come from the local `plots` module; presumably each returns a
# figure object accepted by `mlflow.log_figure` — confirm against that module.
correlation_plot = plot_correlation_with_target(X_train, y_train, target_col='RainTomorrow')
information_gain_plot = plot_information_gain_with_target(X_train, y_train, target_col='RainTomorrow')
features_correlation_plot = plot_feature_correlation(X_train)

In [6]:
# Resolve the MLflow experiment that groups every run of this project and show its id.
experiment_id = get_or_create_experiment("Rain in Australia")
print(experiment_id)

# Name of the parent run: a fixed prefix plus the current local timestamp.
# The format string previously ended with a stray double quote, which leaked into every run name.
run_name_parent = "best_hyperparam_" + datetime.datetime.today().strftime('%Y/%m/%d-%H:%M:%S')

1


In [7]:
# Optionally subsample the training set so the hyperparameter search stays fast.
# A non-positive `n_samples_train` disables subsampling and keeps every row.
if n_samples_train > 0:

    # Never ask for more rows than exist: random.sample raises ValueError when the
    # requested sample is larger than the population.
    sample_size = min(n_samples_train, len(X_train))

    # A dedicated, seeded generator makes the subsample reproducible across kernel
    # restarts without touching the global `random` state.
    indices = random.Random(42).sample(range(len(X_train)), sample_size)

    # The same positions are applied to features and labels so the rows stay aligned.
    X_train = X_train.iloc[indices].reset_index(drop=True)
    y_train = y_train.iloc[indices].reset_index(drop=True)


In [8]:
# The metric function is imported under an alias because this cell leaves the notebook-level
# name `f1_score` bound to the resulting float (the model-registration cell reads it).
# Calling `f1_score(...)` directly would therefore fail with
# "TypeError: 'float' object is not callable" the second time this cell is executed.
from sklearn.metrics import f1_score as compute_f1_score

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Initialize the Optuna study; higher objective values are better.
    study = optuna.create_study(direction="maximize")

    # Run the hyperparameter-optimization trials. Each trial is executed as a separate run
    # that is nested under this parent run.
    # `champion_callback` is added to control which progress messages are shown.
    # See the documentation of `objective` and `champion_callback` in optuna_aux for details.
    study.optimize(lambda trial: objective(trial, X_train, y_train, experiment_id), n_trials=250, callbacks=[champion_callback])

    # Once the search is over, store the best parameters and score in the parent run.
    best_params = study.best_params
    mlflow.log_params(best_params)
    mlflow.log_metric("best_train_f1", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Rain in Australia",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Rebuild the winning estimator from the best parameters and train it.
    classifier = best_params["classifier"]
    if classifier == "SVC_linear":
        model = SVC(C=best_params["svc_c"], kernel='linear', gamma='scale')
    elif classifier == "SVC_poly":
        model = SVC(C=best_params["svc_c"], kernel='poly',
                    gamma='scale', degree=best_params["svc_poly_degree"])
    elif classifier == "SVC_rbf":
        model = SVC(C=best_params["svc_c"], kernel='rbf', gamma='scale')
    elif classifier == "DecisionTreeClassifier":
        model = DecisionTreeClassifier(max_depth=best_params["tree_max_depth"])
    else:
        # Any other classifier name is treated as the random forest.
        model = RandomForestClassifier(max_depth=best_params["rf_max_depth"],
                                       n_estimators=best_params["rf_n_estimators"])

    # The labels arrive as a one-column frame; flatten them to the 1-D array sklearn expects.
    model = model.fit(X_train, y_train.to_numpy().ravel())

    # Evaluate the model on the test split and log the result.
    y_pred = model.predict(X_test)
    test_f1 = compute_f1_score(y_test.to_numpy().ravel(), y_pred)
    mlflow.log_metric("test_f1", test_f1)

    # Kept for downstream cells, which read the test score through the name `f1_score`.
    f1_score = test_f1

    # Log the correlation and information-gain figures as artifacts.
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")
    mlflow.log_figure(figure=information_gain_plot, artifact_file="information_gain_plot.png")
    mlflow.log_figure(features_correlation_plot, artifact_file="features_correlation_plot.png")

    # Save the model artifact.
    artifact_path = "model"

    # The signature records the input schema and the prediction output schema.
    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="rain_australia_model_dev",
        metadata={"model_data_version": 1}
    )

    # Location of the model that was just stored in MLflow.
    model_uri = mlflow.get_artifact_uri(artifact_path)

[I 2024-08-24 13:49:36,790] A new study created in memory with name: no-name-435913e0-b0f1-4ddc-85fd-a6d6da98ee98
2024/08/24 13:49:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 0 at: http://localhost:5000/#/experiments/1/runs/1ea901c418d541e68d8efb1a63430a64.
2024/08/24 13:49:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:49:50,030] Trial 0 finished with value: 0.4868976530005941 and parameters: {'classifier': 'SVC_linear', 'svc_c': 0.027378616095938977}. Best is trial 0 with value: 0.4868976530005941.


Initial trial 0 achieved value: 0.4868976530005941


2024/08/24 13:49:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 1 at: http://localhost:5000/#/experiments/1/runs/cc5f260908ff4e3e9ee4fdacdee2234c.
2024/08/24 13:49:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:49:50,727] Trial 1 finished with value: 0.5542265681768567 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 20.0981422013929}. Best is trial 1 with value: 0.5542265681768567.


Trial 1 achieved value: 0.5542265681768567 with  12.1483% improvement


2024/08/24 13:49:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 2 at: http://localhost:5000/#/experiments/1/runs/1abaec424edd4bb8a1bd8f73b3a8b960.
2024/08/24 13:49:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:49:51,680] Trial 2 finished with value: 0.32935198363280555 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 19, 'rf_n_estimators': 4}. Best is trial 1 with value: 0.5542265681768567.
2024/08/24 13:49:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 3 at: http://localhost:5000/#/experiments/1/runs/f2924885d88b477ea570224171d9c016.
2024/08/24 13:49:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:49:52,297] Trial 3 finished with value: 0.41632282911890517 and parameters: {'classifier': 'DecisionTreeClassifier', 'tree_max_depth': 31}. Best is trial 1 with value: 0.554226

Trial 42 achieved value: 0.555126696806832 with  0.1621% improvement


2024/08/24 13:50:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 43 at: http://localhost:5000/#/experiments/1/runs/74496019a45e4bb58c9a88888511ef07.
2024/08/24 13:50:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:44,197] Trial 43 finished with value: 0.5527922595136323 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 21.2592170109474}. Best is trial 42 with value: 0.555126696806832.
2024/08/24 13:50:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 44 at: http://localhost:5000/#/experiments/1/runs/1b8250f133964382a339843e45ebb5b4.
2024/08/24 13:50:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:44,604] Trial 44 finished with value: 0.5483905794687141 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 46.00030741847793}. Best is trial 42 with value: 0.555126696806832.
2024/08/24 13:5

Trial 45 achieved value: 0.5576792127732895 with  0.4577% improvement


2024/08/24 13:50:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 46 at: http://localhost:5000/#/experiments/1/runs/9474b5c413854ff5b2ae13427c31bfe2.
2024/08/24 13:50:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:45,431] Trial 46 finished with value: 0.5539014715816067 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 17.105366899377046}. Best is trial 45 with value: 0.5576792127732895.
2024/08/24 13:50:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 47 at: http://localhost:5000/#/experiments/1/runs/a7ce2c294d4642519319b40b9835b2fa.
2024/08/24 13:50:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:45,783] Trial 47 finished with value: 0.5483905794687141 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 83.13034389822722}. Best is trial 45 with value: 0.5576792127732895.
2024/08/24 

Trial 61 achieved value: 0.5591135214365137 with  0.2565% improvement


2024/08/24 13:50:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 62 at: http://localhost:5000/#/experiments/1/runs/47e2b86df88f462f80d876b0d3e906a5.
2024/08/24 13:50:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:53,343] Trial 62 finished with value: 0.5542265681768567 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 17.74688553446969}. Best is trial 61 with value: 0.5591135214365137.
2024/08/24 13:50:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 63 at: http://localhost:5000/#/experiments/1/runs/a84540737df54895bd7a77d56ecb93f9.
2024/08/24 13:50:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:50:53,672] Trial 63 finished with value: 0.5524526831751707 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 9.848676341007682}. Best is trial 61 with value: 0.5591135214365137.
2024/08/24 1

Trial 75 achieved value: 0.5596815582631185 with  0.1015% improvement


2024/08/24 13:51:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 76 at: http://localhost:5000/#/experiments/1/runs/6ba7b6a5e6d04d61b05ad118892a3a42.
2024/08/24 13:51:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:00,503] Trial 76 finished with value: 0.5568772863531557 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 7.810897496110417}. Best is trial 75 with value: 0.5596815582631185.
2024/08/24 13:51:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 77 at: http://localhost:5000/#/experiments/1/runs/8163ddac29254296a17844fd47d700ba.
2024/08/24 13:51:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:01,248] Trial 77 finished with value: 0.5596815582631185 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 7.455539633801706}. Best is trial 75 with value: 0.5596815582631185.
2024/08/24 1

Trial 121 achieved value: 0.5604329724176655 with  0.1341% improvement


2024/08/24 13:51:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 122 at: http://localhost:5000/#/experiments/1/runs/93a77273baaf43f48f3971ed046f0dcf.
2024/08/24 13:51:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:23,803] Trial 122 finished with value: 0.5498965630478844 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 10.536316740881883}. Best is trial 121 with value: 0.5604329724176655.
2024/08/24 13:51:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 123 at: http://localhost:5000/#/experiments/1/runs/1e3907aec9774124b2e5a71c56aa891a.
2024/08/24 13:51:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:24,503] Trial 123 finished with value: 0.55113957105913 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 5.80803821885512}. Best is trial 121 with value: 0.5604329724176655.
2024/08/

Trial 180 achieved value: 0.561090008967344 with  0.1171% improvement


2024/08/24 13:51:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 181 at: http://localhost:5000/#/experiments/1/runs/cc1b1ad3ce184eb8bba9370fd7b1f18e.
2024/08/24 13:51:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:52,621] Trial 181 finished with value: 0.5540240010574541 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 7.028099472439999}. Best is trial 180 with value: 0.561090008967344.
2024/08/24 13:51:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 182 at: http://localhost:5000/#/experiments/1/runs/fbe01d5b3e194e6ea17525acdc800d39.
2024/08/24 13:51:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 13:51:53,203] Trial 182 finished with value: 0.542270371455448 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 11.11320407810877}. Best is trial 180 with value: 0.561090008967344.
2024/08/2

In [9]:
# Name and description under which the model is registered for production use.
name = "rain_australia_model_prod"
desc = "This classifier predicts whether it will rain tomorrow"

# Client used for the model-registry operations in the following cells.
client = MlflowClient()

In [10]:
# Drop any previously registered production model so it can be recreated from scratch.
from mlflow.exceptions import MlflowException

try:
    client.delete_registered_model(name=name)
except MlflowException as exc:
    # On a fresh registry there is nothing to delete yet; that is fine.
    # Any other failure (connectivity, permissions, ...) must still surface.
    if exc.error_code != "RESOURCE_DOES_NOT_EXIST":
        raise

In [11]:


# Create the production registered-model entry.
client.create_registered_model(name=name, description=desc)

# Tags stored on the model version: the estimator's hyperparameters plus its
# class name and the F1 score obtained on the test split.
tags = {
    **model.get_params(),
    "model": type(model).__name__,
    "f1-score": f1_score,
}

# Register the logged model artifact as a new version of the production model.
# The run id is read from the artifact URI (third path segment from the end).
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags,
)

# Point the "champion" alias at the new version so the online model-serving
# process can load it by alias.
client.set_registered_model_alias(name=name, alias="champion", version=result.version)

2024/08/24 13:52:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rain_australia_model_prod, version 1
