In [1]:
import awswrangler as wr

import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature
import optuna

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from mlflow_aux import get_or_create_experiment
from optuna_aux import champion_callback, objective
from plots import plot_correlation_with_target, plot_information_gain_with_target, plot_feature_correlation

from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

import random
import datetime
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')
# General settings: show every DataFrame column instead of truncating wide frames
pd.set_option('display.max_columns', None)

# Credentials and endpoint URLs for the S3 storage, exported as environment variables.
# NOTE(review): the access key and secret are hardcoded in the notebook; presumably
# local-only development values, confirm before sharing or committing.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

# Number of training rows to keep; the sampling cell only subsamples when this is > 0
n_samples_train = 1000

  from .autonotebook import tqdm as notebook_tqdm


env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [2]:
# Address of the MLflow tracking server used by every MLflow call in this notebook.
mlflow_server = "http://localhost:5000"
mlflow.set_tracking_uri(uri=mlflow_server)

In [3]:
# Read the train split (features, then target) from S3.
X_train, y_train = (
    wr.s3.read_csv("s3://data/final/train/weather_X_train.csv"),
    wr.s3.read_csv("s3://data/final/train/weather_y_train.csv"),
)

# Read the test split (features, then target) from S3.
X_test, y_test = (
    wr.s3.read_csv("s3://data/final/test/weather_X_test.csv"),
    wr.s3.read_csv("s3://data/final/test/weather_y_test.csv"),
)

In [4]:
# Column names of the training features, as a plain Python list.
features_list = X_train.columns.tolist()
features_list

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday',
 'Year',
 'Month',
 'Day',
 'Latitude',
 'Longitude',
 'WindGustDir_sin',
 'WindGustDir_cos',
 'WindDir9am_sin',
 'WindDir9am_cos',
 'WindDir3pm_sin',
 'WindDir3pm_cos']

In [5]:
# Build the three exploratory plots; they are kept in variables because the
# training cell later uploads them to MLflow with mlflow.log_figure.
correlation_plot = plot_correlation_with_target(
    X_train,
    y_train,
    target_col='RainTomorrow',
)
information_gain_plot = plot_information_gain_with_target(
    X_train,
    y_train,
    target_col='RainTomorrow',
)
features_correlation_plot = plot_feature_correlation(X_train)

In [6]:
# Resolve the MLflow experiment that groups all runs of this notebook.
# (helper from mlflow_aux; presumably returns the id of an existing experiment
# or creates it first. Confirm against its implementation.)
experiment_id = get_or_create_experiment("Rain in Australia")
print(experiment_id)

# Name of the parent run: fixed prefix plus the current local timestamp.
# The format string previously ended with a stray double quote, which was
# copied verbatim into every run name.
run_name_parent = "best_hyperparam_" + datetime.datetime.today().strftime('%Y/%m/%d-%H:%M:%S')

1


In [7]:
# Optionally down-sample the training set so the hyper-parameter search runs faster.
# A non-positive `n_samples_train` keeps the full training set.
if n_samples_train > 0:
    # Never request more rows than exist: random.sample raises ValueError
    # when the sample is larger than the population.
    sample_size = min(n_samples_train, len(X_train))

    # Dedicated seeded generator: a fresh kernel always draws the same rows,
    # and the global `random` state is left untouched.
    sample_seed = 42
    indices = random.Random(sample_seed).sample(range(len(X_train)), sample_size)

    # The same row positions are applied to the features and to the target.
    X_train = X_train.iloc[indices].reset_index(drop=True)
    y_train = y_train.iloc[indices].reset_index(drop=True)


In [8]:
# The scoring function is imported under an alias so that this cell never calls
# through the notebook-level name `f1_score`. That name is rebound to the
# numeric test score at the end of the cell (the model-registration cell reads
# it), which used to make a second execution of this cell fail with
# "object is not callable".
from sklearn.metrics import f1_score as compute_f1_score

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Initialise the Optuna study; the objective value is maximised
    study = optuna.create_study(direction="maximize")

    # Run the hyper-parameter optimisation trials. Each trial is executed as a separate run
    # nested under this parent run.
    # `champion_callback` controls which progress messages are shown.
    # See the documentation of objective and champion_callback in optuna_aux for details.
    study.optimize(lambda trial: objective(trial, X_train, y_train, experiment_id), n_trials=250, callbacks=[champion_callback])

    # Read the winning configuration once and reuse it below
    best_params = study.best_params

    # Once the search is done, store the best parameters in the parent run.
    mlflow.log_params(best_params)
    mlflow.log_metric("best_train_f1", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Rain in Australia",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Rebuild the best model from the winning hyper-parameters so it can be trained
    best_classifier = best_params["classifier"]
    if best_classifier == "SVC_linear":
        model = SVC(C=best_params["svc_c"], kernel='linear', gamma='scale')
    elif best_classifier == "SVC_poly":
        model = SVC(C=best_params["svc_c"], kernel='poly',
                    gamma='scale', degree=best_params["svc_poly_degree"])
    elif best_classifier == "SVC_rbf":
        model = SVC(C=best_params["svc_c"], kernel='rbf', gamma='scale')
    elif best_classifier == "DecisionTreeClassifier":
        model = DecisionTreeClassifier(max_depth=best_params["tree_max_depth"])
    else:
        # Any other classifier name is treated as the random forest option
        model = RandomForestClassifier(max_depth=best_params["rf_max_depth"],
                                       n_estimators=best_params["rf_n_estimators"])

    # The target frame is flattened to a 1-D array before fitting
    model = model.fit(X_train, y_train.to_numpy().ravel())

    # Evaluate the model on the test set and log the result
    y_pred = model.predict(X_test)
    test_f1 = compute_f1_score(y_test.to_numpy().ravel(), y_pred)
    mlflow.log_metric("test_f1", test_f1)

    # Log the correlation and information-gain plots as artifacts
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")
    mlflow.log_figure(figure=information_gain_plot, artifact_file="information_gain_plot.png")
    mlflow.log_figure(features_correlation_plot, artifact_file="features_correlation_plot.png")

    # Save the model artifact
    artifact_path = "model"

    # Signature inferred from the training features and the model's predictions on them
    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="rain_australia_model_dev",
        metadata={"model_data_version": 1}
    )

    # Location of the saved model inside MLflow's artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)

    # Kept under its original name: the model-registration cell tags the model
    # version with the value stored in `f1_score`.
    f1_score = test_f1

[I 2024-08-24 15:56:43,887] A new study created in memory with name: no-name-2e6073cd-3e6d-4171-ab31-05ea08bb228a
2024/08/24 15:56:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 0 at: http://localhost:5000/#/experiments/1/runs/ba664526f4974f1e8f6b51d9c184f767.
2024/08/24 15:56:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:46,738] Trial 0 finished with value: 0.5983214089157952 and parameters: {'classifier': 'SVC_linear', 'svc_c': 0.056411409902975176}. Best is trial 0 with value: 0.5983214089157952.


Initial trial 0 achieved value: 0.5983214089157952


2024/08/24 15:56:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 1 at: http://localhost:5000/#/experiments/1/runs/8ec742836ccd4570aa37cd3dbdcbb53c.
2024/08/24 15:56:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:47,473] Trial 1 finished with value: 0.44394401404205325 and parameters: {'classifier': 'DecisionTreeClassifier', 'tree_max_depth': 4}. Best is trial 0 with value: 0.5983214089157952.
2024/08/24 15:56:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 2 at: http://localhost:5000/#/experiments/1/runs/c62986b0a4d140029af1a6fa0ee680d6.
2024/08/24 15:56:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:48,821] Trial 2 finished with value: 0.5857629267738315 and parameters: {'classifier': 'SVC_linear', 'svc_c': 0.02475804928042856}. Best is trial 0 with value: 0.5983214089157952.
2024

Trial 4 achieved value: 0.6315102830288468 with  5.2555% improvement


[I 2024-08-24 15:56:51,660] Trial 5 finished with value: 0.6063366844188762 and parameters: {'classifier': 'SVC_linear', 'svc_c': 0.44595627905779783}. Best is trial 4 with value: 0.6315102830288468.
2024/08/24 15:56:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 6 at: http://localhost:5000/#/experiments/1/runs/513195c4ecf54d1388121b61ec928c28.
2024/08/24 15:56:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:51,844] Trial 6 finished with value: 0.44610715405415996 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 6, 'rf_n_estimators': 3}. Best is trial 4 with value: 0.6315102830288468.
2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 7 at: http://localhost:5000/#/experiments/1/runs/c4729675ad794336b33c3a81c16e5c40.
2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experimen

Trial 8 achieved value: 0.6330028203422797 with  0.2358% improvement


2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 9 at: http://localhost:5000/#/experiments/1/runs/4cb359573b4043d7a454753d94efd861.
2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:52,832] Trial 9 finished with value: 0.1324878241151533 and parameters: {'classifier': 'SVC_rbf', 'svc_c': 0.20440238368076732}. Best is trial 8 with value: 0.6330028203422797.
2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 10 at: http://localhost:5000/#/experiments/1/runs/ffbd736f72fc4703b4254c8a8bd4942e.
2024/08/24 15:56:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:56:52,956] Trial 10 finished with value: 0.38459395261639295 and parameters: {'classifier': 'SVC_poly', 'svc_c': 5.256715903726432, 'svc_poly_degree': 6}. Best is trial 8 with value: 0.63300282

Trial 58 achieved value: 0.635943210612261 with  0.4624% improvement


2024/08/24 15:57:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 60 at: http://localhost:5000/#/experiments/1/runs/ff5d4403d43144408142108a3c694316.
2024/08/24 15:57:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:57:10,841] Trial 60 finished with value: 0.6330028203422797 and parameters: {'classifier': 'SVC_linear', 'svc_c': 11.419376042109839}. Best is trial 58 with value: 0.635943210612261.
2024/08/24 15:57:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run Trial: 61 at: http://localhost:5000/#/experiments/1/runs/ed2e322ea33f4e5c9ce86e08a9d50a92.
2024/08/24 15:57:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
[I 2024-08-24 15:57:10,977] Trial 61 finished with value: 0.6330028203422797 and parameters: {'classifier': 'SVC_linear', 'svc_c': 5.222479422404966}. Best is trial 58 with value: 0.635943210612261.
2024/08

In [9]:
# Name and description of the production entry in the model registry.
name = "rain_australia_model_prod"
desc = "This classifier predicts whether it will rain tomorrow"

# Client used for the model-registry calls in the following cells.
client = MlflowClient()

In [10]:
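# TODO: delete the registered model only when it already exists; calling this for a
# missing model raises RESOURCE_DOES_NOT_EXIST.
# client.delete_registered_model(name=name)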

RestException: RESOURCE_DOES_NOT_EXIST: Registered Model with name=rain_australia_model_prod not found

In [11]:
# Create the production registered model unless it is already in the registry,
# so that this cell can be executed again without failing on the existing name.
if not client.search_registered_models(filter_string=f"name = '{name}'"):
    client.create_registered_model(name=name, description=desc)

# Store the model hyper-parameters as tags on the model version,
# together with the model class name and its F1 score.
tags = model.get_params()
tags["model"] = type(model).__name__
tags["f1-score"] = f1_score  # numeric score left in `f1_score` by the training cell

# Register the model version
result = client.create_model_version(
    name=name,
    source=model_uri,
    # NOTE(review): takes the third path segment from the end of the artifact URI
    # as the run id; presumably the URI ends in <run_id>/artifacts/model.
    # Confirm for the configured artifact store.
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Give this version the "champion" alias so the online model-serving
# process can load it by alias.
client.set_registered_model_alias(name, "champion", result.version)

2024/08/24 16:03:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rain_australia_model_prod, version 1
