In [17]:
import datetime
import sys
import awswrangler as wr
import mlflow
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import (mean_absolute_error, r2_score,
                             root_mean_squared_error, make_scorer,
                             mean_absolute_percentage_error, mean_squared_error)

from scipy.stats import uniform, randint
from xgboost import XGBRegressor
from mlflow.models import infer_signature
from mlflow_aux import get_or_create_experiment

# Para que funcione, todos nuestros scripts deben exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000
#%env MLFLOW_S3_ENDPOINT_URL=http://192.168.0.21:9000
#%env AWS_ENDPOINT_URL_S3=http://192.168.0.21:9000

sys.path.append('../tp_amq2_17co2024')
from models.cars_pipeline import CarsPipeline

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [9]:
# Load the processed train/test features and their targets from CSV files on S3.
# The paths are listed in the same order as the names they are unpacked into.
split_paths = (
    "s3://data/final/train/cars_X_train_processed.csv",
    "s3://data/final/train/cars_y_train.csv",
    "s3://data/final/test/cars_X_test_processed.csv",
    "s3://data/final/test/cars_y_test.csv",
)
X_train, y_train, X_test, y_test = map(wr.s3.read_csv, split_paths)

In [25]:
X_train

Unnamed: 0,year,km_driven,owner,seats,mileage_kmpl,engine_cc,max_power_bhp,torque_peak_power,torque_peak_speed,fuel_Diesel,...,make_Mitsubishi,make_Nissan,make_Opel,make_Peugeot,make_Renault,make_Skoda,make_Tata,make_Toyota,make_Volkswagen,make_Volvo
0,0.143297,0.052265,-0.702988,1.585859,-1.660468,3.124387,2.526617,1.509107,0.390960,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,4.209293,-0.157947,-0.040656
1,1.122097,-0.920472,-0.702988,-0.434970,0.168567,-0.892053,-0.628294,-0.745531,0.500417,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
2,1.122097,-0.265745,-0.702988,-0.434970,0.697217,-0.086336,0.022161,0.430983,-0.320516,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
3,1.122097,-0.920472,-0.702988,-0.434970,1.051223,-0.890028,-0.661273,-0.773979,1.321351,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,5.842126,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
4,-0.590803,0.669578,0.626219,-0.434970,-0.610248,-0.068116,-0.535641,-0.375345,-0.594161,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,3.141648,-0.237570,-0.157947,-0.040656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,-2.059004,0.295449,0.626219,-0.434970,-1.365463,-0.282704,-0.567049,-0.647440,-0.594161,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
4844,-1.080203,-0.059974,-0.702988,-0.434970,-0.822652,-0.489194,-0.095933,-0.574662,1.594995,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
4845,-0.346103,-0.335969,-0.702988,-0.434970,-0.232641,-0.489194,-0.070806,-0.565603,1.047706,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
4846,1.366797,-1.115991,-0.702988,-0.434970,0.383331,-0.489194,-0.196437,-0.574662,1.266622,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656


In [26]:
y_train

Unnamed: 0,selling_price_log
0,14.220976
1,12.765691
2,13.652993
3,12.899222
4,12.206078
...,...
4843,11.695255
4844,12.345835
4845,12.818555
4846,13.296318


In [27]:
X_test

Unnamed: 0,year,km_driven,owner,seats,mileage_kmpl,engine_cc,max_power_bhp,torque_peak_power,torque_peak_speed,fuel_Diesel,...,make_Mitsubishi,make_Nissan,make_Opel,make_Peugeot,make_Renault,make_Skoda,make_Tata,make_Toyota,make_Volkswagen,make_Volvo
0,-1.080203,3.288483,-0.702988,1.585859,-1.601467,2.136472,0.438000,0.214669,0.390960,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,4.209293,-0.157947,-0.040656
1,0.387997,-0.658581,-0.702988,1.585859,-0.846253,1.498781,1.631495,1.391329,-0.265787,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
2,1.122097,0.295449,-0.702988,-0.434970,0.805779,0.120155,0.342520,0.258845,0.226773,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
3,-1.324903,0.482513,1.955426,-0.434970,-0.539447,-0.764515,-0.755496,-0.834083,0.500417,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
4,1.122097,-0.639875,-0.702988,-0.434970,1.466591,0.120155,0.331213,0.213546,-1.415094,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2073,-2.793104,0.108384,1.955426,-1.445385,-0.822652,-1.300985,-1.603506,-1.063895,-0.594161,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
2074,1.122097,-0.920472,-0.702988,-0.434970,0.569774,-0.489194,-0.196437,-0.574662,1.266622,-1.102598,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
2075,0.387997,-0.546342,-0.702988,1.585859,-0.258602,0.110033,0.375184,0.485342,-0.867805,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,-0.171171,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656
2076,1.122097,-0.901765,-0.702988,-0.434970,0.003363,0.045251,0.640580,0.621240,-1.415094,0.906949,...,-0.045464,-0.096794,-0.014364,-0.014364,5.842126,-0.106132,-0.318304,-0.237570,-0.157947,-0.040656


In [28]:
y_test

Unnamed: 0,selling_price_log
0,13.217675
1,14.018452
2,13.652993
3,11.289794
4,13.623140
...,...
2073,10.596660
2074,13.235694
2075,13.560620
2076,13.764218


In [22]:
# Sanity check of every split: print its first rows and then its shape.

# Training features
print("Primeras filas de X_train:", X_train.head(), sep="\n")
print("\nShape de X_train:", X_train.shape)

# Training target
print("\nPrimeras filas de y_train:", y_train.head(), sep="\n")
print("\nShape de y_train:", y_train.shape)

# Test features
print("\nPrimeras filas de X_test:", X_test.head(), sep="\n")
print("\nShape de X_test:", X_test.shape)

# Test target
print("\nPrimeras filas de y_test:", y_test.head(), sep="\n")
print("\nShape de y_test:", y_test.shape)


Primeras filas de X_train:
       year  km_driven     owner     seats  mileage_kmpl  engine_cc  \
0  0.143297   0.052265 -0.702988  1.585859     -1.660468   3.124387   
1  1.122097  -0.920472 -0.702988 -0.434970      0.168567  -0.892053   
2  1.122097  -0.265745 -0.702988 -0.434970      0.697217  -0.086336   
3  1.122097  -0.920472 -0.702988 -0.434970      1.051223  -0.890028   
4 -0.590803   0.669578  0.626219 -0.434970     -0.610248  -0.068116   

   max_power_bhp  torque_peak_power  torque_peak_speed  fuel_Diesel  ...  \
0       2.526617           1.509107           0.390960     0.906949  ...   
1      -0.628294          -0.745531           0.500417    -1.102598  ...   
2       0.022161           0.430983          -0.320516     0.906949  ...   
3      -0.661273          -0.773979           1.321351    -1.102598  ...   
4      -0.535641          -0.375345          -0.594161     0.906949  ...   

   make_Mitsubishi  make_Nissan  make_Opel  make_Peugeot  make_Renault  \
0        -0.045

In [10]:
# Point the MLflow client at the tracking server listening locally on port 5002.
mlflow_server = "http://localhost:5002"
mlflow.set_tracking_uri(mlflow_server)

In [11]:
# Create the "Cars" experiment (or reuse it, per the helper's name) and show its id.
experiment_id = get_or_create_experiment("Cars")
print(experiment_id)

# Timestamped run name, e.g. "best_hyperparam_2024/12/09-19:30:18".
# The format string previously ended with a stray double quote that leaked
# into every run name shown in the MLflow UI.
run_name_parent = "best_hyperparam_" + datetime.datetime.today().strftime('%Y/%m/%d-%H:%M:%S')

1


In [23]:
# Base regressor whose hyper-parameters are tuned below (fixed seed).
model = XGBRegressor(random_state=42)

# Sampling distributions for the randomized search.
# scipy conventions: randint(low, high) draws integers from [low, high), and
# uniform(loc, scale) draws floats from [loc, loc + scale], so for example
# uniform(0.01, 0.3) covers 0.01..0.31 and uniform(1, 2) covers 1..3.
param_dist = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.3),
    subsample=uniform(0.5, 0.5),
    colsample_bytree=uniform(0.5, 0.5),
    gamma=uniform(0, 0.5),
    reg_alpha=uniform(0, 1),
    reg_lambda=uniform(1, 2),
)

# Shuffled 5-fold split with a fixed seed.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)

# 50 sampled candidates, each scored by negative MSE on the folds above.
xgb = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=cv_folds,
    random_state=42,
)

xgb.fit(X_train, y_train)

In [29]:
# Estimator refitted with the best parameters found by the search.
xgb_best_model = xgb.best_estimator_

# Predict on both splits; expm1 is applied to the predictions and to the
# stored targets (column `selling_price_log`) before computing any metric.
y_pred_train_log = xgb_best_model.predict(X_train)
y_pred_log = xgb_best_model.predict(X_test)

y_pred_train = np.expm1(y_pred_train_log)
y_pred = np.expm1(y_pred_log)

y_train_recovered = np.expm1(y_train)
y_test_recovered = np.expm1(y_test)

# Test-set scorers, keyed by the name under which each metric is reported.
test_scorers = {
    "MAE": mean_absolute_error,
    "RMSE": root_mean_squared_error,
    "MAPE": mean_absolute_percentage_error,
    "R2": r2_score,
}

# Training MAE first, then the test-set metrics.
metrics = {"MAE_training": mean_absolute_error(y_train_recovered, y_pred_train)}
metrics.update(
    {label: scorer(y_test_recovered, y_pred) for label, scorer in test_scorers.items()}
)
metrics

{'MAE_training': 57692.80648108756,
 'MAE': 71630.52777798666,
 'RMSE': 150170.362104489,
 'MAPE': 0.1582418683049689,
 'R2': 0.902574896812439}

In [30]:
# Tags attached to the run.
run_tags = {
    "project": "Cars",
    "optimizer_engine": "randomized_search_cv",
    "model_family": "xgboost",
    "feature_set_version": 1,
}

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):

    # Once the search is finished, store the best parameters and the best
    # cross-validation score (negative MSE) in the run.
    mlflow.log_params(xgb.best_params_)
    mlflow.log_metric("best_train_neg_mse", xgb.best_score_)

    mlflow.set_tags(tags=run_tags)

    # Log every evaluation metric computed earlier (MAE_training, MAE, RMSE, MAPE, R2).
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Store the model artifact.
    artifact_path = "model"

    # Signature inferred from the training features and the model's predictions on them.
    train_predictions = xgb_best_model.predict(X_train)
    signature = infer_signature(X_train, train_predictions)

    mlflow.xgboost.log_model(
        xgb_model=xgb_best_model,
        artifact_path=artifact_path,
        registered_model_name="cars_model_dev",
        signature=signature,
        input_example=X_train.head(),
        extra_pip_requirements=["xgboost==2.1.2"],
        metadata={"model_data_version": 1},
    )

    # Location of the saved model in the MLflow artifact store.
    model_uri = mlflow.get_artifact_uri(artifact_path)

Successfully registered model 'cars_model_dev'.
2024/12/09 20:36:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cars_model_dev, version 1
Created version '1' of model 'cars_model_dev'.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/12/09 20:36:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_hyperparam_2024/12/09-19:30:18" at: http://localhost:5002/#/experiments/1/runs/5ba08ae1e92041eba3eeb5251d9c6333.
2024/12/09 20:36:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5002/#/experiments/1.


In [31]:
model_uri

's3://mlflow/1/5ba08ae1e92041eba3eeb5251d9c6333/artifacts/model'

In [33]:
# Load the logged model back from its artifact URI.
loaded = mlflow.xgboost.load_model(model_uri)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [35]:
# Predict on the test features with the reloaded model.
y_pred_log_loaded = loaded.predict(X_test)

In [36]:
# Apply expm1 to the reloaded model's predictions and recompute the test MAE,
# so it can be compared with the MAE obtained before the model was logged.
y_pred_loaded = np.expm1(y_pred_log_loaded)
mean_absolute_error(y_test_recovered, y_pred_loaded)

71630.52777798666

In [37]:
from mlflow import MlflowClient

client = MlflowClient()
name = "cars_model_prod"
desc = "This model predicts selling price for used cars"

# Create the production registered model.
# NOTE(review): this presumably fails if "cars_model_prod" already exists, so
# re-running the cell against the same registry needs a check — confirm.
client.create_registered_model(name=name, description=desc)

# Store the hyper-parameters and evaluation metrics as tags on the model version.
# The parameters are read from the tuned estimator (`xgb_best_model`), which is
# the model saved at `model_uri`. `model` is the untuned base regressor that was
# handed to the search, so its params do not describe the registered artifact.
tags = xgb_best_model.get_params()
tags["model"] = type(xgb_best_model).__name__
tags["mae_training"] = metrics["MAE_training"]
tags["mae"] = metrics["MAE"]
tags["rmse"] = metrics["RMSE"]
tags["mape"] = metrics["MAPE"]
tags["r2"] = metrics["R2"]

# Register the model version.
# The artifact URI looks like "s3://mlflow/<experiment>/<run_id>/artifacts/model",
# so the run id is the third path component counted from the end.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Give this version the "champion" alias so the online serving process can
# load it by alias.
client.set_registered_model_alias(name, "champion", result.version)

2024/12/09 20:56:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cars_model_prod, version 1
