## Entrenamiento: limpieza del proyecto, script `train_gbt.py` y ejecución con MLflow

In [1]:
# Remove artifacts left by previous executions so the notebook starts clean.
# Pure Python instead of `!rm -rf`, which needs a POSIX shell on the PATH.
import shutil
from pathlib import Path

for stale_name in ("MLproject", "mlruns", "train_gbt.py"):
    stale_path = Path(stale_name)
    if stale_path.is_dir() and not stale_path.is_symlink():
        # Directory (e.g. the local MLflow store): remove the whole tree.
        shutil.rmtree(stale_path, ignore_errors=True)
    else:
        # Regular file or symlink; a missing path is not an error (like `rm -f`).
        stale_path.unlink(missing_ok=True)

In [27]:
%%writefile train_gbt.py

#Librerías
import pandas as pd
import numpy as np
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import  classification_report, accuracy_score, roc_auc_score
import os
import sys
import warnings
warnings.filterwarnings("ignore")
from utils import Utils
utils = Utils()

def run():
    """Train a GradientBoostingClassifier on the bookings data and track it with MLflow.

    Hyperparameters are read positionally from ``sys.argv``:
    ``learning_rate`` (float), ``n_estimators`` (int), ``max_depth`` (int) and
    ``verbose`` (int). Parameters and metrics are always logged to the MLflow
    run; ``verbose > 0`` additionally prints the metric report (and keeps the
    extra model registration described in the NOTE inside the body).

    The data is read from ``processed_data_.csv`` in the directory of this
    script, rows with missing values are dropped, and 20% of the rows are held
    out (``random_state=42``) for evaluation.

    Raises:
        SystemExit: if fewer than four hyperparameters are supplied.
        ValueError: if a hyperparameter cannot be converted to its numeric type.
    """
    # Parse hyperparameters first so a bad invocation fails before any data is loaded.
    if len(sys.argv) < 5:
        raise SystemExit(
            "usage: python train_gbt.py <learning_rate> <n_estimators> <max_depth> <verbose>"
        )
    learning_rate = float(sys.argv[1])
    n_estimators = int(sys.argv[2])
    max_depth = int(sys.argv[3])
    verbose = int(sys.argv[4])

    # Load the data and drop incomplete rows.
    path = os.path.dirname(__file__)
    filename = os.path.join(path, 'processed_data_.csv')
    data = utils.load_data(path=filename).dropna()

    # Split into independent variables and target.
    # NOTE(review): this column list contains 'is_canceled', which is also the
    # target, plus 'reservation_status_processed' -- confirm in
    # Utils.features_target that the target does not end up among the features.
    columns = [
        'is_canceled', 'lead_time', 'arrival_date_week_number',
        'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
        'babies',
        'previous_bookings_not_canceled', 'year_res_status_date',
        'month_res_status_date', 'day_res_status_date', 'hotel_City Hotel',
        'hotel_Resort Hotel', 'meal_BB', 'meal_FB', 'meal_HB', 'meal_SC',
        'meal_Undefined', 'market_segment_Aviation',
        'market_segment_Complementary', 'market_segment_Corporate',
        'market_segment_Direct', 'market_segment_Groups',
        'market_segment_Offline TA/TO', 'market_segment_Online TA',
        'market_segment_Undefined', 'distribution_channel_Corporate',
        'distribution_channel_Direct', 'distribution_channel_GDS',
        'distribution_channel_TA/TO', 'distribution_channel_Undefined',
        'reserved_room_type_A', 'reserved_room_type_B', 'reserved_room_type_C',
        'reserved_room_type_D', 'reserved_room_type_E', 'reserved_room_type_F',
        'reserved_room_type_G', 'reserved_room_type_H', 'reserved_room_type_L',
        'reserved_room_type_P', 'deposit_type_No Deposit',
        'deposit_type_Non Refund', 'deposit_type_Refundable',
        'customer_type_Contract', 'customer_type_Group',
        'customer_type_Transient', 'customer_type_Transient-Party',
        'reservation_status_processed',
    ]
    X, y = utils.features_target(data, columns, ["is_canceled"])

    # Train / validation split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    print('Tracking directory:', mlflow.get_tracking_uri())

    with mlflow.start_run():

        estimator = GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
        )
        estimator.fit(X_train, y_train)
        accuracy, recall, roc_score = utils.eval_metrics(y_test, y_pred=estimator.predict(X_test))

        # `verbose` decides whether the report is printed.
        if verbose > 0:
            utils.report(estimator, accuracy, recall, roc_score)

        # Tracking must not depend on verbosity: always record params and metrics.
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("roc_score", roc_score)

        registered_name = f"sklearn-{n_estimators}-GradientBoostingTrees"

        # NOTE(review): when verbose > 0 the model is logged and registered twice
        # (here and again before mlflow.evaluate), so such a run creates two
        # registry versions. Kept unchanged because the notebook later promotes
        # version=2 of the registered model; drop this first call once that
        # promotion step no longer hard-codes the version number.
        if verbose > 0:
            mlflow.sklearn.log_model(
                sk_model=estimator,
                artifact_path="model",
                registered_model_name=registered_name,
            )

        # -------------------------------------------------------------------------------
        # Model evaluation
        #
        # Work on a copy so the target column is not added to X_test itself.
        eval_data = X_test.copy()
        eval_data['target'] = y_test

        model_info = mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path="model",
            registered_model_name=registered_name,
        )
        mlflow.evaluate(
            model_info.model_uri,
            eval_data,
            targets="target",
            model_type="classifier",  # "regressor" | "classifier"
        )


# Script entry point: train only when the file is executed directly
# (the MLproject command runs it as `python train_gbt.py ...`).
if __name__ == "__main__":
    run()

Overwriting train_gbt.py


In [28]:
%%writefile MLproject
# MLflow project definition for the bookings Gradient Boosting trainer.
name: Proyecto Bookings

entry_points:
    main:
        # Hyperparameters; each one is substituted into `command` below and
        # reaches train_gbt.py as a positional argument, in this order.
        parameters:
            learning_rate:
                type: float
                default: 0.001
            n_estimators:
                type: int
                default: 300
            max_depth:
                type: int
                default: 5
            verbose:
                type: int
                default: 1
        command: 'python train_gbt.py {learning_rate} {n_estimators} {max_depth} {verbose}'

Overwriting MLproject


In [29]:
#
# Run the project with the default parameters declared in MLproject
#
!mlflow run --env-manager=local . 

Tracking directory: file:///c:/Users/jdbul/OneDrive/Escritorio/Especializacion-Analitica/Asignaturas/ProductosDatos/DataProducts/Producto_datos/MlFlow/mlruns
GradientBoostingClassifier(learning_rate=0.001, max_depth=5, n_estimators=300):
  Accuracy: 0.670952341067091
  Recall: 0.11957375210319686
  ROC Score: 0.5595195499806234


2023/03/26 23:25:52 INFO mlflow.projects.utils: === Created directory C:\Users\jdbul\AppData\Local\Temp\tmp8drk1piz for downloading remote URIs passed to arguments of type 'path' ===
2023/03/26 23:25:52 INFO mlflow.projects.backend.local: === Running command 'python train_gbt.py 0.001 300 5 1' in run with ID 'b7643600cddb4cb08c4c7d25dcc57cff' === 
Successfully registered model 'sklearn-300-GradientBoostingTrees'.
2023/03/26 23:26:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-300-GradientBoostingTrees, version 1
Created version '1' of model 'sklearn-300-GradientBoostingTrees'.
Registered model 'sklearn-300-GradientBoostingTrees' already exists. Creating a new version of this model...
2023/03/26 23:26:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-300-GradientBoostingTrees, versio

In [30]:
# Run the project again, overriding every hyperparameter with -P name=value.
!mlflow run --env-manager=local . -P learning_rate=0.1 -P n_estimators=400 -P max_depth=10 -P verbose=1

Tracking directory: file:///c:/Users/jdbul/OneDrive/Escritorio/Especializacion-Analitica/Asignaturas/ProductosDatos/DataProducts/Producto_datos/MlFlow/mlruns
GradientBoostingClassifier(max_depth=10, n_estimators=400):
  Accuracy: 0.7407236787000586
  Recall: 0.5621985417835109
  ROC Score: 0.704644014592885


2023/03/26 23:27:31 INFO mlflow.projects.utils: === Created directory C:\Users\jdbul\AppData\Local\Temp\tmpk6w2w6pv for downloading remote URIs passed to arguments of type 'path' ===
2023/03/26 23:27:31 INFO mlflow.projects.backend.local: === Running command 'python train_gbt.py 0.1 400 10 1' in run with ID '46043091227a4f1dabcd2cd2eaff61cf' === 
Successfully registered model 'sklearn-400-GradientBoostingTrees'.
2023/03/26 23:28:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-400-GradientBoostingTrees, version 1
Created version '1' of model 'sklearn-400-GradientBoostingTrees'.
Registered model 'sklearn-400-GradientBoostingTrees' already exists. Creating a new version of this model...
2023/03/26 23:28:51 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-400-GradientBoostingTrees, version

## Predicciones

In [26]:
#Librerías
import pandas as pd
import numpy as np
import mlflow
from sklearn.model_selection import train_test_split
import os
import warnings
warnings.filterwarnings("ignore")
from utils import Utils
utils = Utils()



# Load the processed bookings data from the notebook's working directory.
# `__file__` does not exist inside a notebook; the earlier
# os.path.dirname("__file__") took the dirname of a string literal and was
# always '', so the file was already being resolved against the working directory.
path = os.getcwd()
filename = os.path.join(path, 'processed_data_.csv')
data = utils.load_data(path=filename).dropna()

# Split into independent variables and target, using the same column list and
# target as train_gbt.py.
columns = [
    'is_canceled', 'lead_time', 'arrival_date_week_number',
    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
    'babies',
    'previous_bookings_not_canceled', 'year_res_status_date',
    'month_res_status_date', 'day_res_status_date', 'hotel_City Hotel',
    'hotel_Resort Hotel', 'meal_BB', 'meal_FB', 'meal_HB', 'meal_SC',
    'meal_Undefined', 'market_segment_Aviation',
    'market_segment_Complementary', 'market_segment_Corporate',
    'market_segment_Direct', 'market_segment_Groups',
    'market_segment_Offline TA/TO', 'market_segment_Online TA',
    'market_segment_Undefined', 'distribution_channel_Corporate',
    'distribution_channel_Direct', 'distribution_channel_GDS',
    'distribution_channel_TA/TO', 'distribution_channel_Undefined',
    'reserved_room_type_A', 'reserved_room_type_B', 'reserved_room_type_C',
    'reserved_room_type_D', 'reserved_room_type_E', 'reserved_room_type_F',
    'reserved_room_type_G', 'reserved_room_type_H', 'reserved_room_type_L',
    'reserved_room_type_P', 'deposit_type_No Deposit',
    'deposit_type_Non Refund', 'deposit_type_Refundable',
    'customer_type_Contract', 'customer_type_Group',
    'customer_type_Transient', 'customer_type_Transient-Party',
    'reservation_status_processed',
]
X, y = utils.features_target(data, columns, ["is_canceled"])

# Train / validation split with the same test_size and random_state as the
# training script.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

[0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0

In [36]:
def change_name(name="sklearn-400-GradientBoostingTrees", version=2, stage="Production"):
    """Transition a registered model version to a Model Registry stage.

    Despite its name, this function renames nothing: it asks the MLflow
    client to move ``version`` of the registered model ``name`` to ``stage``.
    The defaults reproduce the values that used to be hard-coded.

    Args:
        name: Registered model name.
        version: Version of the registered model to transition.
        stage: Target stage passed to the registry.
    """
    import mlflow

    client = mlflow.tracking.MlflowClient()
    client.transition_model_version_stage(
        name=name,
        version=version,
        stage=stage,
    )
def predict(data=None, model_name="sklearn-400-GradientBoostingTrees", stage="Production"):
    """Load a registered model by stage and return its predictions.

    Args:
        data: Rows to score. When omitted, the first 200 rows of the
            notebook-level ``X_test`` are used, as before.
        model_name: Registered model name used in the ``models:/`` URI.
        stage: Registry stage used in the ``models:/`` URI.

    Returns:
        Whatever the loaded pyfunc model's ``predict`` returns for ``data``.
    """
    if data is None:
        # Default sample: relies on X_test created by the data-preparation cell.
        data = X_test[0:200]

    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{model_name}/{stage}"
    )

    return model.predict(data)
# Move the registered model version to its target stage first, so the
# stage-based model URI used by predict() can be resolved.
change_name()
# Last expression of the cell: the predictions are displayed as its output.
predict()

array([0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.])