In [6]:
import pandas as pd
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn import ensemble
import mlflow

In [1]:
def load_data(filename="bike_clean_for_training.csv"):
    """Read the bike CSV and split it into a feature frame and a target series.

    Parameters
    ----------
    filename : str or path-like, optional
        Comma-separated file to read. Defaults to the training file this
        notebook was written against, so existing ``load_data()`` calls keep
        working unchanged.

    Returns
    -------
    x : pandas.DataFrame
        All columns that remain after removing the unused columns and the
        target.
    y : pandas.Series
        The ``count`` column.

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``count``, is not in the file.
    """
    unused_columns = [
        "date",
        "holiday",
        "year",
        "instant",
        "casual",
        "last_modified",
        "registered",
        "humidity",
    ]

    data = pd.read_csv(filename, sep=",")

    # Non-mutating drop: the frame that was read is left as-is and a new frame
    # carries the features, instead of dropping in place and then copying.
    x = data.drop(columns=unused_columns)
    # pop() removes the target from the features and hands it back in one step.
    y = x.pop("count")

    return x, y


def make_train_test_split(x, y, test_size=0.25, random_state=123456):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    x, y
        Features and target, forwarded positionally to scikit-learn's
        ``train_test_split``.
    test_size : optional
        Forwarded as ``test_size``. Defaults to 0.25, the value this notebook
        has always used.
    random_state : optional
        Forwarded as ``random_state``. Defaults to 123456 so that every cell
        calling this function with the defaults gets the same partition.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return x_train, x_test, y_train, y_test


def eval_metrics(y_true, y_pred):
    """Score predictions against the true values.

    Returns a 3-tuple ``(mse, mae, r2)`` computed, in that order, with
    scikit-learn's ``mean_squared_error``, ``mean_absolute_error`` and
    ``r2_score``.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Order matters: callers unpack the result as (mse, mae, r2).
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)


def report(estimator, mse, mae, r2):
    """Print the estimator followed by its MSE, MAE and R2, one per line."""
    lines = [
        str(estimator) + ":",
        f"  MSE: {mse}",
        f"  MAE: {mae}",
        f"  R2: {r2}",
    ]
    print("\n".join(lines))


# def log_metrics(mse, mae, r2):
#
#     import mlflow
#
#     mlflow.log_metric("mse", mse)
#     mlflow.log_metric("mae", mae)
#     mlflow.log_metric("r2", r2)


def make_pipeline(estimator):
    """Wrap ``estimator`` in a two-step scikit-learn ``Pipeline``.

    The first step, named ``"minMaxScaler"``, is a fresh ``MinMaxScaler``;
    the second, named ``"estimator"``, is the object passed in.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler

    scaling_step = ("minMaxScaler", MinMaxScaler())
    model_step = ("estimator", estimator)

    return Pipeline(steps=[scaling_step, model_step])


def set_tracking_uri():
    """Point MLflow tracking at the SQLite URL ``sqlite:///mlruns.db``."""
    import mlflow

    tracking_uri = 'sqlite:///mlruns.db'
    mlflow.set_tracking_uri(tracking_uri)


def display_config():
    """Print the MLflow model-registry URI and tracking URI currently set."""
    import mlflow

    registry_uri = mlflow.get_registry_uri()
    print("Current model registry uri: {}".format(registry_uri))

    tracking_uri = mlflow.get_tracking_uri()
    print("      Current tracking uri: {}".format(tracking_uri))

In [2]:
def xgboost(model, model_name, model_type):
    """Train ``model`` inside the scaling pipeline, log it to MLflow and register it.

    The data is loaded with ``load_data`` and partitioned with
    ``make_train_test_split``; the pipeline is fitted on the training part and
    scored on the test part. MSE, MAE and R2 are printed and logged as MLflow
    metrics, and the fitted pipeline is logged under the artifact path
    ``"model"`` and registered in the model registry.

    Parameters
    ----------
    model
        Estimator placed as the final step of the pipeline built by
        ``make_pipeline``.
    model_name, model_type
        Interpolated into the registry name
        ``sklearn-{model_name}-{model_type}-model``.

    Returns
    -------
    str
        The name under which the model was registered.
    """
    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    # Use the estimator handed in by the caller. This previously read the
    # notebook-global ``xgb``, which ignored ``model`` and raised NameError
    # whenever that global had not been defined yet.
    estimator = make_pipeline(
        estimator=model,
    )

    with mlflow.start_run(run_name="Test_for_mlflow") as run:

        print(f"MLflow run ID: {run.info.run_id}")

        estimator.fit(x_train, y_train)

        mse, mae, r2 = eval_metrics(
            y_true=y_test,
            y_pred=estimator.predict(x_test),
        )

        report(estimator, mse, mae, r2)

        #
        # Metrics (no parameters are logged here)
        #
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        #
        # Log the fitted pipeline and register it in the model registry
        #
        registered_model_name = f"sklearn-{model_name}-{model_type}-model"

        mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path="model",
            registered_model_name=registered_model_name
        )
    return registered_model_name


In [7]:
set_tracking_uri()

# NOTE: despite the variable name, this is scikit-learn's
# GradientBoostingRegressor, not an estimator from the xgboost package.
xgb = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    min_samples_split=8,
)

registered_model_name = xgboost(
    model=xgb,
    model_name='xg_boost',
    model_type='regressor',
)

2023/03/31 19:25:05 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/03/31 19:25:05 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

MLflow run ID: 802b1a97494741eb9d0ce82cce453018
Pipeline(steps=[('minMaxScaler', MinMaxScaler()),
                ('estimator',
                 GradientBoostingRegressor(learning_rate=0.01, max_depth=10,
                                           min_samples_split=8,
                                           n_estimators=1000))]):
  MSE: 1399.8277017801327
  MAE: 22.595607288254616
  R2: 0.92265680417391


Successfully registered model 'sklearn-xg_boost-regressor-model'.
2023/03/31 19:25:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-xg_boost-regressor-model, version 1
Created version '1' of model 'sklearn-xg_boost-regressor-model'.


In [8]:
#
# Carga de una versión específica del modelo
#
def call_by_version_predict(model_name, version):
    """Load one registered version of a model and predict on ten test rows.

    Parameters
    ----------
    model_name
        Name of the model in the MLflow model registry.
    version
        Registry version to load; interpolated into
        ``models:/{model_name}/{version}``.

    Returns
    -------
    The result of ``model.predict`` on the first ten rows of the test split
    produced by ``load_data`` and ``make_train_test_split``.
    """
    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    # Build the URI from the ``version`` argument. This previously read the
    # notebook-global ``model_version``, so the argument was ignored and the
    # call failed with NameError when that global was not defined.
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{model_name}/{version}"
    )

    return model.predict(x_test[0:10])

In [10]:
# Request version 1 of the registered model and show its predictions.
model_version = 1
call_by_version_predict(
    model_name=registered_model_name,
    version=model_version,
)

array([7.65610851e+00, 1.00403221e+02, 1.37794170e+02, 1.44776522e+02,
       3.03734835e+01, 2.05480518e+02, 1.52911485e+02, 1.50727740e+01,
       1.87805955e+02, 7.52164207e-02])

In [11]:
def call_by_stage_predict(model_name, stage):
    """Load the model registered under ``model_name`` at ``stage`` and predict.

    The model is fetched from ``models:/{model_name}/{stage}`` and asked to
    predict the first ten rows of the test split produced by ``load_data`` and
    ``make_train_test_split``; that prediction is returned.
    """
    import mlflow

    features, target = load_data()
    # Only the test features (second element of the split) are needed here.
    held_out = make_train_test_split(features, target)[1]

    stage_uri = f"models:/{model_name}/{stage}"
    staged_model = mlflow.pyfunc.load_model(model_uri=stage_uri)

    return staged_model.predict(held_out[0:10])


In [12]:
# NOTE(review): no cell in this notebook moves a model version to the
# 'Production' stage; presumably that was done outside the notebook — confirm.
call_by_stage_predict(
    model_name=registered_model_name,
    stage='Production',
)

array([7.65610851e+00, 1.00403221e+02, 1.37794170e+02, 1.44776522e+02,
       3.03734835e+01, 2.05480518e+02, 1.52911485e+02, 1.50727740e+01,
       1.87805955e+02, 7.52164207e-02])

In [15]:
def get_data_json_format():
    """Return the first 20 test rows as shell-quoted, split-oriented JSON.

    The rows come from the test split produced by ``load_data`` and
    ``make_train_test_split`` and are serialised with
    ``DataFrame.to_json(orient='split')``. The JSON text is then quoted so it
    can be interpolated as a single argument into a shell command line.

    Returns
    -------
    str
        The JSON document wrapped in shell quoting.
    """
    import shlex

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)
    payload = x_test.iloc[0:20, :].to_json(orient='split')

    # shlex.quote is real shell quoting. repr() only looks like it: it doubles
    # backslashes and escapes single quotes Python-style, which corrupts any
    # payload containing those characters. For JSON without them, the result
    # is the same single-quoted string repr() produced.
    return shlex.quote(payload)

In [16]:
# Build the request payload once and echo it so the exact body is visible.
data = get_data_json_format()
data

'\'{"columns":["season","month","hour","weekday","day_type","weather","temperature","warming_sensation","windspeed"],"index":[1550,7011,6016,398,7739,5179,4019,4248,8446,28,2622,6993,4436,1898,1857,2504,1776,1831,3918,2962],"data":[[1,3,5,4,1,3,0.36,0.3485,0.194],[4,10,22,1,1,1,0.48,0.4697,0.1642],[3,9,10,2,1,1,0.68,0.6364,0.1343],[1,1,18,2,1,2,0.22,0.2273,0.1642],[4,11,6,4,0,1,0.3,0.3182,0.0896],[3,8,21,1,1,1,0.76,0.7121,0.0896],[3,6,13,2,1,1,0.74,0.697,0.1642],[3,7,2,5,1,1,0.62,0.6061,0.0],[1,12,18,5,1,1,0.36,0.3333,0.2537],[1,1,8,0,0,3,0.4,0.4091,0.2239],[2,4,8,0,0,1,0.52,0.5,0.2239],[4,10,4,1,1,1,0.4,0.4091,0.0],[3,7,22,5,1,2,0.66,0.6061,0.194],[2,3,1,5,1,1,0.2,0.2121,0.1642],[2,3,8,3,1,2,0.32,0.3182,0.194],[2,4,10,2,1,2,0.5,0.4848,0.2239],[1,3,21,6,0,1,0.4,0.4091,0.2985],[2,3,5,2,1,2,0.4,0.4091,0.1343],[2,6,8,5,1,2,0.6,0.5606,0.0],[2,5,12,0,0,1,0.6,0.6212,0.0896]]}\''

In [17]:
# POST the payload held in `data` to http://0.0.0.0:5000/invocations as JSON.
# NOTE(review): presumably an MLflow model server was started on port 5000
# outside this notebook — confirm before re-running.
!curl http://0.0.0.0:5000/invocations -H 'Content-type: application/json' -d{data}

[7.656108514238809, 100.40322098398147, 137.79417023039133, 144.7765222423391, 30.373483496299517, 205.48051764902056, 152.91148531346562, 15.072774019817802, 187.80595453827655, 0.07521642072032536, 94.56454169856332, 6.156448841015317, 185.67010487103957, 6.524058178438349, 242.5449982190584, 89.22416991297806, 78.48855132334272, 11.350267791321354, 414.92160605271107, 400.9094632047867]