In [41]:
import pandas as pd
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn import ensemble
import mlflow

In [42]:
def load_data(filename="bike_clean_for_training.csv"):
    """Read the cleaned bike training CSV and split it into features and target.

    Parameters
    ----------
    filename : str or path-like, optional
        Comma-separated file to read. Defaults to
        ``"bike_clean_for_training.csv"`` (resolved against the current
        working directory), which is what every caller in this notebook uses.

    Returns
    -------
    x : pandas.DataFrame
        Every column that remains after removing the unused columns and the
        target column ``count``.
    y : pandas.Series
        The ``count`` column (the regression target).

    Raises
    ------
    KeyError
        If ``count`` or any of the columns to discard is missing from the file.
    """
    # Columns that are read from the file but not used as model features.
    unused_columns = [
        'date', 'holiday', 'year', 'instant', 'casual',
        'last_modified', 'registered', 'humidity',
    ]

    data = pd.read_csv(filename, sep=",")
    y = data["count"]
    # Non-mutating drop: `data` stays intact and `x` is an independent frame.
    x = data.drop(columns=unused_columns + ["count"])

    return x, y


def make_train_test_split(x, y):
    """Partition features and target into train and test subsets.

    A quarter of the rows is held out for testing. The random state is fixed,
    so identical inputs produce the same partition on every call.

    Returns the tuple ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    holdout_fraction = 0.25
    seed = 123456

    partitions = train_test_split(
        x, y, test_size=holdout_fraction, random_state=seed
    )
    # train_test_split returns a list; callers receive a 4-tuple.
    return tuple(partitions)


def eval_metrics(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns the tuple ``(mse, mae, r2)``: mean squared error, mean absolute
    error and the R^2 coefficient of determination, in that order.
    """
    from sklearn import metrics

    return (
        metrics.mean_squared_error(y_true, y_pred),
        metrics.mean_absolute_error(y_true, y_pred),
        metrics.r2_score(y_true, y_pred),
    )


def report(estimator, mse, mae, r2):
    """Print the estimator followed by its three indented evaluation metrics."""
    summary_lines = [
        f"{estimator!s}:",
        f"  MSE: {mse}",
        f"  MAE: {mae}",
        f"  R2: {r2}",
    ]
    print("\n".join(summary_lines))


# def log_metrics(mse, mae, r2):
#
#     import mlflow
#
#     mlflow.log_metric("mse", mse)
#     mlflow.log_metric("mae", mae)
#     mlflow.log_metric("r2", r2)


def make_pipeline(estimator):
    """Wrap ``estimator`` in a two-step scikit-learn pipeline.

    The first step (``"minMaxScaler"``) applies a ``MinMaxScaler`` to the
    features; the second step (``"estimator"``) is the estimator that was
    passed in.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler

    scaling_step = ("minMaxScaler", MinMaxScaler())
    model_step = ("estimator", estimator)

    return Pipeline(steps=[scaling_step, model_step])


def set_tracking_uri():
    """Point MLflow tracking at the SQLite database ``mlruns.db``."""
    import mlflow

    sqlite_store_uri = 'sqlite:///mlruns.db'
    mlflow.set_tracking_uri(sqlite_store_uri)


def display_config():
    """Print the model-registry URI and the tracking URI MLflow currently reports."""
    import mlflow

    registry_uri = mlflow.get_registry_uri()
    print(f"Current model registry uri: {registry_uri}")

    tracking_uri = mlflow.get_tracking_uri()
    print(f"      Current tracking uri: {tracking_uri}")

In [43]:
def xgboost(model, model_name, model_type):
    """Train ``model`` in a scaling pipeline, log it to MLflow and register it.

    Parameters
    ----------
    model : estimator
        Scikit-learn compatible regressor; it becomes the final step of the
        pipeline built by ``make_pipeline``.
    model_name : str
        Middle part of the registered model name.
    model_type : str
        Trailing part of the registered model name.

    Returns
    -------
    str
        The registry name used, ``"sklearn-{model_name}-{model_type}-model"``.
    """
    # Import the sklearn flavor explicitly instead of relying on attribute
    # access on the top-level package.
    import mlflow.sklearn

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    # Use the estimator supplied by the caller. Previously the global `xgb`
    # was read here, so the `model` argument was silently ignored.
    estimator = make_pipeline(
        estimator=model,
    )

    with mlflow.start_run(run_name="Test_for_mlflow") as run:

        print(f"MLflow run ID: {run.info.run_id}")

        estimator.fit(x_train, y_train)

        mse, mae, r2 = eval_metrics(
            y_true=y_test,
            y_pred=estimator.predict(x_test),
        )

        report(estimator, mse, mae, r2)

        #
        # Metrics
        #
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        #
        # Log the fitted pipeline and register it under the derived name.
        # If the name is already registered, this adds a new version.
        #
        registered_model_name = f"sklearn-{model_name}-{model_type}-model"

        mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path="model",
            registered_model_name=registered_model_name
        )
    return registered_model_name


In [45]:
# The variable is called `xgb`, but the estimator is scikit-learn's
# GradientBoostingRegressor, not an XGBoost model.
xgb = ensemble.GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=10,
    min_samples_split=8,
)

# Train, log and register the model; keep the registry name for the cells below.
registered_model_name = xgboost(
    model=xgb,
    model_name='xg_boost',
    model_type='regressor',
)

MLflow run ID: 09e404901cf242399e955c095bb9ff53
Pipeline(steps=[('minMaxScaler', MinMaxScaler()),
                ('estimator',
                 GradientBoostingRegressor(learning_rate=0.01, max_depth=10,
                                           min_samples_split=8,
                                           n_estimators=1000))]):
  MSE: 1397.5092064967448
  MAE: 22.578807427167387
  R2: 0.9227849055355969


Registered model 'sklearn-xg_boost-regressor-model' already exists. Creating a new version of this model...
2023/03/31 06:06:21 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-xg_boost-regressor-model, version 2
Created version '2' of model 'sklearn-xg_boost-regressor-model'.


In [46]:
#
# Load a specific version of the registered model
#
def call_by_version_predict(model_name, version):
    """Predict the first ten test rows with one registered model version.

    Parameters
    ----------
    model_name : str
        Name of the model in the MLflow model registry.
    version : int or str
        Registry version to load; interpolated into the
        ``models:/{model_name}/{version}`` URI.

    Returns
    -------
    The output of the loaded model's ``predict`` on the first ten rows of the
    test split.
    """
    x, y = load_data()
    # Only the held-out features are needed here.
    _, x_test, _, _ = make_train_test_split(x, y)

    # Use the `version` argument. Previously the global `model_version` was
    # read here, so the parameter was ignored and the call failed with a
    # NameError whenever that global had not been defined.
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{model_name}/{version}"
    )

    return model.predict(x_test[0:10])

In [47]:
# Registry version to load; defined once and passed explicitly to the helper.
model_version = 2
call_by_version_predict(registered_model_name, model_version)

array([  7.62097242, 100.40322098, 138.03603528, 144.77652224,
        30.3734835 , 205.48051765, 152.91148531,  15.07277402,
       187.80595454,   0.80448538])

In [49]:
def call_by_stage_predict(model_name, stage):
    """Predict the first ten test rows with the model registered under ``stage``.

    ``model_name`` and ``stage`` are combined into the registry URI
    ``models:/{model_name}/{stage}``, which is loaded as a generic pyfunc model.
    """
    import mlflow

    features, target = load_data()
    _, x_test, _, _ = make_train_test_split(features, target)

    stage_uri = f"models:/{model_name}/{stage}"
    staged_model = mlflow.pyfunc.load_model(model_uri=stage_uri)

    sample_rows = x_test[0:10]
    return staged_model.predict(sample_rows)


In [50]:
# Predict with whichever version is registered under the 'Production' stage.
# NOTE(review): no cell in this notebook moves a version to 'Production';
# presumably that was done outside the notebook (UI or client) - confirm.
call_by_stage_predict(registered_model_name,'Production')

array([  7.62097242, 100.40322098, 138.03603528, 144.77652224,
        30.3734835 , 205.48051765, 152.91148531,  15.07277402,
       187.80595454,   0.80448538])

In [51]:
def get_data_json_format():
    """Return the first test row as a shell-quoted JSON payload.

    The row is serialised with pandas' ``orient='split'`` layout and then
    quoted so it can be interpolated as a single argument into a shell
    command (see the ``!curl ... -d{data}`` cell).

    Returns
    -------
    str
        The JSON document wrapped in shell quoting.
    """
    import shlex

    x, y = load_data()
    _, x_test, _, _ = make_train_test_split(x, y)

    payload = x_test.iloc[0:1, :].to_json(orient='split')

    # shlex.quote gives real POSIX-shell quoting. The previous repr() call only
    # happened to work for payloads without quotes or backslashes: repr()
    # doubles backslashes and changes its quoting style when the text contains
    # quote characters, which corrupts the JSON the shell passes on.
    return shlex.quote(payload)

In [52]:
# Build the quoted JSON payload once; the shell cell below interpolates it.
data = get_data_json_format()
# Bare expression so the notebook displays the payload for inspection.
data

'\'{"columns":["season","month","hour","weekday","day_type","weather","temperature","warming_sensation","windspeed"],"index":[1550],"data":[[1,3,5,4,1,3,0.36,0.3485,0.194]]}\''

In [53]:
# POST the JSON payload to the /invocations endpoint on port 5000.
# NOTE(review): no cell in this notebook starts a server on that port;
# presumably a model server was launched separately - confirm.
!curl http://0.0.0.0:5000/invocations -H 'Content-type: application/json' -d{data}

[7.620972418735393]