In [22]:
import os
import pickle

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.svm import LinearSVR

In [2]:
# Use a local SQLite file (mlflow.db) as the MLflow tracking backend.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Make the experiment with the provided name the active one, creating it if needed.
# Its artifacts are stored under an mlruns/ folder (see artifact_location in the output).
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/ubuntu/mlops-datatalk/02_experiment_tracking/mlruns/1', creation_time=1715068447646, experiment_id='1', last_update_time=1715068447646, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# To delete an experiment by its id, uncomment the line below.
# mlflow.delete_experiment(experiment_id = '1')

In [4]:
# basic function
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Args:
        filename: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        A new DataFrame holding only the trips whose duration is between 1 and
        60 minutes (inclusive), with an added ``duration`` column in minutes and
        the ``PULocationID`` / ``DOLocationID`` columns cast to strings.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion instead of a Python call per row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame, so the column
    # assignment below cannot raise SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
# Read both frames with the same preprocessing: the 2021-01 file is used for
# training and the 2021-02 file for validation.
df_train, df_val = (
    read_dataframe(path)
    for path in (
        '../data/green_tripdata_2021-01.parquet',
        '../data/green_tripdata_2021-02.parquet',
    )
)

# Row counts that remain after the duration filter.
print(len(df_train), len(df_val))

73908 61921


In [6]:
# Join the pickup and drop-off ids into one "pickup_dropoff" categorical feature.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep = '_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep = '_')

# Regression target.
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

# Model inputs: one categorical and one numerical column.
categorical = ['PU_DO']
numerical = ['trip_distance']

train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')

# The vectorizer is fitted on the training records only and then reused,
# unchanged, for the validation records.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

#### Workflow without MLOps

In [7]:
# Baseline: plain linear regression with no experiment tracking.
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# Keep and print the score: a bare expression in the middle of a cell is
# evaluated and then thrown away, so the RMSE was never displayed.
rmse = root_mean_squared_error(y_val, y_pred)
print(f'validation RMSE: {rmse:.4f}')

# Create the output folder if needed so open() cannot fail on a fresh checkout.
os.makedirs('models', exist_ok = True)

# Persist the vectorizer together with the model.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

#### Workflow with MLOps

In [8]:
with mlflow.start_run():
    # Tag the run with its author.
    mlflow.set_tag("developer", "christopher")

    # Record which data files the run used. The frames are read from .parquet
    # files, so the logged paths carry that extension too (they said .csv before).
    mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.parquet")

    alpha = 0.1

    # Log the regularisation strength that is about to be used.
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha = alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)

    rmse = root_mean_squared_error(y_val, y_pred)

    # Log the validation RMSE.
    mlflow.log_metric("rmse", rmse)

    # Create the output folder if needed so open() cannot fail on a fresh checkout.
    os.makedirs('models', exist_ok = True)

    # Save the vectorizer and the model locally first ...
    with open('models/lin_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    # ... then attach that file to the run as an artifact.
    mlflow.log_artifact(local_path = "models/lin_reg.bin", artifact_path = "models_pickle")

#### MLOps with XGBoost training

In [9]:
# Wrap the feature matrices and their targets in XGBoost DMatrix objects
# (training pair first, validation pair second).
train, valid = (
    xgb.DMatrix(features, label = labels)
    for features, labels in ((x_train, y_train), (x_val, y_val))
)

In [10]:
def objective(params):
    """Train one XGBoost candidate inside its own MLflow run and score it.

    Args:
        params: Booster parameters; logged to MLflow and passed unchanged to
            ``xgb.train``.

    Returns:
        A dict with the validation RMSE under ``'loss'`` and ``STATUS_OK``
        under ``'status'``.
    """
    run_tags = {"developer": "christopher", "model": "xgboost"}

    with mlflow.start_run():
        # Describe the run, then record the candidate parameters.
        for tag_name, tag_value in run_tags.items():
            mlflow.set_tag(tag_name, tag_value)
        mlflow.log_params(params)

        # Up to 1000 rounds, stopping once the validation metric has not
        # improved for 50 rounds.
        model = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid, 'validation')],
            early_stopping_rounds = 50
        )

        validation_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

    return dict(loss = validation_rmse, status = STATUS_OK)

In [11]:
# Search space for the XGBoost hyperparameters.
# hp.loguniform takes its bounds in log space, e.g. learning_rate is drawn
# from [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

# Minimise the validation RMSE returned by objective() over 50 candidates.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 50,
    trials = Trials(),
    # Seed the sampler as well: without it every execution proposes different
    # candidates, even though each booster is trained with a fixed seed.
    rstate = np.random.default_rng(42)
)

print(best_result)

[0]	validation-rmse:11.41578                          
[1]	validation-rmse:10.72489                          
[2]	validation-rmse:10.12867                          
[3]	validation-rmse:9.61566                           
[4]	validation-rmse:9.17692                           
[5]	validation-rmse:8.80164                           
[6]	validation-rmse:8.48287                           
[7]	validation-rmse:8.21328                           
[8]	validation-rmse:7.98621                           
[9]	validation-rmse:7.79534                           
[10]	validation-rmse:7.63368                          
[11]	validation-rmse:7.49811                          
[12]	validation-rmse:7.38492                          
[13]	validation-rmse:7.28947                          
[14]	validation-rmse:7.20963                          
[15]	validation-rmse:7.14121                          
[16]	validation-rmse:7.08442                          
[17]	validation-rmse:7.03639                          
[18]	valid

KeyboardInterrupt: 

In [13]:
# Switch XGBoost autologging on for the training run below.
mlflow.xgboost.autolog(disable = False)

# Parameters chosen as the best ones; the model is retrained with them.
params = dict(
    learning_rate = 0.6653267087715755,
    max_depth = 20,
    min_child_weight = 19.965812980250803,
    objective = 'reg:squarederror',
    reg_alpha = 0.18289424399274412,
    reg_lambda = 0.0969372299406269,
    seed = 42,
)

with mlflow.start_run():
    # Nothing is logged by hand in this run; that is left to autologging.
    booster = xgb.train(
        params = params,
        dtrain = train,
        num_boost_round = 1000,
        evals = [(valid, 'validation')],
        early_stopping_rounds = 50
    )

[0]	validation-rmse:7.75328
[1]	validation-rmse:6.94322
[2]	validation-rmse:6.79259
[3]	validation-rmse:6.73933
[4]	validation-rmse:6.71455
[5]	validation-rmse:6.70595
[6]	validation-rmse:6.70304
[7]	validation-rmse:6.70092
[8]	validation-rmse:6.69706
[9]	validation-rmse:6.69438
[10]	validation-rmse:6.69379
[11]	validation-rmse:6.69064
[12]	validation-rmse:6.68937
[13]	validation-rmse:6.68716
[14]	validation-rmse:6.68538
[15]	validation-rmse:6.68556
[16]	validation-rmse:6.68103
[17]	validation-rmse:6.67992
[18]	validation-rmse:6.67886
[19]	validation-rmse:6.67828
[20]	validation-rmse:6.67755
[21]	validation-rmse:6.67640
[22]	validation-rmse:6.67589
[23]	validation-rmse:6.67511
[24]	validation-rmse:6.67438
[25]	validation-rmse:6.67411
[26]	validation-rmse:6.67320
[27]	validation-rmse:6.67253
[28]	validation-rmse:6.67236
[29]	validation-rmse:6.67170
[30]	validation-rmse:6.67115
[31]	validation-rmse:6.67068
[32]	validation-rmse:6.67050
[33]	validation-rmse:6.67022
[34]	validation-rmse:6.6



In [15]:
# Turn autologging off: this run records its tags, parameters, metric,
# preprocessor and model by hand.
mlflow.xgboost.autolog(disable = True)

with mlflow.start_run() as run:
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    # MLflow: describe the run
    mlflow.set_tag("developer", "christopher")
    mlflow.set_tag("model", "xgboost")
    # MLflow: store the parameters
    mlflow.log_params(best_params)

    booster = xgb.train(
        params = best_params,
        dtrain = train,
        num_boost_round = 1000,
        evals = [(valid, 'validation')],
        early_stopping_rounds = 50
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)

    # MLflow: RMSE metric
    mlflow.log_metric("rmse", rmse)

    # Create the output folder if needed so open() cannot fail on a fresh checkout.
    os.makedirs("models", exist_ok = True)

    # Save the fitted vectorizer locally first ...
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    # ... then upload that local file to MLflow as an artifact.
    mlflow.log_artifact("models/preprocessor.b", artifact_path = "preprocessor")

    # Store the booster under the run's "model" artifact path.
    mlflow.xgboost.log_model(booster, artifact_path = "model")

# Id of the run that was just recorded.
run_id = run.info.run_id

# Load the logged model back from MLflow, once through the generic pyfunc
# loader and once through the XGBoost loader.
logged_model = f'runs:/{run_id}/model'
loaded_model = mlflow.pyfunc.load_model(logged_model)
xgboost_model = mlflow.xgboost.load_model(logged_model)

# Predict on the validation DMatrix with the reloaded booster.
y_pred = xgboost_model.predict(valid)
y_pred

[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773
[30]	validation-rmse:6.59777
[31]	validation-rmse:6.58875
[32]	validation-rmse:6.58107
[33]	validation-rmse:6.57217
[34]	validation-rmse:



array([14.782765 ,  7.184751 , 15.971323 , ..., 13.464008 ,  6.7352147,
        8.184539 ], dtype=float32)

In [24]:
# Let MLflow autolog every scikit-learn fit in the loop below.
mlflow.sklearn.autolog()

# Train each regressor with its default settings in a separate MLflow run.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    with mlflow.start_run():
        # Tag the run with its author.
        mlflow.set_tag("developer", "christopher")
        # Record which data files the run used.
        mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.parquet")

        # Attach the pickled preprocessor; the file must already exist on disk.
        mlflow.log_artifact("models/preprocessor.b", artifact_path = "preprocessor")

        # Fixed seed: all four estimators are randomised, so without it the
        # logged RMSE would differ from one execution to the next.
        super_model = model_class(random_state = 42)
        super_model.fit(x_train, y_train)

        y_pred = super_model.predict(x_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log the validation RMSE.
        mlflow.log_metric("rmse", rmse)

KeyboardInterrupt: 