## Import dependencies

In [1]:
import os
import pickle
import uuid
import warnings

import mlflow
import mlflow.xgboost
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error

warnings.filterwarnings("ignore")

## Data Handling

In [2]:
def read_and_handling(filepath, verbose=False):
    """Load one parquet file of taxi trips and prepare it for duration modelling.

    Steps, in order:
      1. Read the parquet file at ``filepath``.
      2. Parse the pickup/dropoff columns as datetimes and derive ``duration``
         (dropoff minus pickup, expressed in minutes).
      3. Cast ``PULocationID`` / ``DOLocationID`` to ``str``.
      4. Keep only rows whose duration lies in [1, 60] minutes (inclusive).

    Parameters
    ----------
    filepath : str or path-like
        Passed unchanged to ``pd.read_parquet``.
    verbose : bool, default False
        When True, print the initial shape, the standard deviation of the
        unfiltered durations and the share of rows kept by the filter.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, including the added ``duration`` column.
    """
    categorical = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(filepath)
    if verbose:
        print(f"Data loaded from {filepath}...")
        print(f"Initial shape: {df.shape}")
        print(f"Number of columns: {df.shape[1]}\n")

    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion; avoids one Python-level
    # lambda call per row.
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    if verbose:
        print(f"Standard deviation of the trips duration: {round(df['duration'].std(), 2)}\n")

    df[categorical] = df[categorical].astype(str)

    # Outlier handling: keep trips between 1 and 60 minutes, both ends included.
    # .copy() returns an independent frame so later column assignments by the
    # caller do not operate on a slice of the unfiltered data.
    initial_rows = len(df)
    df = df[df['duration'].between(1, 60)].copy()

    if verbose:
        final_rows = len(df)
        kept_pct = 100 * final_rows / initial_rows if initial_rows else 0.0
        print(f"Initial records: {initial_rows}")
        print(f"Final records: {final_rows}")
        print(f"Fraction of the records after outliers handling: {kept_pct:.2f}%\n")

    return df

## ML Model Development

### Feature Engineering

In [3]:
def feature_engineering(df_train, df_val):
    """Vectorise the train/validation frames and log their shapes to MLflow.

    A ``DictVectorizer`` is fitted on the training records (pickup and dropoff
    location IDs) and then applied, without refitting, to the validation
    records. The ``duration`` column of each frame is returned as the target.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames containing ``PULocationID``, ``DOLocationID`` and ``duration``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, dv)`` where ``dv`` is the fitted
        ``DictVectorizer``.
    """
    print("Feature engineering...\n")

    label_column = 'duration'
    numeric_columns = []
    category_columns = ['PULocationID', 'DOLocationID']
    for key, value in (
        ("target", label_column),
        ("numerical_columns", numeric_columns),
        ("categorical_columns", category_columns),
    ):
        mlflow.log_param(key, value)

    # Same column selection is used for both splits.
    feature_columns = category_columns + numeric_columns

    print("One-hot encoding...\n")

    # --- training split: fit + transform ---
    dv = DictVectorizer()
    train_records = df_train[feature_columns].to_dict(orient='records')
    X_train = dv.fit_transform(train_records)
    y_train = df_train[label_column].values
    for key, value in (
        ("X_train_shape", X_train.shape),
        ("y_train_shape", y_train.shape),
        ("X_train_columns", X_train.shape[1]),
    ):
        mlflow.log_param(key, value)

    n_features = len(dv.get_feature_names_out())
    print(f"Number of features: {n_features}")
    print(f"Training shape: {X_train.shape}")
    print(f"Dimensionality of this matrix (number of columns): {X_train.shape[1]}\n")

    # --- validation split: transform only, reusing the fitted vectoriser ---
    val_records = df_val[feature_columns].to_dict(orient='records')
    X_val = dv.transform(val_records)
    y_val = df_val[label_column].values
    for key, value in (
        ("X_val_shape", X_val.shape),
        ("y_val_shape", y_val.shape),
        ("X_val_columns", X_val.shape[1]),
    ):
        mlflow.log_param(key, value)

    print(f"Validation shape: {X_val.shape}")
    print(f"Dimensionality of this matrix (number of columns): {X_val.shape[1]}")

    return X_train, y_train, X_val, y_val, dv

### Linear Regression Model Development

In [4]:
def linearRegression_modelDev(X_train, y_train, X_val, y_val, dv):
    """Fit a LinearRegression baseline, log it to MLflow and pickle it to disk.

    Logs the model tag, model class name and default hyper-parameters, the
    training and validation RMSE, and finally the pickled ``(dv, model)``
    tuple as an artifact under ``models_pickle``.

    Parameters
    ----------
    X_train, y_train : feature matrix and target used for fitting.
    X_val, y_val : feature matrix and target used for validation.
    dv : the fitted vectoriser; stored next to the model in the pickle so the
        two can be loaded together.

    Returns
    -------
    None
    """
    # One definition of the output path, shared by the writer and the logger.
    model_path = 'models/ModelManagement_testTracked_LRmodel.bin'

    # Training the model
    print("Training the model...")
    mlflow.set_tag("model", "linear_regression")

    model = LinearRegression()
    mlflow.log_param("model_type", type(model).__name__)
    mlflow.log_param("model_name", model.__class__.__name__)
    mlflow.log_param("model_params", model.get_params())

    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    train_rmse = root_mean_squared_error(y_train, train_pred)
    mlflow.log_metric("train_rmse", train_rmse)

    print(f"Training RMSE: {train_rmse:.2f}\n")

    # Model Validation
    print("Model Validation...")

    val_pred = model.predict(X_val)
    val_rmse = root_mean_squared_error(y_val, val_pred)
    mlflow.log_metric("val_rmse", val_rmse)

    print(f"Validation RMSE: {val_rmse:.2f}\n")

    # Save the model. The target directory is created first so the function
    # also works on a fresh checkout where models/ does not exist yet.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f_out:
        pickle.dump((dv, model), f_out)

    mlflow.log_artifact(local_path=model_path, artifact_path='models_pickle')

### XGBoost Model Development

In [5]:
def XGBoost_modelDev(X_train, y_train, X_val, y_val, dv):
    """Run a 50-trial hyperopt (TPE) search over XGBoost hyper-parameters.

    Every trial trains a booster inside its own nested MLflow run, logging the
    sampled parameters, a few descriptive tags and the validation RMSE. The
    best raw values reported by ``fmin`` are then logged on the enclosing run
    with a ``best_`` prefix.

    This function only searches; it does not train or persist a final model.

    Parameters
    ----------
    X_train, y_train : feature matrix and target used for fitting.
    X_val, y_val : feature matrix and target used for early stopping and scoring.
    dv : accepted for signature parity with ``linearRegression_modelDev``;
        not used by the search.

    Returns
    -------
    dict
        The ``best`` mapping returned by ``hyperopt.fmin``.
    """
    # Training the model
    print("Training the XGBoost model...\n")

    mlflow.set_tag("model", "XGBoost")

    # Build the DMatrix objects once: they do not depend on the sampled
    # parameters, so rebuilding them in every trial was pure overhead.
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)

    # Hyperparameter tuning
    def objective(params):
        """Train one candidate configuration and return its validation RMSE as the loss."""
        with mlflow.start_run(nested=True):
            # Tags specific to this evaluation
            mlflow.set_tag("evaluation_id", str(uuid.uuid4())[:8])  # short unique ID for this evaluation
            mlflow.set_tag("evaluation_step", "hyperparameter_tuning")

            # Tag specific parameter ranges
            if params['learning_rate'] < 0.1:
                mlflow.set_tag("learning_rate_range", "low")
            else:
                mlflow.set_tag("learning_rate_range", "high")

            # Log parameters within this child run
            mlflow.log_params(params)

            booster = xgb.train(
                params=params,
                dtrain=train_matrix,
                num_boost_round=1000,
                evals=[(valid_matrix, 'validation')],
                early_stopping_rounds=50,
            )

            y_pred = booster.predict(valid_matrix)
            val_rmse = root_mean_squared_error(y_val, y_pred)

            # Log the metric within this child run
            mlflow.log_metric("val_rmse", val_rmse)
            print(f"Validation RMSE: {val_rmse:.2f}\n")

        return {'loss': val_rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0), # exp(-3), exp(0) -> [0.05, 1]
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        # 'reg:linear' is the deprecated alias of the squared-error objective.
        'objective': 'reg:squarederror',
        'seed': 42,
    }

    best = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=Trials()
    )

    print(f"Best hyperparameters: {best}\n")
    mlflow.log_params({f"best_{k}": v for k, v in best.items()})

    return best

## Integrating All Processes

In [6]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("TrialExperimentNYC_Taxi")

# Single source of truth for the input files, so the path logged to MLflow is
# always the path that was actually loaded.
TRAIN_DATA_PATH = "../01-intro/data/yellow_tripdata_2023-01.parquet"
VALID_DATA_PATH = "../01-intro/data/yellow_tripdata_2023-02.parquet"

# The hyperopt search trains up to 50 boosters; leave it off for a normal
# Run All and flip it to True only when the search should be repeated.
RUN_HYPERPARAMETER_SEARCH = False

with mlflow.start_run():
    mlflow.set_tag("developer", "sutiwas-jitsopak")

    mlflow.log_param("train-data-path", TRAIN_DATA_PATH)
    mlflow.log_param("valid-data-path", VALID_DATA_PATH)

    # Data Loading, Preprocessing and Anomaly Handling
    df_train = read_and_handling(TRAIN_DATA_PATH)
    df_val = read_and_handling(VALID_DATA_PATH)
    mlflow.log_param("train_shape", df_train.shape)
    mlflow.log_param("valid_shape", df_val.shape)

    # ML Model Development: Feature Engineering
    X_train, y_train, X_val, y_val, dv = feature_engineering(df_train, df_val)

    # ML Model Development: Linear Regression
    linearRegression_modelDev(X_train, y_train, X_val, y_val, dv)
    print("Linear Regression Model training and evaluation completed.\n")

    # ML Model Development: XGBoost hyperparameter tuning (optional, expensive).
    # The final XGBoost model is trained in the cell that follows this one.
    if RUN_HYPERPARAMETER_SEARCH:
        XGBoost_modelDev(X_train, y_train, X_val, y_val, dv)

Feature engineering...

One-hot encoding...

Number of features: 515
Training shape: (3009173, 515)
Dimensionality of this matrix (number of columns): 515

Validation shape: (2855951, 515)
Dimensionality of this matrix (number of columns): 515
Training the model...
Training RMSE: 7.65

Model Validation...
Validation RMSE: 7.81

Linear Regression Model training and evaluation completed.



In [8]:
# Final XGBoost training with the hyper-parameters selected by the search.
# Uses X_train / y_train / X_val / y_val / dv produced by the pipeline cell above.
params = {
    'learning_rate': 0.9846137718216238,
    'max_depth': 90,
    'min_child_weight': 7.97363247466668,
    # 'reg:linear' is the deprecated alias of the squared-error objective.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.05098434242156559,
    'reg_lambda': 0.0025084928043955963,
    'seed': 42,
    }

# Log inside an explicit run. Calling mlflow.log_* with no active run makes
# MLflow start one implicitly and leave it open, so re-running the cell with
# different params would try to overwrite params on that same run and fail.
with mlflow.start_run():
    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=xgb.DMatrix(X_train, label=y_train),
        num_boost_round=1000,
        evals=[(xgb.DMatrix(X_val, label=y_val), 'validation')],
        early_stopping_rounds=50
    )
    y_pred = booster.predict(xgb.DMatrix(X_val))
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Save the preprocessor; create models/ first so a fresh checkout works.
    os.makedirs('models', exist_ok=True)
    with open('models/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

[0]	validation-rmse:6.72487
[1]	validation-rmse:5.68318
[2]	validation-rmse:5.54662
[3]	validation-rmse:5.34575
[4]	validation-rmse:5.29511
[5]	validation-rmse:5.27278
[6]	validation-rmse:5.26864
[7]	validation-rmse:5.26566
[8]	validation-rmse:5.26310
[9]	validation-rmse:5.26259
[10]	validation-rmse:5.25752
[11]	validation-rmse:5.25623
[12]	validation-rmse:5.25562
[13]	validation-rmse:5.25447
[14]	validation-rmse:5.25371
[15]	validation-rmse:5.25261
[16]	validation-rmse:5.25205
[17]	validation-rmse:5.25150
[18]	validation-rmse:5.25110
[19]	validation-rmse:5.25025
[20]	validation-rmse:5.24992
[21]	validation-rmse:5.24941
[22]	validation-rmse:5.24920
[23]	validation-rmse:5.24872
[24]	validation-rmse:5.24822
[25]	validation-rmse:5.24797
[26]	validation-rmse:5.24739
[27]	validation-rmse:5.24718
[28]	validation-rmse:5.24668
[29]	validation-rmse:5.24717
[30]	validation-rmse:5.24509
[31]	validation-rmse:5.24336
[32]	validation-rmse:5.24481
[33]	validation-rmse:5.24297
[34]	validation-rmse:5.2



<mlflow.models.model.ModelInfo at 0x29d5cd2e5a0>

In [9]:
# Resolve the model URI from the run that logged the booster in this session
# instead of a hard-coded run id: every execution creates a new run id, so a
# literal id stops matching after Restart Kernel -> Run All.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found in this session; run the XGBoost training cell first."
    )
logged_model = f"runs:/{last_run.info.run_id}/models_mlflow"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [10]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 9f9ab13b5b604ae898a9868c4214cd02

In [11]:
# Load the same logged model through the native XGBoost flavor; unlike the
# pyfunc wrapper loaded above, this yields the underlying xgboost Booster.
xgboost_model = mlflow.xgboost.load_model(logged_model)

In [12]:
xgboost_model

<xgboost.core.Booster at 0x29db372e9c0>

In [13]:
# Score the validation matrix with the reloaded booster. Note the capital "P":
# y_Pred is a different variable from y_pred, which holds the predictions of
# the in-memory booster from the training cell.
y_Pred = xgboost_model.predict(xgb.DMatrix(X_val))

In [14]:
y_Pred[:10]

array([ 6.2958965, 44.158443 , 15.938881 , 21.573524 , 25.240328 ,
        5.4894414, 20.11293  ,  5.903235 ,  6.787964 , 11.924575 ],
      dtype=float32)

In [15]:
# Element-wise check that the in-memory booster (y_pred) and the booster
# reloaded from MLflow (y_Pred) agree. Only the first 10 predictions are compared.
y_pred[:10] == y_Pred[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

## Logging models in MLflow

Two options:
- Log the model as a generic artifact: `mlflow.log_artifact("my_model", artifact_path="models/")`
- Log the model with the framework-specific `log_model` method: `mlflow.<model_framework>.log_model(model, artifact_path="models/")`