### Import the libraries

In [9]:
# Data manipulation
import pandas as pd
import numpy as np
import os

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Utilities
from pprint import pprint

# Modelling
import mlflow
from mlflow import MlflowClient
import pickle

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Set up experiment tracking with MLFlow

In [2]:
# Point MLflow at a local SQLite database for its tracking data
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

# Activate the experiment that the runs below are recorded under;
# the returned Experiment object is shown as this cell's output
mlflow.set_experiment(experiment_name="duration-trip-autolog")

<Experiment: artifact_location=('/Users/farelyue/Documents/Projects/Data '
 'Science/mlops-zoomcamp/02-experiment-tracking/mlruns/2'), creation_time=1728400989576, experiment_id='2', last_update_time=1728400989576, lifecycle_stage='active', name='duration-trip-autolog', tags={}>

### Helper functions

In [4]:
# Transform nyc-taxi dataset
def read_dataframe(file_path):
    """Load a trip parquet file and derive a filtered ``duration`` column.

    Parameters
    ----------
    file_path : str or path-like
        Location of the parquet file; handed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose trip duration lies between
        1 and 60 minutes (both inclusive). ``PULocationID`` and
        ``DOLocationID`` are cast to ``str`` and ``duration`` holds the trip
        length in minutes as a float.
    """
    # Load parquet dataset
    df = pd.read_parquet(file_path)

    # Convert pick up and drop off location into string format
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Convert pick up and drop off time column into datetime format
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Trip duration in minutes. The vectorised ``.dt`` accessor replaces a
    # per-row Python lambda and yields a float column even when the frame
    # is empty, so the numeric filter below always works.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips lasting between 1 and 60 minutes (inclusive on both ends)
    in_range = df['duration'].between(1, 60)

    # Return a real copy so callers can add columns without writing to a
    # slice of the unfiltered frame
    return df[in_range].copy()

# Fetch logged params, metrics, and model
def fetch_logged_data(run_id):
    """Collect what was logged for one MLflow run.

    Parameters
    ----------
    run_id : str
        Identifier of the run to look up through ``MlflowClient``.

    Returns
    -------
    tuple
        ``(params, metrics, tags, artifacts)`` where ``tags`` excludes keys
        starting with ``'mlflow.'`` and ``artifacts`` lists the paths found
        under the run's ``"model"`` artifact directory.
    """
    client = MlflowClient()
    run_data = client.get_run(run_id).data

    logged_params = run_data.params
    logged_metrics = run_data.metrics

    # Drop the tags whose key carries the 'mlflow.' prefix
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith('mlflow.'):
            continue
        user_tags[key] = value

    # Paths of everything stored under the "model" artifact directory
    model_artifacts = []
    for entry in client.list_artifacts(run_id, "model"):
        model_artifacts.append(entry.path)

    return logged_params, logged_metrics, user_tags, model_artifacts

In [5]:
# Parquet files used for training and for validation
df_train_path = 'data/green_tripdata_2021-01.parquet'
df_val_path = 'data/green_tripdata_2021-02.parquet'

# Read both files through the shared helper (training first, then validation)
df_train, df_val = (read_dataframe(path) for path in (df_train_path, df_val_path))

# Report the number of rows and columns of each split
train_rows, train_cols = df_train.shape
val_rows, val_cols = df_val.shape
print(f'Dimension of data train : {train_rows}, {train_cols}')
print(f'Dimension of data validation : {val_rows}, {val_cols}')

Dimension of data train : 73908, 21
Dimension of data validation : 61921, 21


### Data preparation

Feature engineering experimentation

In [6]:
# Build a combined "pickup_dropoff" location key on both splits
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

Prepare the training and validation data

In [7]:
# Columns fed to the model, and the column to predict
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

# One dict per row, holding only the selected feature columns
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse that fitted
# mapping to encode the validation rows
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Target arrays for both splits
y_train = df_train[target].values
y_val = df_val[target].values

Save DictVectorizer preprocessor

In [10]:
# Make sure the output folder exists (no error if it already does)
os.makedirs('models', exist_ok=True)

# Serialise the fitted DictVectorizer to disk so it can be reused later
with open('models/preprocessor.b', 'wb') as f_out:
    f_out.write(pickle.dumps(dv))

### Modelling with MLflow autolog

In [12]:
# Seed passed as `random_state` to every estimator trained below
RANDOM_STATE: int = 18

1. Random Forest

In [14]:
# Turn on scikit-learn autologging, with dataset logging switched off
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run() as run:

    # Tag the run with the developer's name
    mlflow.set_tags({'developer': 'farelyue'})

    # Record the data file paths as run parameters
    params = {
        'train-data-path': './data/green_tripdata_2021-01.parquet',
        'validation-data-path': './data/green_tripdata_2021-02.parquet',
    }
    mlflow.log_params(params)

    # Attach the pickled preprocessor to the run
    mlflow.log_artifact('models/preprocessor.b', 'preprocessor')

    # Fit a random forest on the training matrix
    rf = RandomForestRegressor(random_state=RANDOM_STATE)
    rf.fit(X_train, y_train)

    # Score the model on the validation split and log the RMSE
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    print({'Run ID': run.info.run_id, 'rmse': rmse})



{'Run ID': '3982aae14ad24edca76cf903b45350d9', 'rmse': 6.914909840472828}


2. Gradient Boosting Regressor

In [15]:
# Turn on scikit-learn autologging, with dataset logging switched off
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run() as run:

    # Tag the run with the developer's name
    mlflow.set_tags({'developer': 'farelyue'})

    # Record the data file paths as run parameters
    params = {
        'train-data-path': './data/green_tripdata_2021-01.parquet',
        'validation-data-path': './data/green_tripdata_2021-02.parquet',
    }
    mlflow.log_params(params)

    # Attach the pickled preprocessor to the run
    mlflow.log_artifact('models/preprocessor.b', 'preprocessor')

    # Fit a gradient boosting regressor on the training matrix
    gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)
    gbr.fit(X_train, y_train)

    # Score the model on the validation split and log the RMSE
    y_pred = gbr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    print({'Run ID': run.info.run_id, 'rmse': rmse})



{'Run ID': '668dbd3cd12547428e8fd65c5b4ae3d6', 'rmse': 6.742303328497426}


3. Extra Trees Regressor

In [16]:
# Turn on scikit-learn autologging, with dataset logging switched off
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run() as run:

    # Tag the run with the developer's name
    mlflow.set_tags({'developer': 'farelyue'})

    # Record the data file paths as run parameters
    params = {
        'train-data-path': './data/green_tripdata_2021-01.parquet',
        'validation-data-path': './data/green_tripdata_2021-02.parquet',
    }
    mlflow.log_params(params)

    # Attach the pickled preprocessor to the run
    mlflow.log_artifact('models/preprocessor.b', 'preprocessor')

    # Fit an extra-trees regressor on the training matrix
    etr = ExtraTreesRegressor(random_state=RANDOM_STATE)
    etr.fit(X_train, y_train)

    # Score the model on the validation split and log the RMSE
    y_pred = etr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    print({'Run ID': run.info.run_id, 'rmse': rmse})



{'Run ID': '794703933ee54ea79264b8d27cea1959', 'rmse': 6.940426720605602}


4. Linear SVM Regressor (LinearSVR)

In [17]:
# Turn on scikit-learn autologging, with dataset logging switched off
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run() as run:

    # Tag the run with the developer's name
    mlflow.set_tags({'developer': 'farelyue'})

    # Record the data file paths as run parameters
    params = {
        'train-data-path': './data/green_tripdata_2021-01.parquet',
        'validation-data-path': './data/green_tripdata_2021-02.parquet',
    }
    mlflow.log_params(params)

    # Attach the pickled preprocessor to the run
    mlflow.log_artifact('models/preprocessor.b', 'preprocessor')

    # Fit a linear support vector regressor on the training matrix
    # NOTE(review): the RMSE recorded for this run (~808) is far above the
    # other models' (~7) -- presumably a convergence or feature-scaling
    # problem hidden by the global warnings filter; TODO confirm.
    svr = LinearSVR(random_state=RANDOM_STATE)
    svr.fit(X_train, y_train)

    # Score the model on the validation split and log the RMSE
    y_pred = svr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    print({'Run ID': run.info.run_id, 'rmse': rmse})



{'Run ID': '47e45a45a4e441afa61e6e23cc7624c2', 'rmse': 807.9904772805995}


5. XGBoost Regressor

In [23]:
# Turn on scikit-learn autologging, with dataset logging switched off
mlflow.sklearn.autolog(log_datasets=False)

with mlflow.start_run() as run:

    # Tag the run with the developer's name and the estimator's class name
    mlflow.set_tags({
        'developer': 'farelyue',
        'estimator_name': 'XGBRegressor',
    })

    # Record the data file paths as run parameters
    params = {
        'train-data-path': './data/green_tripdata_2021-01.parquet',
        'validation-data-path': './data/green_tripdata_2021-02.parquet',
    }
    mlflow.log_params(params)

    # Attach the pickled preprocessor to the run
    mlflow.log_artifact('models/preprocessor.b', 'preprocessor')

    # Fit an XGBoost regressor on the training matrix
    xgbr = xgb.XGBRegressor(random_state=RANDOM_STATE)
    xgbr.fit(X_train, y_train)

    # Hyperparameters and the model are logged by hand here
    # (presumably because sklearn autologging does not capture
    # xgboost's estimator -- TODO confirm)
    mlflow.log_params(xgbr.get_params())
    mlflow.sklearn.log_model(xgbr, artifact_path='model')

    # Score the model on the validation split and log the RMSE
    y_pred = xgbr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    print({'Run ID': run.info.run_id, 'rmse': rmse})



{'Run ID': 'b09242a9964c4965bc8baa572bc2103b', 'rmse': 6.644320289321216}
