# NYC Taxi Trip Duration Prediction

This notebook demonstrates an end-to-end machine learning workflow for predicting taxi trip durations using NYC Yellow Taxi data. The workflow includes:

1. Data loading and preprocessing of NYC taxi trip data
2. Feature engineering and data cleaning
3. Training multiple regression models (Linear Regression, XGBoost)
4. Model evaluation using metrics like RMSE and R²
5. Model tracking and versioning with MLflow
6. Model registration and deployment preparation

The notebook showcases MLOps best practices including experiment tracking, model versioning, and reproducible workflows.


In [12]:
import pandas as pd # working with tabular data
import pickle # for machine learning models
import seaborn as sns # visualization
import matplotlib.pyplot as plt # visualization

from sklearn.feature_extraction import DictVectorizer # Machine Learning
from sklearn.linear_model import LinearRegression # Machine Learning
from sklearn.linear_model import Lasso # Regularization
from sklearn.linear_model import Ridge # Regularization

from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error # Loss Function
from mlflow.models import infer_signature

In [2]:
import mlflow

# to hook up with MLFlow UI
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlops_nyc_taxi") # choose a name for your experiment

<Experiment: artifact_location='/home/chogerlate/Documents/github/cpe393/cpe393-mlflow/mlruns/1', creation_time=1743007869912, experiment_id='1', last_update_time=1743007869912, lifecycle_stage='active', name='mlops_nyc_taxi', tags={}>

In [3]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet




7[1A[1G[27G[Files: 0  Bytes: 0  [0 B/s] Re]87[2A[1G[27G[https://d37ci6vzurychx.cloudfr]87[3A[1G[27G[https://d37ci6vzurychx.cloudfr]87[1S[4A[1G[0JSaving 'yellow_tripdata_2022-01.parquet.1'
87[1S[4A[1G[0JSaving 'yellow_tripdata_2022-02.parquet.1'

In [3]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [4]:
df_train = read_dataframe('yellow_tripdata_2022-01.parquet')
df_val = read_dataframe('yellow_tripdata_2022-02.parquet')

In [5]:
len(df_train), len(df_val)

(2421440, 2918187)

In [6]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [13]:
with mlflow.start_run():
    mlflow.set_tag("model", "linear_regression")
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    
    mlflow.log_metric("rmse", rmse)
    
    signature = infer_signature(X_val, y_pred)
    
    mlflow.sklearn.log_model(
        lr, 
        artifact_path="models_mlflow",
        signature=signature,
        input_example=X_val[:5],
        registered_model_name="linear-regression-model"
    )
    
    with open("preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    
    mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")
    
    with open('./lin_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)
    
    mlflow.log_artifact("lin_reg.bin", artifact_path="models_pickle")

Successfully registered model 'linear-regression-model'.
Created version '1' of model 'linear-regression-model'.


In [15]:
# import required modules
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials # some methods to optimize hyperparameters
from hyperopt.pyll import scope

In [18]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, -0.5),
    'reg_alpha': hp.loguniform('reg_alpha', -5, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -5, 0),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 2),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'objective': 'reg:squarederror',
    'seed': 42
}

def objective(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        
        train = xgb.DMatrix(X_train, label=y_train)
        valid = xgb.DMatrix(X_val, label=y_val)
        
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=50,
            evals=[(valid, 'validation')],
            early_stopping_rounds=5,
            verbose_eval=False
        )
        
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        return {'loss': rmse, 'status': STATUS_OK, 'model': booster}

trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=trials
)

best_params = {
    'learning_rate': float(best_result['learning_rate']),
    'max_depth': int(best_result['max_depth']),
    'min_child_weight': float(best_result['min_child_weight']),
    'reg_alpha': float(best_result['reg_alpha']),
    'reg_lambda': float(best_result['reg_lambda']),
    'objective': 'reg:squarederror',
    'seed': 42
}

print(best_params)

with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)
    
    mlflow.log_params(best_params)
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5
    )
    
    y_pred = booster.predict(valid)
    signature = infer_signature(X_val, y_pred)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    with open("preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    
    mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow", signature=signature)


100%|██████████| 3/3 [04:37<00:00, 92.56s/trial, best loss: 5.255648033980309]
{'learning_rate': 0.26209557267353656, 'max_depth': 7, 'min_child_weight': 1.0521494072255497, 'reg_alpha': 0.02283373056043026, 'reg_lambda': 0.8489942403027806, 'objective': 'reg:squarederror', 'seed': 42}
[0]	validation-rmse:8.14535
[1]	validation-rmse:7.11355
[2]	validation-rmse:6.44971
[3]	validation-rmse:6.03123
[4]	validation-rmse:5.77029
[5]	validation-rmse:5.60739
[6]	validation-rmse:5.50314
[7]	validation-rmse:5.43633
[8]	validation-rmse:5.39175
[9]	validation-rmse:5.36122
[10]	validation-rmse:5.33969
[11]	validation-rmse:5.32524
[12]	validation-rmse:5.31472
[13]	validation-rmse:5.30596
[14]	validation-rmse:5.30016
[15]	validation-rmse:5.29501
[16]	validation-rmse:5.29353
[17]	validation-rmse:5.29237
[18]	validation-rmse:5.29112
[19]	validation-rmse:5.28758
[20]	validation-rmse:5.28616
[21]	validation-rmse:5.28510
[22]	validation-rmse:5.28384
[23]	validation-rmse:5.28050
[24]	validation-rmse:5.2795

  xgb_model.save_model(model_data_path)
