In [2]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.9.24


In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
import pickle
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from pathlib import Path

  import pkg_resources


In [4]:
# MLflow configuration: tracking data goes to the SQLite URI below, and runs
# are grouped under the named experiment.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the notebook displays the returned Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/10/27 18:41:29 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/27 18:41:29 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/03-/mlruns/1', creation_time=1761580846153, experiment_id='1', last_update_time=1761580846153, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [5]:
def read_dataframe(filename, sample_frac=0.1, random_state=2):
    """Load a taxi-trip parquet file and prepare a sampled frame for modeling.

    Steps performed:
      1. Read the parquet file.
      2. Add a ``duration`` column (minutes) computed from
         ``tpep_dropoff_datetime - tpep_pickup_datetime``.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Draw a reproducible random sample of the remaining rows.
      5. Cast ``PULocationID`` / ``DOLocationID`` to ``str`` and add the
         combined ``PU_DO`` column (``"<PULocationID>_<DOLocationID>"``).

    Parameters
    ----------
    filename : str or path-like
        Location passed straight to ``pd.read_parquet``.
    sample_frac : float, default 0.1
        Fraction of the filtered rows to keep.
    random_state : int, default 2
        Seed for the sampling step, so repeated calls return the same rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with the added ``duration`` and ``PU_DO`` columns.
    """
    df = pd.read_parquet(filename)

    # Trip duration in minutes. The vectorised `.dt` accessor replaces a
    # per-row Python lambda, which is slow on frames with millions of trips.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep only trips that last at least 1 minute and at most 60 minutes.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]

    # Reduce the dataset for performance; the model does not need every trip.
    df_sample = df.sample(frac=sample_frac, random_state=random_state)

    # Location ids are cast to strings so they can be concatenated below.
    categorical = ['PULocationID', 'DOLocationID']
    df_sample[categorical] = df_sample[categorical].astype(str)

    # Combined pickup/drop-off feature.
    df_sample['PU_DO'] = df_sample['PULocationID'] + "_" + df_sample['DOLocationID']

    return df_sample

In [6]:
# Remote copies of the same months, previously loaded through read_dataframe():
#   https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet
#   https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet
#
# NOTE(review): the next cell reads the 'PU_DO' and 'duration' columns, which
# read_dataframe() creates. These local files are read with plain
# pd.read_parquet, so presumably they were saved already preprocessed --
# confirm; otherwise they need to go through read_dataframe().

# January 2024 is the training month, February 2024 the validation month.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../02-experiment-tracking/data/yellow_tripdata_2024-01.parquet",
        "../02-experiment-tracking/data/yellow_tripdata_2024-02.parquet",
    )
)

In [7]:
# Column configuration for the model inputs and the regression target.
categorical = ['PU_DO']  # alternative: ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
target = 'duration'
feature_columns = categorical + numerical

# Targets as arrays.
y_train = df_train[target].values
y_val = df_val[target].values

# One record (dict) per trip, restricted to the feature columns.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Local output directory for serialized artifacts (relative to the working
# directory). An already existing directory is left untouched.
models_folder = Path("models")
models_folder.mkdir(parents=False, exist_ok=True)

In [9]:
# Train XGBoost with the tuned hyper-parameters and record everything in a
# single MLflow run: parameters, validation RMSE, the fitted DictVectorizer
# (as a pickled artifact) and the booster itself.
with mlflow.start_run(run_name="best xgboost with preprocessor"):

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        "max_depth": 54,
        "learning_rate": 0.838560298111564,
        "reg_alpha": 0.07217623016548881,
        "reg_lambda": 0.0038449610359954913,
        "min_child_weight": 9.039435151170704,
        # "reg:linear" is the deprecated alias of squared-error regression;
        # "reg:squarederror" is the current name for the same objective.
        "objective": "reg:squarederror",
        "seed": 42,
    }

    mlflow.log_params(best_params)

    # NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
    # (30), so early stopping should never trigger here -- confirm intent.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer so the same preprocessing can be reloaded
    # together with the model. The path is built from `models_folder` instead
    # of repeating the "models/..." literal.
    preprocessor_path = models_folder / "preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, name="models_mlflow")



[0]	validation-rmse:5.61749
[1]	validation-rmse:5.34707
[2]	validation-rmse:5.30852
[3]	validation-rmse:5.29391
[4]	validation-rmse:5.28724
[5]	validation-rmse:5.28285
[6]	validation-rmse:5.27766
[7]	validation-rmse:5.27338
[8]	validation-rmse:5.26896
[9]	validation-rmse:5.26424
[10]	validation-rmse:5.25982
[11]	validation-rmse:5.25585
[12]	validation-rmse:5.25150
[13]	validation-rmse:5.24819
[14]	validation-rmse:5.24525
[15]	validation-rmse:5.24189
[16]	validation-rmse:5.23422
[17]	validation-rmse:5.23044
[18]	validation-rmse:5.22809
[19]	validation-rmse:5.22604
[20]	validation-rmse:5.22313
[21]	validation-rmse:5.22146
[22]	validation-rmse:5.21987
[23]	validation-rmse:5.21504
[24]	validation-rmse:5.21372
[25]	validation-rmse:5.21217
[26]	validation-rmse:5.21133
[27]	validation-rmse:5.21046
[28]	validation-rmse:5.20968
[29]	validation-rmse:5.20916




In [None]:
# NOTE(review): this cell repeats the training run of the previous cell; the
# only difference is that the run object is captured so its id can be printed.
# Executing both cells logs two separate MLflow runs with the same name.
#
# Train XGBoost with the tuned hyper-parameters and record everything in a
# single MLflow run: parameters, validation RMSE, the fitted DictVectorizer
# (as a pickled artifact) and the booster itself.
with mlflow.start_run(run_name="best xgboost with preprocessor") as run:

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        "max_depth": 54,
        "learning_rate": 0.838560298111564,
        "reg_alpha": 0.07217623016548881,
        "reg_lambda": 0.0038449610359954913,
        "min_child_weight": 9.039435151170704,
        # "reg:linear" is the deprecated alias of squared-error regression;
        # "reg:squarederror" is the current name for the same objective.
        "objective": "reg:squarederror",
        "seed": 42,
    }

    mlflow.log_params(best_params)

    # NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
    # (30), so early stopping should never trigger here -- confirm intent.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer so the same preprocessing can be reloaded
    # together with the model. The path is built from `models_folder` instead
    # of repeating the "models/..." literal.
    preprocessor_path = models_folder / "preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, name="models_mlflow")

    # Printed so the run id can be copied for later use.
    print(run.info.run_id)