# Baseline model for batch monitoring example


Install packages


In [5]:
!uv pip install -q \
    mlflow==3.7.0 \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1 \
    xgboost==3.1.2 \
    pyarrow==22.0.0 \
    evidently==0.7.20 \
    joblib==1.5.3 \
    prefect==3.6.11 \
    psycopg==3.3.2 \
    psycopg-binary==3.3.2 \
    requests==2.32.5 \
    tqdm==4.67.1


Append notebooks directory to sys.path


In [None]:
import sys

sys.path.append("../../..")

Import packages


In [None]:
import datetime
import pathlib

import pandas as pd
import requests
from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from tqdm import tqdm

Create data directory


In [None]:
BASE_PATH = pathlib.Path("../../machine-learning")
DATA_DIR = BASE_PATH / "data/taxi-trip-duration"
OUTPUT_DIR = BASE_PATH / "artifacts/taxi-trip-duration"

DATA_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

Download datasets


In [18]:
files = [
    ("green_tripdata_2022-02.parquet", DATA_DIR),
    ("green_tripdata_2022-01.parquet", DATA_DIR),
]

for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    response = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(
            response.iter_content(),
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=int(response.headers["Content-Length"]),
        ):
            handle.write(data)

green_tripdata_2022-02.parquet: 100%|██████████| 1428262/1428262 [00:26<00:00, 54304.28it/s, save to ../../machine-learning/data/taxi-trip-duration/green_tripdata_2022-02.parquet]
green_tripdata_2022-01.parquet: 100%|██████████| 1254291/1254291 [00:22<00:00, 55353.55it/s, save to ../../machine-learning/data/taxi-trip-duration/green_tripdata_2022-01.parquet]
