In [1]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.7.0 (from mlflow)
  Downloading mlflow_skinny-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.7.0 (from mlflow)
  Downloading mlflow_tracing-3.7.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.17.2-py3-none-any.whl.metadata (7.2 kB)
Collecting cryptography<47,>=43.0.0 (from mlflow)
  Using cached cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)


In [2]:
import mlflow

# Report which MLflow release the kernel actually imported.
mlflow_version = mlflow.__version__
print('MLflow version:', mlflow_version)

MLflow version: 3.7.0


In [9]:
from pathlib import Path

# Create the raw-data directory in pure Python rather than shelling out, so
# the cell does not depend on a POSIX `mkdir` being available. With
# exist_ok=True this is a no-op when the directory already exists.
Path("data").mkdir(parents=True, exist_ok=True)


In [11]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: wget
  Building wheel for wget (pyproject.toml) ... done
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9685 sha256=dc3b4a8da626796338d89f5919bc6311af0a4864fc99a20a449d2a751190017f
  Stored in directory: /home/codespace/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
import wget
from pathlib import Path

base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"

files = [
    "green_tripdata_2023-01.parquet",
    "green_tripdata_2023-02.parquet",
    "green_tripdata_2023-03.parquet",
]

data_dir = Path("data")

# Download each monthly file into data/. Files that are already present are
# skipped so re-running the cell is idempotent: without the guard every
# re-run fetches the data again and stores it next to the existing copy
# under a different name.
for filename in files:
    url = base_url + filename
    target = data_dir / filename
    if target.exists():
        print("Already downloaded, skipping:", target)
        continue
    print("Downloading:", url)
    wget.download(url, out=str(data_dir))
    print()


Downloading: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet

Downloading: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet

Downloading: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet



In [13]:
# Run the preprocessing script in a subshell, passing `data` as the raw-data
# path and `output` as the destination path.
!python preprocess_data.py --raw_data_path data --dest_path output


Saved 4 files to: output


In [14]:
import os

print("Output folder files:")
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable across re-runs and machines.
sorted(os.listdir("output"))


Output folder files:


['val.pkl', 'test.pkl', 'dv.pkl', 'train.pkl']

In [15]:
import mlflow
import mlflow.sklearn
import pickle
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Locations of the pickled artifacts inside the output folder
output_path = Path('./output')
train_path = output_path.joinpath('train.pkl')
val_path = output_path.joinpath('val.pkl')
dv_path = output_path.joinpath('dv.pkl')

print('Loading:', train_path, val_path, dv_path)

# The train and validation pickles each unpack into a (features, target) pair.
X_train, y_train = load_pickle(train_path)
X_val, y_val = load_pickle(val_path)
dv = load_pickle(dv_path)

print('Shapes:')
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_val  :', X_val.shape, 'y_val  :', y_val.shape)

Loading: output/train.pkl output/val.pkl output/dv.pkl
Shapes:
X_train: (65946, 5702) y_train: (65946,)
X_val  : (62574, 5702) y_val  : (62574,)


In [16]:
# Make this experiment the active one; the returned Experiment object is the
# cell's displayed output.
experiment_name = 'random-forest-experiment'
mlflow.set_experiment(experiment_name)

2025/12/10 17:32:44 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/10 17:32:44 INFO mlflow.store.db.utils: Updating database tables
2025/12/10 17:32:44 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/10 17:32:44 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/10 17:32:44 INFO alembic.runtime.migration: Running 

<Experiment: artifact_location='/workspaces/github_mlops_bootcamp/2-expeniment-tracking/mlruns/1', creation_time=1765387964824, experiment_id='1', last_update_time=1765387964824, lifecycle_stage='active', name='random-forest-experiment', tags={}>

In [17]:
from math import sqrt

# Turn on automatic scikit-learn logging before the model is fitted.
mlflow.sklearn.autolog()

# Hyperparameters passed to the random forest.
params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=0,
    min_samples_split=2,
    n_jobs=-1,
)

with mlflow.start_run():
    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    # Validation RMSE is the square root of the mean squared error.
    y_pred = rf.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = sqrt(mse)

    # Log the validation metric explicitly, alongside whatever autolog records.
    mlflow.log_metric('rmse', rmse)

    print('Validation RMSE:', rmse)


Validation RMSE: 5.431162180141208
