In [1]:
import mlflow
from mlflow.tracking import MlflowClient

In [2]:
# Create a client with previous databse as uri

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Inspect experiment stored in uri
experiments = client.search_experiments()

2025/10/24 23:35:08 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/24 23:35:08 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


In [3]:
experiments

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1761318364169, experiment_id='2', last_update_time=1761318364169, lifecycle_stage='active', name='my_new_experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1761241360745, experiment_id='1', last_update_time=1761241360745, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/0', creation_time=1761241360735, experiment_id='0', last_update_time=1761241360735, lifecycle_stage='active', name='Default', tags={}>]

In [4]:
# Create a new experiment

client.create_experiment("my_new_experiment")

MlflowException: Experiment(name=my_new_experiment) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(sqlite3.IntegrityError) UNIQUE constraint failed: experiments.name
[SQL: INSERT INTO experiments (name, artifact_location, lifecycle_stage, creation_time, last_update_time) VALUES (?, ?, ?, ?, ?)]
[parameters: ('my_new_experiment', None, 'active', 1761348908907, 1761348908907)]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [None]:
client.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1761318364169, experiment_id='2', last_update_time=1761318364169, lifecycle_stage='active', name='my_new_experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1761241360745, experiment_id='1', last_update_time=1761241360745, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/0', creation_time=1761241360735, experiment_id='0', last_update_time=1761241360735, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
# Search models in a run

from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

runs

[<Run: data=<RunData: metrics={'rmse': 5.217010818408646}, params={'learning_rate': '0.8287838928754262',
  'max_depth': '24',
  'min_child_weight': '3.11679525074948',
  'objective': 'reg:linear',
  'reg_alpha': '0.05402893714004757',
  'reg_lambda': '0.26367786361882845',
  'seed': '42'}, tags={'mlflow.runName': 'clean-squid-531',
  'mlflow.source.name': '/home/codespace/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'codespace',
  'model': 'xgboost 100 round'}>, info=<RunInfo: artifact_uri='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/02ebb32033bf4aa88a8011a614ad07f5/artifacts', end_time=1761257046946, experiment_id='1', lifecycle_stage='active', run_id='02ebb32033bf4aa88a8011a614ad07f5', run_name='clean-squid-531', start_time=1761257006113, status='FINISHED', user_id='codespace'>, inputs=<RunInputs: dataset_inputs=[], model_inputs=[]>, outputs=None>,
 <Run: data=<RunData: metrics={'

In [None]:
for run in runs:
    try:
        print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")
    except:
        pass

run id: 02ebb32033bf4aa88a8011a614ad07f5, rmse: 5.2170
run id: b9bb04150f624b2d995418627943950d, rmse: 5.2260
run id: c7c8860965cc44b386739e0d988eed9f, rmse: 5.2316
run id: e7490249027b49148be6ef4cd98f9bb2, rmse: 5.2528
run id: 173d7adbecfe4e28b71dd943730d216c, rmse: 5.2612


In [None]:
# Filter use

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 5.24",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

for run in runs:
    try:
        print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")
    except:
        pass

run id: 02ebb32033bf4aa88a8011a614ad07f5, rmse: 5.2170
run id: b9bb04150f624b2d995418627943950d, rmse: 5.2260
run id: c7c8860965cc44b386739e0d988eed9f, rmse: 5.2316


In [5]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [15]:
run_id = "6d45753a75544988989ab211cb6bd026"
model_uri = f"runs:/{run_id}/models_mlflow"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '1' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1761349563278, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1761349563278, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='6d45753a75544988989ab211cb6bd026', run_link=None, source='models:/m-1389733a56af4cad8dc6139c007d5fea', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [23]:
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1761349020422, deployment_job_id=None, deployment_job_state=None, description=None, last_updated_timestamp=1761349563278, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1761349563278, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1761349563278, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='6d45753a75544988989ab211cb6bd026', run_link=None, source='models:/m-1389733a56af4cad8dc6139c007d5fea', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='nyc-taxi-regressor', tags={}>,
 <RegisteredModel: aliases={}, creation_timestamp=1761312918801, deployment_job_id=None, deployment_job_state=None, description='', last_updated_timestamp=1761314590054, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1761312919108, current_stage='Staging', deployment_job_state=None, description='', last_updated_timestamp=1761314590054, m

In [None]:
model_name = "nyc-taxi-regressor"

latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: None


  latest_versions = client.get_latest_versions(name=model_name)


In [None]:
model_version = 1
new_stage = "Staging"

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1761349563278, current_stage='Staging', deployment_job_state=None, description=None, last_updated_timestamp=1761350154982, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='6d45753a75544988989ab211cb6bd026', run_link=None, source='models:/m-1389733a56af4cad8dc6139c007d5fea', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [34]:
from datetime import datetime

date = datetime.today().date()

client.update_model_version(
    name=model_name,
    version=1,
    description=f"Model version {model_version} was transiotined to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1761349563278, current_stage='Staging', deployment_job_state=None, description='Model version 1 was transiotined to Staging on 2025-10-25', last_updated_timestamp=1761350400160, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='6d45753a75544988989ab211cb6bd026', run_link=None, source='models:/m-1389733a56af4cad8dc6139c007d5fea', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [45]:
from sklearn.metrics import root_mean_squared_error
import pandas as pd

def read_dataframe(filename):
    
    df = pd.read_parquet(filename)

    # Calculate trip duration in minutes

    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60) 

    # Keep only trips that at least 1 minute and at most 60

    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]

    # Reduce dataset for perfomance. The model won't require so many data points

    df_sample = df.sample(frac=0.1, random_state=2)
    df_sample.shape

    # Features for modeling

    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']

    df_sample[categorical] = df_sample[categorical].astype(str)
    
    df_sample['PU_DO'] = df_sample['PULocationID'] + "_" + df_sample['DOLocationID']
    
    return df_sample

def preprocess(df, dv):
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}

In [40]:
df = read_dataframe("data/yellow_tripdata_2024-03.parquet")

In [41]:
client.download_artifacts(run_id="6d45753a75544988989ab211cb6bd026", path="preprocessor", dst_path=".")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp/02-experiment-tracking/preprocessor'

In [43]:
import pickle

with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [46]:
X_test = preprocess(df, dv)

In [47]:
target = "duration"
y_test = df[target].values

In [48]:
%time test_model(name=model_name, stage=new_stage, X_test=X_test, y_test=y_test)

CPU times: user 641 ms, sys: 339 ms, total: 981 ms
Wall time: 5.58 s


{'rmse': 5.5999664144183035}

In [49]:
client.transition_model_version_stage(
    name=model_name,
    version=1,
    stage="Production",
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1761349563278, current_stage='Production', deployment_job_state=None, description='Model version 1 was transiotined to Staging on 2025-10-25', last_updated_timestamp=1761351745058, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='6d45753a75544988989ab211cb6bd026', run_link=None, source='models:/m-1389733a56af4cad8dc6139c007d5fea', status='READY', status_message=None, tags={}, user_id=None, version=1>