# Unit 2: Experiment Tracking Homework

In [11]:
# imports 
import os

## Q1. Install MLflow

In [1]:
import mlflow

# Show which MLflow release is installed in this environment.
mlflow_version_line = f"MLflow version: {mlflow.__version__}"
print(mlflow_version_line)

MLflow version: 2.13.0


## Q2. Download and preprocess the data

In [2]:
# Copy the preprocess_data.py script from the 2024 cohort's homework directory
# (path is relative to this notebook's folder) into the homework folder.
!cp  ../../cohorts/2024/02-experiment-tracking/homework/preprocess_data.py preprocess_data.py
# List the working directory to confirm the script was copied.
!ls

homework-02.ipynb  preprocess_data.py


In [3]:
# Create the folder that will hold the raw taxi data to preprocess.
# `-p` keeps this cell re-runnable: no error if the folder already exists.
!mkdir -p taxi_data_folder

In [9]:
# Download taxi data
!wget -P taxi_data_folder/ https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
!wget -P taxi_data_folder/ https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
!wget -P taxi_data_folder/ https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet

--2024-05-25 12:59:30--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolviendo d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 99.84.245.157, 99.84.245.193, 99.84.245.9, ...
Conectando con d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)[99.84.245.157]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 1427002 (1,4M) [binary/octet-stream]
Guardando como: ‘taxi_data_folder/green_tripdata_2023-01.parquet’


2024-05-25 12:59:31 (1,65 MB/s) - ‘taxi_data_folder/green_tripdata_2023-01.parquet’ guardado [1427002/1427002]

--2024-05-25 12:59:32--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolviendo d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 99.84.245.193, 99.84.245.141, 99.84.245.9, ...
Conectando con d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)[99.84.245.193]:443... conectado.
Petición HTTP enviada, esperando respuesta..

In [10]:
# Run preprocess_data.py, passing taxi_data_folder/ as the raw-data location
# (--raw_data_path) and ./processed_data as the destination (--dest_path).
!python3 preprocess_data.py --raw_data_path taxi_data_folder/ --dest_path ./processed_data 

In [13]:
# Folder given to preprocess_data.py as its --dest_path.
processed_data_folder = 'processed_data'
# Build the message from the variable (!r adds the surrounding quotes) so the
# reported folder name always matches the folder that is actually listed.
print(f"Number of files saved to {processed_data_folder!r} folder: {len(os.listdir(processed_data_folder))}")


Number of files saved to 'processed_data' folder: 4


## Q3. Train a model with autolog

In [14]:
# Copy the train.py script from the 2024 cohort's homework directory
# (path is relative to this notebook's folder) into the homework folder.
!cp ../../cohorts/2024/02-experiment-tracking/homework/train.py train.py
# List the working directory to confirm the script was copied.
!ls

homework-02.ipynb   processed_data    train.py
preprocess_data.py  taxi_data_folder


Edit `train.py` script to add MLflow tracking

In [22]:
# Run the train script with the MLflow changes.
# `$processed_data_folder` is expanded by IPython to the value of the Python
# variable of that name before the command is handed to the shell.
!python3 train.py --data_path $processed_data_folder



In [23]:
# Check in the MLflow UI whether the experiment was tracked.
# NOTE: this command keeps the cell busy until it is interrupted (see the ^C in
# the captured output); stop it before running the following cells.
!mlflow ui --backend-store-uri sqlite:///mlflow.db

[2024-05-25 14:22:09 -0300] [11698] [INFO] Starting gunicorn 22.0.0
[2024-05-25 14:22:09 -0300] [11698] [INFO] Listening at: http://127.0.0.1:5000 (11698)
[2024-05-25 14:22:09 -0300] [11698] [INFO] Using worker: sync
[2024-05-25 14:22:09 -0300] [11699] [INFO] Booting worker with pid: 11699
[2024-05-25 14:22:09 -0300] [11700] [INFO] Booting worker with pid: 11700
[2024-05-25 14:22:09 -0300] [11701] [INFO] Booting worker with pid: 11701
[2024-05-25 14:22:09 -0300] [11702] [INFO] Booting worker with pid: 11702
^C
[2024-05-25 14:24:01 -0300] [11698] [INFO] Handling signal: int
[2024-05-25 14:24:02 -0300] [11699] [INFO] Worker exiting (pid: 11699)
[2024-05-25 14:24:02 -0300] [11700] [INFO] Worker exiting (pid: 11700)
[2024-05-25 14:24:02 -0300] [11701] [INFO] Worker exiting (pid: 11701)
[2024-05-25 14:24:02 -0300] [11702] [INFO] Worker exiting (pid: 11702)


The `min_samples_split` parameter is set to 2.

## Q4. Launch the tracking server locally

In [25]:
# Create the folder used as the tracking server's default artifact root.
# `-p` makes the cell safe to re-run if the folder already exists.
!mkdir -p artifacts/ 

Command to run the tracking server with a SQLite backend store and a local folder for artifacts (run it from the `02-experiment-tracking/homework/` directory):

```bash
mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts 
```

## Q5. Tune model hyperparameters

In [26]:
# Copy the hpo.py script from the 2024 cohort's homework directory
# (path is relative to this notebook's folder) into the homework folder.
!cp ../../cohorts/2024/02-experiment-tracking/homework/hpo.py hpo.py
# List the working directory to confirm the script was copied.
!ls

artifacts	   hpo.py     mlruns		  processed_data    train.py
homework-02.ipynb  mlflow.db  preprocess_data.py  taxi_data_folder


In [36]:
# First, make sure that the MLflow server is running, and then run hpo.py.
# `$processed_data_folder` is expanded by IPython from the Python variable.
!python3 hpo.py --data_path $processed_data_folder

2024/05/25 18:40:42 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.















100%|██████████| 15/15 [02:08<00:00,  8.58s/trial, best loss: 5.335419588556921]


Best validation RMSE for this run is: 5.335

## Q6. Promote the best model to the model registry

In [28]:
# Copy the register_model.py script from the 2024 cohort's homework directory
# (path is relative to this notebook's folder) into the homework folder.
!cp ../../cohorts/2024/02-experiment-tracking/homework/register_model.py register_model.py
# List the working directory to confirm the script was copied.
!ls

artifacts	   mlflow.db	       processed_data	  train.py
homework-02.ipynb  mlruns	       register_model.py
hpo.py		   preprocess_data.py  taxi_data_folder


These cells correspond to the exploratory code used to work out the changes needed in the `register_model.py` script.

In [45]:
from mlflow import MlflowClient
from mlflow.entities import ViewType
# Point MLflow at the tracking server on localhost port 5000
# (presumably the `mlflow server` started in Q4 — confirm it is running first).
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [38]:
# Client for querying the tracking server set via mlflow.set_tracking_uri.
client = MlflowClient()
# get_experiment_by_name returns None when no experiment has this name.
experiment = client.get_experiment_by_name("random-forest-hyperopt")
# Fail here with a clear message instead of an AttributeError in later cells.
assert experiment is not None, "Experiment 'random-forest-hyperopt' not found; run hpo.py first"

In [44]:
# Dump every attribute stored on the experiment entity, one per line.
experiment_attrs = vars(experiment)
for attr_name in experiment_attrs:
    print(f"'{attr_name}' -> '{experiment_attrs[attr_name]}'")

'_experiment_id' -> '2'
'_name' -> 'random-forest-hyperopt'
'_artifact_location' -> '/home/dpaez/freelance/mlops-datatalksclub/mlops-zoomcamp/02-experiment-tracking/homework/artifacts/2'
'_lifecycle_stage' -> 'active'
'_tags' -> '{}'
'_creation_time' -> '1716673242388'
'_last_update_time' -> '1716673242388'


In [46]:
# Number of best runs to retrieve.
top_n = 5

# Fetch the top_n active runs of the experiment, lowest RMSE first.
runs = client.search_runs(
    # The documented argument type is a list of experiment IDs, so wrap the
    # single ID explicitly instead of relying on a bare string being accepted.
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=top_n,
    order_by=["metrics.rmse ASC"]
)

In [49]:
# Print every attribute of each retrieved run, one per line.
for tracked_run in runs:
    for attr_name, attr_value in vars(tracked_run).items():
        print(f"'{attr_name}' -> '{attr_value}'")

'_info' -> '<RunInfo: artifact_uri='/home/dpaez/freelance/mlops-datatalksclub/mlops-zoomcamp/02-experiment-tracking/homework/artifacts/2/b4e71f99c1f5474e9820bcdddf5d696c/artifacts', end_time=1716673346661, experiment_id='2', lifecycle_stage='active', run_id='b4e71f99c1f5474e9820bcdddf5d696c', run_name='luminous-bat-877', run_uuid='b4e71f99c1f5474e9820bcdddf5d696c', start_time=1716673339457, status='FINISHED', user_id='dpaez'>'
'_data' -> '<RunData: metrics={'rmse': 5.335419588556921}, params={'max_depth': '19',
 'min_samples_leaf': '2',
 'min_samples_split': '2',
 'n_estimators': '11',
 'random_state': '42'}, tags={'mlflow.runName': 'luminous-bat-877',
 'mlflow.source.git.commit': 'd45f2842655eb720b9d8aeba8e07ff1e037d3f55',
 'mlflow.source.name': 'hpo.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'dpaez',
 'model': 'RandomForestRegressor'}>'
'_inputs' -> '<RunInputs: dataset_inputs=[]>'
'_info' -> '<RunInfo: artifact_uri='/home/dpaez/freelance/mlops-datatalksclub/mlops-zoomcamp/

Run the updated `register_model.py` script to evaluate the best models and promote the best one to the model registry.

In [51]:
# `$processed_data_folder` is expanded by IPython from the Python variable.
!python3 register_model.py --data_path $processed_data_folder

2024/05/25 19:03:19 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'random-forest-champion'.
2024/05/25 19:05:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-champion, version 1
Created version '1' of model 'random-forest-champion'.


In [55]:
# Retrieve the registered model.
model_name = 'random-forest-champion'
# Look the model up through model_name rather than repeating the literal, so the
# name only has to be changed in one place.
registered_model = client.get_registered_model(model_name)
# Last expression of the cell: displays the RegisteredModel repr.
registered_model

<RegisteredModel: aliases={}, creation_timestamp=1716674715121, description='', last_updated_timestamp=1716674715244, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1716674715244, current_stage='None', description='', last_updated_timestamp=1716674715244, name='random-forest-champion', run_id='0018f06784a142018e890bb1a6f5a953', run_link='', source='/home/dpaez/freelance/mlops-datatalksclub/mlops-zoomcamp/02-experiment-tracking/homework/artifacts/3/0018f06784a142018e890bb1a6f5a953/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>], name='random-forest-champion', tags={}>

In [64]:
# Latest version(s) of the registered model. Use the public `latest_versions`
# property rather than the private `_latest_version` attribute, which is an
# implementation detail of RegisteredModel.
latest_version = registered_model.latest_versions

# Fetch the run that produced the first listed version and read its test metric.
champion_model_run = client.get_run(latest_version[0].run_id)
best_test_rmse = champion_model_run.data.metrics["test_rmse"]

In [66]:
# Report the champion model's test RMSE rounded to three decimals.
rmse_report = f"The test RMSE of the best model is: {best_test_rmse:.3f}"
print(rmse_report)

The test RMSE of the best model is: 5.567
