In [1]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

## Q1. What's the version that you have?

In [2]:
print('The version of my mlflow is:', mlflow.__version__)

The version of my mlflow is: 1.26.1


## Q2. How many files were saved to OUTPUT_FOLDER?

In [14]:
! python ./preprocess_data.py --raw_data_path . --dest_path ./output

In [4]:
print('There are four files saved: dv, test, train and valid')

There are four files saved: dv, test, train and valid


## Q3. How many parameters are automatically logged by MLflow?

In [6]:
import os
os.environ["GIT_PYTHON_REFRESH"] = "quiet"
import git

In [15]:
! python ./train.py --data_path ./output

2022/05/30 17:11:59 INFO mlflow.tracking.fluent: Experiment with name 'mlops-experiment' does not exist. Creating a new experiment.


There are 17 parameters
<img src="1.png">

## Q4. In addition to backend-store-uri, what else do you need to pass to properly configure the server?

mlflow ui --backend-store-uri file:///Users/gpires/mlruns --default-artifact-root ./artifacts

## Q5. What's the best validation RMSE that you got?

In [16]:
! python ./hpo.py --data_path ./output


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]
  2%|2         | 1/50 [00:17<14:02, 17.20s/trial, best loss: 6.658956269343007]
  4%|4         | 2/50 [00:17<05:57,  7.44s/trial, best loss: 6.658956269343007]
  6%|6         | 3/50 [00:18<03:33,  4.54s/trial, best loss: 6.658956269343007]
  8%|8         | 4/50 [00:32<06:08,  8.01s/trial, best loss: 6.651438559376775]
 10%|#         | 5/50 [00:37<05:11,  6.92s/trial, best loss: 6.651438559376775]
 12%|#2        | 6/50 [00:56<08:15, 11.26s/trial, best loss: 6.651438559376775]
 14%|#4        | 7/50 [01:15<09:49, 13.71s/trial, best loss: 6.651438559376775]
 16%|#6        | 8/50 [01:17<06:54,  9.88s/trial, best loss: 6.651438559376775]
 18%|#8        | 9/50 [01:28<06:55, 10.14s/trial, best loss: 6.651438559376775]
 20%|##        | 10/50 [01:37<06:32,  9.81s/trial, best loss: 6.651438559376775]
 22%|##2       | 11/50 [01:45<06:04,  9.35s/trial, best loss: 6.642137287429206]
 24%|##4       | 12/50 [01:51<05:15,  8.30s/trial, best loss: 

In [23]:
MLFLOW_TRACKING_URI = "file:///Users/gpires/mlruns"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 10",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 6374d61772234a8896167bcfe4060246, rmse: 6.6284
run id: b137e0630c4d4c7eaf1aab651fb40040, rmse: 6.6297
run id: b9a511c5b2694cbbba8c7735b2f1b027, rmse: 6.6299
run id: cb28ef5fb0bd4d8b8aa1e5d7a80e60c1, rmse: 6.6299
run id: d451dc69f0e04661a01039b6cbb8f5ef, rmse: 6.6299


In [24]:
print('The best rmse was 6.6284')

The best rmse was 6.6284


## Q6. What is the test RMSE of the best model?

In [26]:
! python ./register_model.py --data_path ./output

In [33]:
MLFLOW_TRACKING_URI = "file:///Users/gpires/mlruns"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

runs = client.search_runs(
    experiment_ids='3',
    filter_string="metrics.test_rmse < 10",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['test_rmse']:.4f}")

run id: c44791883643485497b78daf31226bc9, rmse: 6.5489
run id: 46a9d8f147a94dfda3571a5cf66fab48, rmse: 6.5491
run id: 1528d15a554d4ef6beffba1ccc8af0dd, rmse: 6.5491
run id: a85ac50e475a47e48d04cb331def1b4c, rmse: 6.5498
run id: 5964c59b947f4d20a2447ddc911353a5, rmse: 6.5479


In [34]:
print('The best rmse was 6.5489')

The best rmse was 6.5489
