#### Question 1. Install MLflow (1 point)

In [3]:
import mlflow
mlflow.__version__

'2.13.0'

#### Question 2. Download and preprocess the data (1 point)

In [6]:
!ls /mlops-zoomcamp/cohorts/2024/02-experiment-tracking/homework/output

dv.pkl    test.pkl  train.pkl val.pkl


#### Question 3. Train a model with autolog (1 point)

##### train_modified.py:

In [None]:
import os
import pickle
import click
import mlflow
import mlflow.sklearn

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Return the object deserialized from the pickle file at ``filename``."""
    # Binary mode is required: pickle streams are bytes, not text.
    with open(filename, "rb") as handle:
        payload = pickle.load(handle)
    return payload


@click.command()
@click.option(
    "--data_path",
    default="./output",
    help="Location where the processed NYC taxi trip data was saved"
)
def run_train(data_path: str):
    """Train a random forest regressor with MLflow autologging and print the validation RMSE."""
    # Autologging must be switched on before `fit` is called so that the
    # training call is captured by MLflow.
    mlflow.sklearn.autolog()

    # Each pickle holds an (X, y) pair; both files are read from `data_path`.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        # Fixed depth and seed keep the run reproducible.
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Take the square root explicitly instead of passing `squared=False`:
        # that keyword is deprecated since scikit-learn 1.4 and removed in 1.6,
        # whereas this form works on every version.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        print(f"RMSE: {rmse}")


# Script entry point: click parses the command-line options and invokes run_train.
if __name__ == "__main__":
    run_train()

In [6]:
%run train_modified.py --data_path output



RMSE: 5.431162180141208


Answer: the value of `min_samples_split` is 2 (the default).

#### Question 4. Launch the tracking server locally (1 point)

In [None]:
!mlflow server \
    --backend-store-uri sqlite:///mlruns/mlflow.db \
    --default-artifact-root ./artifacts

#### Question 5. Tune model hyperparameters (1 point)

In [1]:
%run hpo.py

100%|██████████| 15/15 [00:34<00:00,  2.30s/trial, best loss: 5.335419588556921]
Best hyperparameters:  {'max_depth': 19.0, 'min_samples_leaf': 2.0, 'min_samples_split': 2.0, 'n_estimators': 11.0}


#### Question 6. Promote the best model to the model registry (1 point)

In [3]:
%run register_model.py --data_path ./output --top_n 5

Registered model 'best-random-forest-model' already exists. Creating a new version of this model...
2024/05/29 23:23:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best-random-forest-model, version 3


Best model run ID: 26ff5b64796c4aceb1f57de545f6f303
Test RMSE of the best model: 5.567408012462019


Created version '3' of model 'best-random-forest-model'.
