<h2>Getting familiar with MLflow</h2>

In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import mlflow
import preprocess_data


<h3>Q1. Install MLflow</h3>

In [2]:
# Check the version of MLflow
print(mlflow.__version__)

2.13.0


<h3>Q2. Download and preprocess the data</h3>

In [3]:
# download the data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet

--2024-05-28 21:35:54--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.96, 3.163.157.7, 3.163.157.133, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427002 (1.4M) [binary/octet-stream]
Saving to: ‘green_tripdata_2023-01.parquet’


2024-05-28 21:35:54 (4.09 MB/s) - ‘green_tripdata_2023-01.parquet’ saved [1427002/1427002]

--2024-05-28 21:35:55--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.7, 3.163.157.133, 3.163.157.96, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1533740 (1.5M) [binary/octet-stream]
Saving to: ‘g

```python
# execute the command in terminal
python preprocess_data.py --raw_data_path data --dest_path ./output

```

<h3>Q3. Train a model with autolog</h3>

```python
# execute the command in terminal
python train_model.py
```

<h3>Q5. Tune model hyperparameters</h3>

In [26]:
# import the mlflow.client class
from mlflow.tracking import MlflowClient

# set tracking server
TRACKING_SERVER_HOST = "127.0.0.1"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [27]:
# initialize the client
client = MlflowClient(f"http://{TRACKING_SERVER_HOST}:5000")

In [28]:
# List all experiments using mlflow.search_experiments method
experiments = client.search_experiments()
experiments

[<Experiment: artifact_location=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
  'Tracking/homework/artifacts_local/1'), creation_time=1716965086612, experiment_id='1', last_update_time=1716965086612, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>,
 <Experiment: artifact_location=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
  'Tracking/homework/artifacts_local/0'), creation_time=1716964325321, experiment_id='0', last_update_time=1716964325321, lifecycle_stage='active', name='Default', tags={}>]

In [29]:
# get best runs, filter runs and order by rmse
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 5.35",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [30]:
runs

[<Run: data=<RunData: metrics={'rmse': 5.335419588556921}, params={'max_depth': '19',
  'min_samples_leaf': '2',
  'min_samples_split': '2',
  'n_estimators': '11',
  'random_state': '42'}, tags={'developer': 'chuks',
  'logtype': 'auto',
  'mlflow.log-model.history': '[{"run_id": "710002e39a06460cb003259a5d051c3d", '
                              '"artifact_path": "random-forest-model", '
                              '"utc_time_created": "2024-05-29 '
                              '06:45:19.265006", "flavors": {"python_function": '
                              '{"model_path": "model.pkl", "predict_fn": '
                              '"predict", "loader_module": "mlflow.sklearn", '
                              '"python_version": "3.9.19", "env": {"conda": '
                              '"conda.yaml", "virtualenv": "python_env.yaml"}}, '
                              '"sklearn": {"pickled_model": "model.pkl", '
                              '"sklearn_version": "1.5.0", '
          

In [36]:
# promote model to model registry
run_id = runs[0].info.run_id
logged_model = f'runs:/{run_id}/random-forest-model'

# register a new version of the model in mlflow
mlflow.register_model(model_uri=logged_model, name='nyc-taxi-with-random-forest')


Successfully registered model 'nyc-taxi-with-random-forest'.
2024/05/28 23:24:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-with-random-forest, version 1
Created version '1' of model 'nyc-taxi-with-random-forest'.


<ModelVersion: aliases=[], creation_timestamp=1716967487448, current_stage='None', description='', last_updated_timestamp=1716967487448, name='nyc-taxi-with-random-forest', run_id='710002e39a06460cb003259a5d051c3d', run_link='', source=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
 'Tracking/homework/artifacts_local/1/710002e39a06460cb003259a5d051c3d/artifacts/random-forest-model'), status='READY', status_message='', tags={}, user_id='', version='1'>

In [37]:
import os
import pickle
import click

def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)
    
@click.command()
@click.option(
    "--data_path",
    default="./output",
    help="Location where the processed NYC taxi trip data was saved"
)
    
X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))


# making prediction

#y_pred = xgboost_model.predict(valid)

SyntaxError: invalid syntax (3754403422.py, line 16)