<h2>Getting familiar with MLflow</h2>

In [54]:
# import packages
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import mlflow
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error


<h3>Q1. Install MLflow</h3>

In [2]:
# Check the version of MLflow
print(mlflow.__version__)

2.13.0


<h3>Q2. Download and preprocess the data</h3>

In [3]:
# download the data into ./data, the folder that preprocess_data.py (--raw_data_path data)
# and read_dataframe("./data/...") read from later in this notebook
# -nc: do not re-download files that already exist; -P: directory to save into
!wget -nc -P data https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
!wget -nc -P data https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
!wget -nc -P data https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet

--2024-05-28 21:35:54--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.96, 3.163.157.7, 3.163.157.133, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427002 (1.4M) [binary/octet-stream]
Saving to: ‘green_tripdata_2023-01.parquet’


2024-05-28 21:35:54 (4.09 MB/s) - ‘green_tripdata_2023-01.parquet’ saved [1427002/1427002]

--2024-05-28 21:35:55--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.7, 3.163.157.133, 3.163.157.96, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1533740 (1.5M) [binary/octet-stream]
Saving to: ‘g

```bash
# execute the command in terminal
python preprocess_data.py --raw_data_path data --dest_path ./output
```

<h3>Q3. Train a model with autolog</h3>

```bash
# execute the command in terminal
python train_model.py
```

<h3>Q5. Tune model hyperparameters</h3>

In [26]:
# import the mlflow.client class
from mlflow.tracking import MlflowClient

# set tracking server
TRACKING_SERVER_HOST = "127.0.0.1"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [27]:
# initialize the client
client = MlflowClient(f"http://{TRACKING_SERVER_HOST}:5000")

In [28]:
# List all experiments using mlflow.search_experiments method
experiments = client.search_experiments()
experiments

[<Experiment: artifact_location=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
  'Tracking/homework/artifacts_local/1'), creation_time=1716965086612, experiment_id='1', last_update_time=1716965086612, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>,
 <Experiment: artifact_location=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
  'Tracking/homework/artifacts_local/0'), creation_time=1716964325321, experiment_id='0', last_update_time=1716964325321, lifecycle_stage='active', name='Default', tags={}>]

In [29]:
# get best runs, filter runs and order by rmse
from mlflow.entities import ViewType

# Fetch at most five active runs of experiment '1' whose RMSE is below 5.35,
# best (lowest RMSE) first.
runs = client.search_runs(
    experiment_ids=['1'],  # the API documents this parameter as a list of experiment IDs
    filter_string="metrics.rmse < 5.35",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [30]:
runs

[<Run: data=<RunData: metrics={'rmse': 5.335419588556921}, params={'max_depth': '19',
  'min_samples_leaf': '2',
  'min_samples_split': '2',
  'n_estimators': '11',
  'random_state': '42'}, tags={'developer': 'chuks',
  'logtype': 'auto',
  'mlflow.log-model.history': '[{"run_id": "710002e39a06460cb003259a5d051c3d", '
                              '"artifact_path": "random-forest-model", '
                              '"utc_time_created": "2024-05-29 '
                              '06:45:19.265006", "flavors": {"python_function": '
                              '{"model_path": "model.pkl", "predict_fn": '
                              '"predict", "loader_module": "mlflow.sklearn", '
                              '"python_version": "3.9.19", "env": {"conda": '
                              '"conda.yaml", "virtualenv": "python_env.yaml"}}, '
                              '"sklearn": {"pickled_model": "model.pkl", '
                              '"sklearn_version": "1.5.0", '
          

In [36]:
# promote model to model registry
# The first entry of `runs` is used; the search that produced `runs` orders by
# "metrics.rmse ASC", so this is the lowest-RMSE run it returned.
best_run = runs[0]
run_id = best_run.info.run_id

# URI of the model artifact logged under that run
logged_model = f'runs:/{run_id}/random-forest-model'

# register a new version of the model in mlflow
mlflow.register_model(
    name='nyc-taxi-with-random-forest',
    model_uri=logged_model,
)


Successfully registered model 'nyc-taxi-with-random-forest'.
2024/05/28 23:24:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-with-random-forest, version 1
Created version '1' of model 'nyc-taxi-with-random-forest'.


<ModelVersion: aliases=[], creation_timestamp=1716967487448, current_stage='None', description='', last_updated_timestamp=1716967487448, name='nyc-taxi-with-random-forest', run_id='710002e39a06460cb003259a5d051c3d', run_link='', source=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
 'Tracking/homework/artifacts_local/1/710002e39a06460cb003259a5d051c3d/artifacts/random-forest-model'), status='READY', status_message='', tags={}, user_id='', version='1'>

In [40]:
# update the description of the registered model version
# NOTE(review): this call passes only a description; the text mentions a transition
# to `new_stage`, but no stage change is requested here - confirm where that happens.
from datetime import datetime
date = datetime.today().date()

model_version = 1
new_stage = "Staging"

# build the description first so the registry call itself stays short
version_description = f"This version of random forest is version {model_version} and is being transitioned to {new_stage} on {date}."

client.update_model_version(
    name='nyc-taxi-with-random-forest',
    version=model_version,
    description=version_description,
)

<ModelVersion: aliases=[], creation_timestamp=1716967487448, current_stage='Staging', description=('This version of random forest is version 1 and is being transitioned to '
 'Staging on 2024-05-28.'), last_updated_timestamp=1716969069037, name='nyc-taxi-with-random-forest', run_id='710002e39a06460cb003259a5d051c3d', run_link='', source=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
 'Tracking/homework/artifacts_local/1/710002e39a06460cb003259a5d051c3d/artifacts/random-forest-model'), status='READY', status_message='', tags={}, user_id='', version='1'>

In [43]:
# transition model to a different stage
# Moves version `model_version` of the registered model into the stage named by `new_stage`.
# archive_existing_versions=False: versions already in the target stage are not archived.
# NOTE(review): model stages are deprecated in recent MLflow releases in favour of
# aliases; the warning captured in this cell's output points at this call - confirm
# before upgrading MLflow.
new_stage="Production"
client.transition_model_version_stage(
    name='nyc-taxi-with-random-forest', 
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)


  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1716967487448, current_stage='Production', description=('This version of random forest is version 1 and is being transitioned to '
 'Staging on 2024-05-28.'), last_updated_timestamp=1716969199385, name='nyc-taxi-with-random-forest', run_id='710002e39a06460cb003259a5d051c3d', run_link='', source=('/Users/chuksokoli/Documents/mlops zoomcamp 2024/02 - Experiment '
 'Tracking/homework/artifacts_local/1/710002e39a06460cb003259a5d051c3d/artifacts/random-forest-model'), status='READY', status_message='', tags={}, user_id='', version='1'>

In [47]:
# read the data and remove outliers
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load a trip parquet file, add a trip duration and drop outlier trips.

    Args:
        filename: Path of the parquet file to read. It must provide the columns
            ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID``.

    Returns:
        A new DataFrame that keeps only trips lasting between 1 and 60 minutes
        (both bounds included), with an added ``duration`` column in minutes
        and the two location ID columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Vectorised conversion of the pickup -> dropoff timedelta to minutes.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# preprocess the dataset
def preprocess(df, dv):
    """Turn a trips DataFrame into the feature matrix produced by ``dv``.

    Adds a ``PU_DO`` column to ``df`` (in place) by joining ``PULocationID``
    and ``DOLocationID`` with an underscore, then converts the ``PU_DO`` and
    ``trip_distance`` columns to a list of per-row dicts and passes it to
    ``dv.transform``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        dv: Object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the row dicts.
    """
    pickup_dropoff = df['PULocationID'] + '_' + df['DOLocationID']
    df['PU_DO'] = pickup_dropoff

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)

def test_model(name, stage, X_test, y_test):
    """Score a registered model on held-out data.

    Loads the model at ``models:/{name}/{stage}`` through
    ``mlflow.pyfunc.load_model``, predicts on ``X_test`` and compares the
    predictions with ``y_test``.

    Args:
        name: Registered model name.
        stage: Stage segment of the model URI (e.g. the value of ``new_stage``).
        X_test: Features passed to the model's ``predict``.
        y_test: True target values.

    Returns:
        A dict ``{"rmse": <root mean squared error>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)

    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # prefer the dedicated function and only fall back on older releases.
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:  # scikit-learn < 1.4
        rmse = mean_squared_error(y_test, y_pred, squared=False)
    else:
        rmse = root_mean_squared_error(y_test, y_pred)

    return {"rmse": rmse}

In [48]:
# using prebuilt function to decide which model to promote to Production with test data
df = read_dataframe("./data/green_tripdata_2023-03.parquet")
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2023-03-01 00:25:10,2023-03-01 00:35:47,N,1.0,82,196,1.0,2.36,13.5,...,0.5,0.0,0.0,,1.0,16.0,2.0,1.0,0.0,10.616667
1,2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1.0,7,7,1.0,0.78,-6.5,...,-0.5,0.0,0.0,,-1.0,-9.0,3.0,1.0,0.0,10.583333
2,2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1.0,7,7,1.0,0.78,6.5,...,0.5,0.0,0.0,,1.0,9.0,3.0,1.0,0.0,10.583333
3,2,2023-02-28 22:59:46,2023-02-28 23:08:38,N,1.0,166,74,1.0,1.66,11.4,...,0.5,2.78,0.0,,1.0,16.68,1.0,1.0,0.0,8.866667
4,2,2023-03-01 00:54:03,2023-03-01 01:03:14,N,1.0,236,229,1.0,3.14,15.6,...,0.5,4.17,0.0,,1.0,25.02,1.0,1.0,2.75,9.183333


In [55]:
import pickle

def load_pickle(filename: str):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    NOTE: unpickling can execute arbitrary code - only load files you trust.
    """
    with open(filename, "rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

X_test, y_test = load_pickle(os.path.join('./output', "test.pkl"))

In [59]:
%time test_model(name='nyc-taxi-with-random-forest', stage=new_stage, X_test=X_test, y_test=y_test)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


CPU times: user 209 ms, sys: 50.8 ms, total: 260 ms
Wall time: 364 ms




{'rmse': 5.567408012462019}