# Cleanup Model Monitoring Outputs

In [1]:
# Load the `lab_black` extension (presumably auto-formats executed cells with
# the `black` code formatter - confirm against the extension's docs)
%load_ext lab_black
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to project modules are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

::: {.content-hidden}
Import necessary Python modules
:::

In [2]:
import os
import shutil
import sys
from glob import glob

import mlflow.sklearn
import pandas as pd

::: {.content-hidden}
Get relative path to project root directory
:::

In [3]:
# The project root is the parent of the current working directory
PROJ_ROOT_DIR = os.pardir
# Make the modules stored under <project root>/src importable from here
src_dir = os.path.join(PROJ_ROOT_DIR, "src")
sys.path += [src_dir]

::: {.content-hidden}
Import custom Python modules
:::

In [4]:
%aimport model_helpers
import model_helpers as modh

## About

### Overview
This step cleans up all project resources that were created, including

1. intermediate (non-MLFlow) files
2. MLFlow outputs (artifacts, [metrics](https://mlflow.org/docs/latest/tracking.html#performance-tracking-with-metrics), etc.)

### Order of Operations

This step can be run at the end of the project, after the campaign has ended and its outcomes have been analyzed.

## User Inputs

Define the following

1. file name prefixes for the following MLFlow artifacts associated with the best deployment candidate MLFlow ML model
   - processed data
   - threshold tuning data, where predictions per threshold were evaluated using the test data split
   - estimated sample sizes, where sizes were estimated using the test data split
   - audience cohorts created using the inference data split
   - audience profiles using the inference data split
   - post-campaign impact evaluation

In [5]:
#| echo: true
# One file name prefix per MLFlow artifact that will be cleaned up
artifact_file_prefixes = [
    "processed_data",  # processed data
    "threshold_tuning_data",  # predictions per threshold (test split)
    "audience_sample_sizes",  # estimated sample sizes (test split)
    "audience_cohorts",  # audience cohorts (inference split)
    "audience_profiles",  # audience profiles (inference split)
    "campaign_analysis",  # post-campaign impact evaluation
]

::: {.content-hidden}
Get path to data sub-folders
:::

In [6]:
# Top-level data folder, followed by its raw and processed sub-folders
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, sub_dir) for sub_dir in ("raw", "processed")
)

::: {.content-hidden}
Define MLFlow storage paths
:::

In [24]:
# SQLite database file used as the MLFlow tracking store
mlruns_db_fpath = f"{raw_data_dir}/mlruns.db"
mlflow.set_tracking_uri("sqlite:///" + mlruns_db_fpath)
# `mlruns` directory inside the current working directory, expressed as a
# path relative to the current working directory
mlflow_artifact_fpath = os.path.relpath(os.path.join(os.getcwd(), "mlruns"))

::: {.content-hidden}
Set environment variable to silence MLFlow `git` warning message
:::

In [8]:
# Silence the `git` warning message that would otherwise be shown by MLFlow
os.environ.update({"GIT_PYTHON_REFRESH": "quiet"})

::: {.content-hidden}
Define a helper function to get the filepath to an MLFlow artifact and the associated local (non-MLFlow) file
:::

In [10]:
def show_artifact_and_local_filepath_for_run_id(
    df: pd.DataFrame, file_prefix: str, local_dir: str
) -> list:
    """Get artifact and local file associated with a run ID.

    Parameters
    ----------
    df : pd.DataFrame
        single-row frame with a `run_id` column identifying the MLFlow run
    file_prefix : str
        leading part of the artifact file name, i.e. the text that comes
        before `__run_`
    local_dir : str
        directory in which the local (non-MLFlow) copy of the file is
        expected to be found

    Returns
    -------
    list
        two-element list `[artifact_fpath, local_fpath]`, where
        `artifact_fpath` is the path to the downloaded MLFlow artifact and
        `local_fpath` is a file with the same name inside `local_dir`

    Raises
    ------
    ValueError
        if `df` does not contain exactly one row
    FileNotFoundError
        if the run has no artifact whose name matches `file_prefix`
    """
    run_ids = df["run_id"]
    if len(run_ids) != 1:
        raise ValueError(
            f"Expected exactly one run ID, but got {len(run_ids)} rows"
        )
    artifact_par_dir = mlflow.artifacts.download_artifacts(
        run_id=run_ids.iloc[0]
    )

    glob_str = f"{file_prefix}__run_*.parquet.gzip"
    artifact_glob_dir_str = os.path.join(artifact_par_dir, glob_str)
    # glob() returns matches in arbitrary order, so sort them to ensure that
    # the same (last) file is picked on every call
    artifact_fpaths = sorted(glob(artifact_glob_dir_str))
    if not artifact_fpaths:
        raise FileNotFoundError(
            f"No artifact matching {glob_str} found in {artifact_par_dir}"
        )
    artifact_fpath = artifact_fpaths[-1]

    # the local file shares its name with the artifact file
    artifact_fname = os.path.basename(artifact_fpath)
    local_fpath = os.path.join(local_dir, artifact_fname)
    return [artifact_fpath, local_fpath]

## Get Data

### Fetch Latest Version of Best Deployment Candidate Model

Get best deployment candidate model from model registry

In [9]:
#|echo: true
# Retrieve the deployment candidate model(s) through the project's
# `model_helpers` module; the result is later used as the frame from which
# a `run_id` is read (presumably a single-row DataFrame - confirm in `modh`)
df_candidate_mlflow_models = modh.get_all_deployment_candidate_models()

### Get Filepaths to MLFlow File Artifacts

Get the filepath to the MLFlow artifacts associated with the best deployment candidate model, as well as the filepath to the corresponding local file used when logging the artifact

In [None]:
#|echo: true
# Map every artifact file prefix to its [artifact_fpath, local_fpath] pair
artifacts_dict = {
    artifact_file_prefix: show_artifact_and_local_filepath_for_run_id(
        df=df_candidate_mlflow_models,
        file_prefix=artifact_file_prefix,
        local_dir=processed_data_dir,
    )
    for artifact_file_prefix in artifact_file_prefixes
}

## Cleanup

### (Non-MLFlow) Local Outputs

In [None]:
#|echo: true
#|output: false
# Every value of `artifacts_dict` is an [artifact_fpath, local_fpath] pair;
# only the local (non-MLFlow) file is deleted here
for artifact_and_local_fpaths in artifacts_dict.values():
    local_fpath = artifact_and_local_fpaths[1]
    local_fname_prefix = os.path.basename(local_fpath).split("__run")[0]
    try:
        os.remove(local_fpath)
    except FileNotFoundError:
        # file was already deleted (e.g. this cell was re-run), so there is
        # nothing left to clean up for this prefix
        print(f"Not found (skipped): {local_fname_prefix}")
    else:
        print(f"Deleted: {local_fname_prefix}")

### (Non-MLFlow) ML Development Experiment Outputs

In [26]:
#|echo: true
#|output: false
# Delete every ML development experiment output file in the raw data folder
ml_runs_glob = os.path.join(raw_data_dir, "ml__run_*__expt_*.parquet*")
for local_fpath in glob(ml_runs_glob):
    # text of the file name that precedes the first "__run"
    deleted_fname_prefix = os.path.basename(local_fpath).partition("__run")[0]
    os.remove(local_fpath)
    print(f"Deleted: {deleted_fname_prefix}")

### MLFlow Outputs

Delete the [MLFlow run-logging database file](https://mlflow.org/docs/latest/tracking.html#scenario-2-mlflow-on-localhost-with-sqlite)

In [22]:
#|echo: true
#|output: false
# Only delete the database file if it exists, so that re-running this cell
# after the file has been removed does not raise a FileNotFoundError
if os.path.isfile(mlruns_db_fpath):
    os.remove(mlruns_db_fpath)
    print(f"Removed MLFlow database at {os.path.basename(mlruns_db_fpath)}")
else:
    print(f"No MLFlow database found at {os.path.basename(mlruns_db_fpath)}")

Delete the [MLFlow artifact directory (`mlruns`)](https://mlflow.org/docs/latest/tracking.html#scenario-1-mlflow-on-localhost), and all the artifact sub-directories contained in that directory

In [25]:
#| echo: true
#|output: false
# Only delete the artifact directory tree if it exists, so that re-running
# this cell after the directory has been removed does not raise an error
if os.path.isdir(mlflow_artifact_fpath):
    shutil.rmtree(mlflow_artifact_fpath)
    print(
        "Removed local MLFlow artifact logging directory at "
        f"{os.path.basename(mlflow_artifact_fpath)}"
    )
else:
    print(
        "No local MLFlow artifact logging directory found at "
        f"{os.path.basename(mlflow_artifact_fpath)}"
    )

## Next Step

The next step will clean up all project resources related to BigQuery.