# Evidently Dashboard Prep

In [5]:
import os
import sys
import cdsw
import pickle
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset

# Install cmlapi package
try:
    import cmlapi
except ModuleNotFoundError:
    import os

    cluster = os.getenv("CDSW_API_URL")[:-1] + "2"
    !pip3 install {cluster}/python.tar.gz
    import cmlapi

from utils.utils import get_latest_deployment_details
from utils.inference_utils import ThreadedModelRequest, cast_date_as_str_for_json

%load_ext autoreload
%autoreload 2
%load_ext lab_black

[autoreload of pandas.compat failed: Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/usr/local/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/usr/local/lib/python3.8/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 843, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/cdsw/.local/lib/python3.8/site-packages/pandas/compat/__init__.py", line 15, in <module>
    from pandas.compat.numpy import (
ImportError: cannot import name 'is_numpy_dev' from 'pandas.compat.numpy' (/home/cds

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


## Simulation

### 1. Score Train Data

In [4]:
# format train data for inference
train_df = pd.read_pickle("../data/working/train_df.pkl")
train_records = cast_date_as_str_for_json(train_df).to_dict(orient="records")

AttributeError: Can't get attribute 'new_block' on <module 'pandas.core.internals.blocks' from '/home/cdsw/.local/lib/python3.8/site-packages/pandas/core/internals/blocks.py'>

In [None]:
# obtain model deployment info
client = cmlapi.default_client()
latest_deployment_details = get_latest_deployment_details(
    client=client, model_name="Price Regressor8"
)

In [5]:
%%time
tmr = ThreadedModelRequest(deployment_details=latest_deployment_details)
train_inference_metadata = tmr.threaded_call(train_records)

CPU times: user 42 s, sys: 1.16 s, total: 43.1 s
Wall time: 7min 1s


In [6]:
len(train_inference_metadata["id_uuid_mapping"])

11741

In [7]:
# with open("train_inference_metadata.pkl", "wb") as f:
#     pickle.dump(train_inference_metadata, f)

In [8]:
# with open("train_inference_metadata.pkl", "rb") as f:
#     train_inference_metadata = pickle.load(f)

#### Create master id <--> uuid mapping: this gets populated as soon as a property is "listed"

In [9]:
master_id_uuid_mapping = {}
master_id_uuid_mapping.update(train_inference_metadata["id_uuid_mapping"])

In [10]:
len(master_id_uuid_mapping)

11741

### 1.a Add delayed metrics

In [14]:
def add_delayed_metrics(uuids, ground_truths, sold_dates):
    """
    Add delayed metrics to CML Model Metrics database provided a
    list of prediction UUID's and corresponding list of ground truth values.

    """

    if len(uuids) != len(ground_truths) != len(sold_dates):
        raise ValueError(
            "UUIDs, ground_truths, and sold_dates must be of same length and correspond by index."
        )

    for uuid, gt, ds in zip(uuids, ground_truths, sold_dates):
        cdsw.track_delayed_metrics(
            metrics={"ground_truth": gt, "date_sold": ds}, prediction_uuid=uuid
        )

    print(f"Sucessfully added ground truth values to {len(uuids)} records.")

In [15]:
# get list of uuids from train_inference_metadata
train_uuids = list(train_inference_metadata["id_uuid_mapping"].values())

# get list of prices
train_gts = train_df[
    train_df.id.isin(train_inference_metadata["id_uuid_mapping"].keys())
].price.tolist()

# get list of sold_dates
train_sold_dates = train_df[
    train_df.id.isin(train_inference_metadata["id_uuid_mapping"].keys())
].date_sold.tolist()

In [16]:
%%time

add_delayed_metrics(train_uuids, train_gts, train_sold_dates)

Sucessfully added ground truth values to 11741 records.
CPU times: user 2min 3s, sys: 3.94 s, total: 2min 7s
Wall time: 3min 45s


### 1.b Query the metadata store

In [17]:
metrics = cdsw.read_metrics(
    model_deployment_crn=latest_deployment_details["latest_deployment_crn"],
    start_timestamp_ms=train_inference_metadata["start_timestamp_ms"],
    end_timestamp_ms=train_inference_metadata["end_timestamp_ms"],
)

In [18]:
from typing import Dict


def format_model_metrics_query(metrics: Dict):
    """
    Accepts the response dictionary from `cdsw.read_metrics()`, filters out any non-metrics columns,
    and formats as Dataframe.
    """
    metrics = pd.json_normalize(metrics["metrics"])

    return metrics[
        [col for col in metrics.columns if col.split(".")[0] == "metrics"]
        + ["predictionUuid"]
    ].rename(columns={col: col.split(".")[-1] for col in metrics.columns})

In [19]:
train_metrics_df = format_model_metrics_query(metrics)

In [20]:
train_metrics_df

Unnamed: 0,date_sold,ground_truth,view,zipcode,bedrooms,sqft_lot,bathrooms,condition,waterfront,sqft_living,predicted_result,predictionUuid
0,2014-07-25,275000.0,0,98030,4,26114,1.00,5,0,1080,228819.312958,c6763e95-3460-487c-8a0a-fdf95e9a1267
1,2014-09-26,775000.0,0,98077,4,34513,2.50,3,0,3890,815129.692229,6e442c4f-572f-4f54-ae48-07454343fb08
2,2014-07-21,265000.0,0,98022,3,10489,1.50,5,0,1560,262579.397453,4b45150c-988a-44a4-aee4-f910a20feaa9
3,2014-07-22,372500.0,0,98126,2,2958,2.50,3,0,1400,377354.064458,86668c4e-4a63-44c8-a043-ca966e4dd3b7
4,2014-06-03,235000.0,0,98056,3,15603,1.00,4,0,1250,288589.081822,51f4e54c-eff2-41a6-86a2-1088f657c4a5
...,...,...,...,...,...,...,...,...,...,...,...,...
11736,2014-05-02,308500.0,0,98155,2,6174,1.00,4,0,850,287682.216986,bf47bf4d-fd2b-4f2d-b3bc-b66340f445dd
11737,2014-10-27,1049990.0,0,98053,5,9588,3.25,3,0,4240,993825.418767,206b1531-28e7-42b6-8320-567fc93441c0
11738,2014-06-10,589900.0,0,98027,4,35889,4.50,3,0,3870,875723.661588,d24a3379-5fc8-4a90-9491-fafd9d9a29d6
11739,2014-07-08,600000.0,0,98034,4,11370,2.50,3,0,2250,500175.417481,8a5a34da-fd62-440f-9895-613038ae7a13


### 2. Create Simulation

In [39]:
prod_path = "../data/working/prod_df.pkl"
prod_df = pd.read_pickle(prod_path)

In [40]:
# number of months in prod set
n_months = int(
    np.ceil(
        (prod_df.date_sold.max() - prod_df.date_sold.min()) / np.timedelta64(1, "M")
    )
)

In [41]:
# construct date ranges to iterate through as simulation of time (include left, exclude right)

date_ranges = [
    [
        (prod_df.date_sold.min() + DateOffset(months=n)),
        (prod_df.date_sold.min() + DateOffset(months=n + 2)),
    ]
    for n in range(0, n_months, 2)
]

# increase first date range to account for records that listed during the train_df timeframe
date_ranges[0][0] = date_ranges[0][0] - DateOffset(years=1)

In [24]:
date_ranges

[[Timestamp('2013-11-01 00:00:00'), Timestamp('2015-01-01 00:00:00')],
 [Timestamp('2015-01-01 00:00:00'), Timestamp('2015-03-01 00:00:00')],
 [Timestamp('2015-03-01 00:00:00'), Timestamp('2015-05-01 00:00:00')],
 [Timestamp('2015-05-01 00:00:00'), Timestamp('2015-07-01 00:00:00')]]

### Logic for one loop

#### Query prod_df for new listings in new date range + make inference

In [25]:
new_listings_records = prod_df.loc[
    prod_df.date_listed.between(date_ranges[0][0], date_ranges[0][1], inclusive="left")
]

new_listings_records = cast_date_as_str_for_json(new_listings_records).to_dict(
    orient="records"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [26]:
%%time
tmr = ThreadedModelRequest(deployment_details=latest_deployment_details)
new_listing_inference_metadata = tmr.threaded_call(new_listings_records)

CPU times: user 13.6 s, sys: 310 ms, total: 13.9 s
Wall time: 2min 16s


In [27]:
len(new_listing_inference_metadata["id_uuid_mapping"])

3840

In [28]:
master_id_uuid_mapping.update(new_listing_inference_metadata["id_uuid_mapping"])

In [29]:
len(master_id_uuid_mapping)

15581

#### Query prod_df for newly sold properties in new date range + assign ground truth to records in metric store

In [42]:
new_sold_records = prod_df.loc[
    prod_df.date_sold.between(date_ranges[0][0], date_ranges[0][1], inclusive="left")
]

In [43]:
# get list of uuids from train_inference_metadata
new_sold_uuids = new_sold_records.id.apply(lambda x: master_id_uuid_mapping[x]).tolist()

In [46]:
# get list of ground truth prices for newly sold properties
new_sold_gts = prod_df[prod_df.id.isin(new_sold_records.id)].price.tolist()

# get list of sold_dates for newly sold properties
new_sold_dates = (
    prod_df[prod_df.id.isin(new_sold_records.id)].date_sold.astype(str).tolist()
)

In [48]:
%%time
add_delayed_metrics(new_sold_uuids, new_sold_gts, new_sold_dates)

Sucessfully added ground truth values to 2858 records.
CPU times: user 30.2 s, sys: 1.09 s, total: 31.3 s
Wall time: 55 s


#### Query metric store for newly sold records for evidently reporting

In [49]:
# because I cant query by UUID, I must query all records, then filter to new_sold by uuid
metrics = cdsw.read_metrics(
    model_deployment_crn=latest_deployment_details["latest_deployment_crn"]
)

metrics_df = format_model_metrics_query(metrics)

In [50]:
new_sold_metrics_df = metrics_df[metrics_df.predictionUuid.isin(new_sold_uuids)]

In [51]:
new_sold_metrics_df

Unnamed: 0,date_sold,ground_truth,view,zipcode,bedrooms,sqft_lot,bathrooms,condition,waterfront,sqft_living,predicted_result,predictionUuid
11741,2014-11-01,353000.0,0,98115,3,864,2.50,3,0,1250,4.823624e+05,547c8268-533d-44f3-b0dd-ac288048e513
11742,2014-11-01,1750000.0,0,98004,4,8975,2.75,3,0,3560,1.351848e+06,aed80c8e-8013-4d64-9648-642ebb0125e4
11743,2014-11-01,644500.0,0,98074,4,5342,2.50,3,0,2990,6.780784e+05,eb8dc1e6-a418-454a-9bbe-45587c05f582
11744,2014-11-01,399000.0,0,98065,3,3690,2.50,3,0,1740,3.829369e+05,2323a343-789a-4559-b360-1bc34627f8dd
11745,2014-11-02,825000.0,0,98117,4,4000,2.50,5,0,2560,7.390113e+05,585a579c-e08e-45d0-8de4-c5e39789b848
...,...,...,...,...,...,...,...,...,...,...,...,...
14594,2014-12-31,261000.0,0,98038,3,7686,1.75,3,0,1350,2.635794e+05,26b40278-706b-4df9-add9-178653007172
14595,2014-12-31,665000.0,0,98075,4,5936,2.50,3,0,2510,6.063297e+05,09d5f10a-de06-4bad-9973-d924f61c4e51
14596,2014-12-31,219200.0,0,98002,3,7000,2.00,4,0,1680,2.355372e+05,e4ab0287-7cd6-4fd7-a33a-d725d43b24f8
14597,2014-12-31,310000.0,0,98133,3,1361,2.00,3,0,1310,3.323658e+05,9580dce7-a862-44c6-815c-491b548c094d


## Build Reporting Dashboard

In [55]:
from evidently.dashboard import Dashboard
from evidently.tabs import DataDriftTab, NumTargetDriftTab, RegressionPerformanceTab

In [56]:
TARGET = 'ground_truth'
PREDICTION = 'predicted_result'
NUM_FEATURES = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot"]
CAT_FEATURES = ["waterfront", "zipcode", "condition", "view"]


column_map = {
    'target': TARGET,
    'prediction': PREDICTION,
    'numerical_features': NUM_FEATURES,
    'categorical_features': CAT_FEATURES
}

dashboard = Dashboard(tabs=[DataDriftTab, NumTargetDriftTab, RegressionPerformanceTab])
dashboard.calculate(reference_data=)


TypeError: calculate() missing 2 required positional arguments: 'reference_data' and 'current_data'

In [60]:
train_metrics_df.head()

TypeError: descriptor 'axes' for 'BlockManager' objects doesn't apply to a 'BlockManager' object