# Report Concept Draft

In [1]:
try:
    import evidently
except:
    !npm install -g yarn
    !pip install git+https://github.com/evidentlyai/evidently.git

In [6]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import (DataDrift, DataQuality, CatTargetDrift, NumTargetDrift, 
                                     RegressionPerformance, ClassificationPerformance)

## Prepare Datasets

In [3]:
# Dataset for Data Quality and Integrity
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

# Rows are split by education level: the three levels listed here form the
# "current" dataset, every other level forms the "reference" dataset.
current_education_levels = ['Some-college', 'HS-grad', 'Bachelors']
adult_ref = adult[~adult.education.isin(current_education_levels)]
# Take an explicit copy: adult_cur is written to below, and assigning into a
# boolean-mask selection of `adult` is the classic SettingWithCopy situation.
adult_cur = adult[adult.education.isin(current_education_levels)].copy()

# Overwrite the columns at positions 3 and 4 with NaN in the first 2000 rows
# (presumably to give the data quality report missing values to show).
adult_cur.iloc[:2000, 3:5] = np.nan

In [4]:
# Dataset for regression
# as_frame is a boolean for this loader; the string 'auto' only worked by being
# truthy and is rejected by scikit-learn versions that validate parameters.
housing_data = datasets.fetch_california_housing(as_frame=True)

# Non-inplace rename: produces a new frame and leaves housing_data.frame untouched.
housing = housing_data.frame.rename(columns={'MedHouseVal': 'target'})

# Synthetic "prediction": the true target plus Gaussian noise (mean 0, std 3).
# A seeded generator keeps the notebook reproducible under Restart & Run All.
prediction_noise = np.random.default_rng(0).normal(0, 3, housing.shape[0])
housing['prediction'] = housing_data['target'].values + prediction_noise

# Two independent 5000-row samples of the same frame; different seeds so the
# reference and current sets are not the identical sample.
housing_ref = housing.sample(n=5000, replace=False, random_state=0)
housing_cur = housing.sample(n=5000, replace=False, random_state=1)

In [5]:
# Dataset for binary probabilistic classification
# as_frame is a boolean for this loader; the string 'auto' only worked by being
# truthy and is rejected by scikit-learn versions that validate parameters.
bcancer_data = datasets.load_breast_cancer(as_frame=True)
bcancer = bcancer_data.frame
bcancer_features = bcancer_data.feature_names.tolist()

# Seeded so the samples (and therefore the report) are reproducible; different
# seeds so the two draws are not the same rows.
# NOTE(review): both samples come from the same frame, so they can share rows.
bcancer_ref = bcancer.sample(n=300, replace=False, random_state=0)
bcancer_cur = bcancer.sample(n=200, replace=False, random_state=1)

# The model is trained on the reference sample only.
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
model.fit(bcancer_ref[bcancer_features], bcancer_ref.target)

# Store column 1 of predict_proba as the probabilistic prediction.
bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_features])[:, 1]
bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_features])[:, 1]

In [24]:
# Dataset for multiclass classification
# as_frame is a boolean for this loader; the string 'auto' only worked by being
# truthy and is rejected by scikit-learn versions that validate parameters.
iris_data = datasets.load_iris(as_frame=True)
iris = iris_data.frame
iris_features = iris_data.feature_names

# Seeded so the samples (and therefore the report) are reproducible; different
# seeds so the two draws are not the same rows.
# NOTE(review): both samples come from the same frame, so they can share rows.
iris_ref = iris.sample(n=75, replace=False, random_state=0)
iris_cur = iris.sample(n=75, replace=False, random_state=1)

# The model is trained on the reference sample only.
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=3)
model.fit(iris_ref[iris_features], iris_ref.target)

# Predicted labels (not probabilities) are stored for the multiclass case.
iris_ref['prediction'] = model.predict(iris_ref[iris_features])
iris_cur['prediction'] = model.predict(iris_cur[iris_features])

## How to create a visual Report (ex. Dashboard)?

In [7]:
# Build a report from the DataDrift preset and evaluate it on the adult split.
data_drift_report = Report(metrics=[DataDrift()])
data_drift_report.run(
    reference_data=adult_ref,
    current_data=adult_cur,
)

# Bare last expression: the report object is shown as the cell output.
data_drift_report

In [11]:
#data_drift_report.save_html('data_drift_report.html')

## How to create a json Report (ex. Profile)?

In [8]:
# Serialize the report; the recorded output is a JSON string with
# "timestamp" and "metrics" keys at the top level.
data_drift_report.json()

'{"timestamp": "2022-09-06 11:20:05.723394", "metrics": {"DataDriftMetrics": {"n_features": 15, "n_drifted_features": 5, "share_drifted_features": 0.3333333333333333, "dataset_drift": false, "features": {"age": {"current_small_hist": [[0.02471021672878118, 0.025839691234843417, 0.0262859521410848, 0.025211766596857754, 0.015942967066340047, 0.010173168977679455, 0.0061528716099474344, 0.0018640278561586543, 0.000568686464590777, 0.0002369526935794904], [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0]], "ref_small_hist": [[0.02104876054252575, 0.020739077628796638, 0.02384558435714183, 0.026835959992838568, 0.018658395552179158, 0.012580868370245284, 0.00869047676652328, 0.0029516652714806184, 0.001287119610186633, 0.000348393277945254], [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0]], "feature_type": "num", "stattest_name": "Wasserstein distance (normed)", "p_value": 0.18534692319042428, "threshold": 0.1, "drift_detected": true}, "capital-gain": {"

In [10]:
#data_drift_report.save_json('data_drift_report.json')

## How to get a python object with Report's main data?

In [12]:
# Same report content as a Python dict ('timestamp' and 'metrics' keys in the
# recorded output), convenient for programmatic access.
data_drift_report.as_dict()

{'timestamp': '2022-09-06 11:20:52.809535',
 'metrics': {'DataDriftMetrics': {'n_features': 15,
   'n_drifted_features': 5,
   'share_drifted_features': 0.3333333333333333,
   'dataset_drift': False,
   'features': {'age': {'current_small_hist': [[0.02471021672878118,
       0.025839691234843417,
       0.0262859521410848,
       0.025211766596857754,
       0.015942967066340047,
       0.010173168977679455,
       0.0061528716099474344,
       0.0018640278561586543,
       0.000568686464590777,
       0.0002369526935794904],
      [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0]],
     'ref_small_hist': [[0.02104876054252575,
       0.020739077628796638,
       0.02384558435714183,
       0.026835959992838568,
       0.018658395552179158,
       0.012580868370245284,
       0.00869047676652328,
       0.0029516652714806184,
       0.001287119610186633,
       0.000348393277945254],
      [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0]],
     'featu

## What Reports are available?

In [13]:
# DataQuality preset, evaluated on the adult reference/current split.
data_quality_report = Report(metrics=[DataQuality()])
data_quality_report.run(
    reference_data=adult_ref,
    current_data=adult_cur,
)

# Bare last expression: the report object is shown as the cell output.
data_quality_report

In [15]:
# RegressionPerformance preset on the housing samples.
regression_performance_report = Report(metrics=[RegressionPerformance()])

# Both samples are handed over sorted by their index.
regression_performance_report.run(
    reference_data=housing_ref.sort_index(),
    current_data=housing_cur.sort_index(),
)

# Bare last expression: the report object is shown as the cell output.
regression_performance_report

In [17]:
# NumTargetDrift preset on the housing samples (passed in sampled order).
num_target_drift_report = Report(metrics=[NumTargetDrift()])
num_target_drift_report.run(
    reference_data=housing_ref,
    current_data=housing_cur,
)

# Bare last expression: the report object is shown as the cell output.
num_target_drift_report

In [16]:
# ClassificationPerformance preset on the breast-cancer samples.
classification_performance_report = Report(metrics=[ClassificationPerformance()])
classification_performance_report.run(
    reference_data=bcancer_ref,
    current_data=bcancer_cur,
)

# Bare last expression: the report object is shown as the cell output.
classification_performance_report

In [29]:
# CatTargetDrift preset on the iris samples.
# NOTE(review): the output recorded for this cell is "KeyError: 'p'"; the cause
# is not visible in this notebook — investigate before publishing.
cat_target_drift_report = Report(metrics=[CatTargetDrift()])
cat_target_drift_report.run(
    reference_data=iris_ref,
    current_data=iris_cur,
)

# Bare last expression: the report object is shown as the cell output.
cat_target_drift_report

KeyError: 'p'