# Evidently AI - Model Monitoring and Data Quality Evaluation

The notebook uses Evidently AI [open source package](https://github.com/evidentlyai/evidently) to evaluate the data quality, model performance, and monitor predictions for drift.

The datasets in this notebook assume you have already pushed the diabetes training dataset, test dataset, and predictions into Snowflake. If not, you will need to substitute your own datasets into the notebook.

## 1. Setup Environment

In [1]:
# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import Variant
from snowflake.snowpark.version import VERSION

# Snowpark ML
# Misc
import pandas as pd
import json
import logging 
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

from snowflake import connector
from snowflake.ml.utils import connection_params

In [2]:
#Evidently imports
from evidently import ColumnMapping
from evidently.test_suite import TestSuite

from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataStabilityTestPreset
from evidently.test_preset import DataDriftTestPreset
from evidently.test_preset import RegressionTestPreset
from evidently.test_preset import MulticlassClassificationTestPreset
from evidently.test_preset import BinaryClassificationTopKTestPreset
from evidently.test_preset import BinaryClassificationTestPreset

In [4]:
with open('../../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    PASSWORD = data['password']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']

CONNECTION_PARAMETERS = {
   "account": SF_ACCOUNT,
   "user": USERNAME,
   "password": PASSWORD,
}

session = Session.builder.configs(CONNECTION_PARAMETERS).create()

In [5]:
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "DOCAI"
Warehouse                   : "RAJIV"
Snowflake version           : 8.18.0
Snowpark for Python version : 1.11.1


## 2. Get Data

In [6]:
train_df= session.table("DIAB_TRAIN")
test_df = session.table("DIAB_TEST")

##if your dataset is too big, feel free to sample down

In [7]:
train = train_df.toPandas()
test = test_df.toPandas()

In [8]:
test.shape

(20436, 32)

In [9]:
test.head()

Unnamed: 0,RACE,GENDER,AGE,ADMISSION_TYPE_ID,DISCHARGE_DISPOSITION_ID,ADMISSION_SOURCE_ID,TIME_IN_HOSPITAL,MEDICAL_SPECIALTY,NUM_LAB_PROCEDURES,NUM_PROCEDURES,...,TROGLITAZONE,EXAMIDE,CITOGLIPTON,GLIPIZIDE-METFORMIN,GLIMEPIRIDE-PIOGLITAZONE,METFORMIN-ROSIGLITAZONE,METFORMIN-PIOGLITAZONE,CHANGE,DIABETESMED,READMITTED
0,Caucasian,Male,70+,Emergency,Discharged to Home,Emergency,5,,73,0,...,No,No,No,No,No,No,No,0,1,1
1,AfricanAmerican,Female,[20-50),Emergency,Discharged to Home,Emergency,9,,47,2,...,No,No,No,No,No,No,No,0,1,1
2,Caucasian,Male,70+,Emergency,Discharged to Home,Emergency,10,Family/GeneralPractice,55,1,...,No,No,No,No,No,No,No,0,1,0
3,AfricanAmerican,Male,[50-70),Emergency,Discharged to Home,Emergency,4,,45,4,...,No,No,No,No,No,No,No,1,1,1
4,Other,Male,70+,Elective,Discharged to Home,Referral,6,,42,2,...,No,No,No,No,No,No,No,1,1,0


## 3. Data Quality and Data Stability of the Training Dataset

In [10]:
from evidently import ColumnMapping
from evidently.test_suite import TestSuite

from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataStabilityTestPreset

In [11]:
data_stability = TestSuite(tests=[
    DataStabilityTestPreset(),
])

data_stability.run(reference_data=test, current_data=test)
data_stability

In [12]:
data_stability.as_dict()

{'tests': [{'name': 'Number of Rows',
   'description': 'The number of rows is 20436. The test threshold is eq=2.04e+04 ± 2.04e+03.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': {'value': 20436,
      'relative': 0.1,
      'absolute': 1e-12}},
    'value': 20436.0}},
  {'name': 'Number of Columns',
   'description': 'The number of columns is 32. The test threshold is eq=32.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'eq': 32}, 'value': 32.0}},
  {'name': 'Column Types',
   'description': 'The number of columns with a type mismatch is 0 out of 32.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'columns': [{'actual_type': 'object_',
      'column_name': 'RACE',
      'expected_type': 'object_'},
     {'actual_type': 'object_',
      'column_name': 'GENDER',
      'expected_type': 'object_'},
     {'actual_type': 'object_',
      'column_name': 'AGE',
      'expected_ty

In [13]:
data_quality = TestSuite(tests=[
    DataQualityTestPreset(),
])

data_quality.run(reference_data=test, current_data=test)
data_quality

In [14]:
data_quality.as_dict()

{'tests': [{'name': 'The Share of Missing Values in a Column',
   'description': 'The share of missing values in the column **TIME_IN_HOSPITAL** is 0. The test threshold is lte=0 ± 1e-12.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'lte': {'value': 0.0,
      'relative': 0.1,
      'absolute': 1e-12}},
    'value': 0.0,
    'column_name': 'TIME_IN_HOSPITAL'}},
  {'name': 'The Share of Missing Values in a Column',
   'description': 'The share of missing values in the column **NUM_MEDICATIONS** is 0. The test threshold is lte=0 ± 1e-12.',
   'status': 'SUCCESS',
   'group': 'data_integrity',
   'parameters': {'condition': {'lte': {'value': 0.0,
      'relative': 0.1,
      'absolute': 1e-12}},
    'value': 0.0,
    'column_name': 'NUM_MEDICATIONS'}},
  {'name': 'The Share of Missing Values in a Column',
   'description': 'The share of missing values in the column **NUMBER_EMERGENCY** is 0. The test threshold is lte=0 ± 1e-12.',
   'status': 'SU

## 4. Evaluate Model Performance

In [15]:
pred_df = session.table("DIABETES_TEST_SCORED")
preds = pred_df.toPandas()
preds['ACTUAL'] = test['READMITTED']

In [16]:
print(preds.shape)
preds.head()

(20436, 34)


Unnamed: 0,ACETOHEXAMIDE,TROGLITAZONE,DIAG_1,NUMBER_EMERGENCY,NUMBER_OUTPATIENT,DIAG_3,DISCHARGE_DISPOSITION_ID,METFORMIN-ROSIGLITAZONE,A1CRESULT,TOLBUTAMIDE,...,CHANGE,GLIMEPIRIDE-PIOGLITAZONE,ADMISSION_TYPE_ID,GLIPIZIDE-METFORMIN,DIABETESMED,NUM_MEDICATIONS,NUMBER_DIAGNOSES,AGE,READMITTED_PRED,ACTUAL
0,No,No,Circulatory,0,0,Genitourinary,Discharged to Home,No,,No,...,0,No,Emergency,No,0,12,8,70+,0,1
1,No,No,Digestive,0,0,Diabetes,Discharged to Home,No,,No,...,0,No,Emergency,No,1,11,5,[20-50),0,1
2,No,No,Diabetes,0,0,Circulatory,,No,>8,No,...,1,No,,No,1,14,9,70+,0,0
3,No,No,Circulatory,0,0,Other,,No,,No,...,0,No,,No,0,10,6,70+,0,1
4,No,No,Injury,0,0,Diabetes,,No,Norm,No,...,0,No,,No,0,16,8,70+,0,0


In [17]:
from evidently.report import Report
from evidently.metrics import ClassificationClassSeparationPlot
from evidently.metrics import ClassificationConfusionMatrix
from evidently.metrics import ClassificationPRCurve
from evidently.metrics import ClassificationPRTable
from evidently.metrics import ClassificationQualityByClass
from evidently.metrics import ClassificationQualityByFeatureTable
from evidently.metrics import ClassificationQualityMetric

In [18]:
column_mapping = ColumnMapping()
column_mapping.prediction = 'READMITTED_PRED'
column_mapping.target = 'ACTUAL'
column_mapping.task = 'classification'   
classification_report = Report(metrics=[
        ClassificationConfusionMatrix(),
        ClassificationQualityByClass(),
        ClassificationQualityMetric(),
    ])

classification_report.run(reference_data=None, current_data=preds, column_mapping=column_mapping)
#classification_report.save_html("report.html")
classification_report.as_dataframe()

{'ClassificationConfusionMatrix':   current_matrix_labels         current_matrix_values reference_matrix  \
 0                [0, 1]  [[7146, 3908], [6124, 3258]]             None   
 
   target_names                      metric_id  \
 0         None  ClassificationConfusionMatrix   
 
                         metric_hash  
 0  6b9552f94ac99c1f01adf64bbcf744cd  ,
 'ClassificationQualityByClass':    precision    recall        f1  support  class_value  dataset  \
 0   0.538508  0.646463  0.587568  11054.0            0  current   
 0   0.454647  0.347261  0.393764   9382.0            1  current   
 
                       metric_id                       metric_hash  
 0  ClassificationQualityByClass  2bf82bb79fc7a35640c4005d95b5b082  
 0  ClassificationQualityByClass  2bf82bb79fc7a35640c4005d95b5b082  ,
 'ClassificationQualityMetric':    current_accuracy  current_precision  current_recall  current_f1  \
 0          0.509102           0.454647        0.347261    0.393764   
 
   current_ro

## 5. Monitor Model Drift

In [19]:
##I need some new data for analyzing drift, here I just sample for demo purposes
driftdata = preds.sample(frac=0.1)

In [20]:
column_mapping = ColumnMapping()
column_mapping.prediction = 'READMITTED_PRED'
column_mapping.target = 'ACTUAL'
column_mapping.task = 'classification'


label_binary_classification_performance = TestSuite(tests=[
    BinaryClassificationTestPreset(),
])

label_binary_classification_performance.run(reference_data=preds, current_data=driftdata, column_mapping=column_mapping)
label_binary_classification_performance

In [21]:
label_binary_classification_performance.as_dict()

{'tests': [{'name': 'Drift per Column',
   'description': 'The drift score for the feature **ACTUAL** is 0.009. The drift detection method is Jensen-Shannon distance. The drift detection threshold is 0.1.',
   'status': 'SUCCESS',
   'group': 'data_drift',
   'parameters': {'stattest': 'Jensen-Shannon distance',
    'score': 0.009,
    'threshold': 0.1,
    'detected': False,
    'column_name': 'ACTUAL'}},
  {'name': 'Precision Score',
   'description': 'The Precision Score is 0.465. The test threshold is eq=0.455 ± 0.0909',
   'status': 'SUCCESS',
   'group': 'classification',
   'parameters': {'condition': {'eq': {'value': 0.4546469439017583,
      'relative': 0.2,
      'absolute': 1e-12}},
    'value': 0.4647058823529412}},
  {'name': 'Recall Score',
   'description': 'The Recall Score is 0.328. The test threshold is eq=0.347 ± 0.0695',
   'status': 'SUCCESS',
   'group': 'classification',
   'parameters': {'condition': {'eq': {'value': 0.3472607120017054,
      'relative': 0.2,
  