# Evidently Data Integrity Metrics

## Prepare Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml

# Seed a local random generator so the synthetic `prediction` column -- and
# therefore every report built from it -- is identical on each
# Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# load "adult" dataset
data = fetch_openml(name='adult', version=2, as_frame='auto')
df = data.frame

# prepare target and prediction:
# the target is the `education-num` column itself, and the prediction is the
# same column plus Gaussian noise (mean 0, standard deviation 6)
df['target'] = df['education-num']
df['prediction'] = df['education-num'].values + rng.normal(0, 6, df.shape[0])

# make reference and current datasets:
# build the education mask once so the two splits are exact complements
# (current = the three listed education levels, reference = everything else)
is_current = df.education.isin(['Some-college', 'HS-grad', 'Bachelors'])
reference_data = df[~is_current]
current_data = df[is_current]
current_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class,target,prediction
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,9.0,7.059571
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K,10.0,16.675821
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K,10.0,7.909593
6,29.0,,227026.0,HS-grad,9.0,Never-married,,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K,9.0,7.688815
8,24.0,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K,10.0,-1.385219


## Missing values metrics

Look at missing values in a column

In [2]:
from evidently.report import Report
from evidently.metrics import ColumnMissingValuesMetric

# Column-level missing-values metric for "workclass", configured with a custom
# value list.
# NOTE(review): presumably `values` lists extra values to count as missing and
# `replace=False` keeps the library defaults alongside them -- confirm against
# the documentation of the installed Evidently version.
workclass_custom_metric = ColumnMissingValuesMetric(
    column_name="workclass",
    values=["Private"],
    replace=False,
)

# Run on the current split, passing the reference split as well.
report = Report(metrics=[workclass_custom_metric])
report.run(reference_data=reference_data, current_data=current_data)
report

In [3]:
# Same column-level metric for "workclass", this time with its default
# settings and no reference dataset (only the current split is passed).
workclass_default_metric = ColumnMissingValuesMetric(column_name="workclass")

report = Report(metrics=[workclass_default_metric])
report.run(reference_data=None, current_data=current_data)
report

Look at missing values in the overall dataset

In [5]:
from evidently.report import Report
from evidently.metrics import DatasetMissingValuesMetric

# Dataset-level missing-values metric with default settings, run on the
# current split only (no reference dataset is passed).
dataset_missing_metric = DatasetMissingValuesMetric()

report = Report(metrics=[dataset_missing_metric])
report.run(reference_data=None, current_data=current_data)
report