# Data Drift: Code Practice

In [None]:
import pandas as pd
import numpy as np

from scipy.stats import mannwhitneyu
from sklearn import datasets

from evidently.calculations.stattests import StatTest
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DataDriftTable
from evidently.metric_preset import DataDriftPreset

In [None]:
import warnings

# Silence every warning for the rest of the session so that library notices
# do not clutter the cell outputs. One catch-all filter is sufficient:
# filterwarnings('ignore') with default arguments and simplefilter('ignore')
# register the same filter entry, so the second call added nothing.
warnings.filterwarnings('ignore')

## Prepare Datasets

In [None]:
# Adult data set from OpenML, split on education level: rows whose education
# is one of the three levels below form the "current" data, all remaining
# rows form the "reference" data.
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

current_education_levels = ['Some-college', 'HS-grad', 'Bachelors']
in_current = adult.education.isin(current_education_levels)

# Take explicit copies so the NaN injection below writes into an independent
# frame rather than into a filtered selection of `adult` (which would go
# through pandas' chained-assignment / SettingWithCopy handling).
adult_ref = adult[~in_current].copy()
adult_cur = adult[in_current].copy()

# Blank out the columns at positions 3 and 4 for the first 2000 current rows
# to introduce missing values into the current data.
adult_cur.iloc[:2000, 3:5] = np.nan

## Data Drift Options

**Notes**: 
You can specify the drift detection method for features and/or model output.

* stattest: Defines a custom drift detection method for all features
* num_stattest: Defines a custom drift detection method for numerical features only
* cat_stattest: Defines a custom drift detection method for categorical features only
* per_column_stattest: Defines a custom drift detection method per feature

**Available methods**:  
* 'ks' 
* 'z' 
* 'chisquare' 
* 'jensenshannon' 
* 'kl_div' 
* 'psi' 
* 'wasserstein'
* 'anderson'
* 'fisher_exact'
* 't_test'
* 'cramer_von_mises'
* 'g_test'
* 'emperical_mmd'
* 'TVD'

You can implement a custom drift test and use it in parameters. Just define a function that takes two pd.Series (reference and current data) and returns a number (e.g. p_value or distance)

**Usage**:
- Report(metrics=[ColumnDriftMetric(column_name='name', stattest=custom_stattest)])

## Setting the stattest for the whole dataset

In [None]:
# Dataset-level drift report built from a single DatasetDriftMetric that is
# created with no arguments.
dataset_drift_metric = DatasetDriftMetric()
data_drift_share_report = Report(metrics=[dataset_drift_metric])

# Compare the current split against the reference split and render the result.
data_drift_share_report.run(current_data=adult_cur, reference_data=adult_ref)
data_drift_share_report.show(mode='inline')

In [None]:
# Two drift checks on the same column, side by side: the first metric is
# created without a stattest argument, the second one explicitly asks for 'psi'.
education_num_metrics = [
    ColumnDriftMetric(column_name='education-num'),
    ColumnDriftMetric(column_name='education-num', stattest='psi'),
]
data_drift_column_report = Report(metrics=education_num_metrics)

data_drift_column_report.run(current_data=adult_cur, reference_data=adult_ref)
data_drift_column_report.show(mode='inline')

In [None]:
# Drift table where the single `stattest` argument is set to 'psi'.
psi_drift_table = DataDriftTable(stattest='psi')
data_drift_dataset_report = Report(metrics=[psi_drift_table])

data_drift_dataset_report.run(current_data=adult_cur, reference_data=adult_ref)
data_drift_dataset_report.show(mode='inline')

## Setting drift detection method for numerical and categorical features

In [None]:
# Drift table with separate methods per feature kind: 'psi' is passed as the
# numerical stattest and 'jensenshannon' as the categorical one.
data_drift_dataset_report = Report(metrics=[
    DataDriftTable(num_stattest='psi', cat_stattest='jensenshannon'),
])

data_drift_dataset_report.run(reference_data=adult_ref, current_data=adult_cur)
# Render with mode='inline' like every other report in this notebook, so the
# output does not depend on the library's default display mode.
data_drift_dataset_report.show(mode='inline')

## Setting drift detection method for individual features

In [None]:
# Stattest name to use for each group of columns. Groups are listed in the
# order in which their columns are inserted into the mapping.
columns_by_stattest = [
    ('wasserstein', ['age', 'education-num']),
    ('z', ['sex', 'class']),
    ('kl_div', ['workclass', 'education']),
    ('jensenshannon', ['relationship', 'race', 'native-country']),
    ('anderson', ['fnlwgt', 'hours-per-week']),
    ('cramer_von_mises', ['capital-gain', 'capital-loss']),
]

# Flatten the groups into a {column name: stattest name} dictionary.
per_column_stattest = {}
for stattest_name, column_names in columns_by_stattest:
    per_column_stattest.update(dict.fromkeys(column_names, stattest_name))

In [None]:
# Show the assembled {column name: stattest name} mapping as the cell output.
per_column_stattest

In [None]:
# Drift table that receives the per-column mapping, so each listed column is
# paired with the stattest name chosen for it.
per_column_drift_table = DataDriftTable(per_column_stattest=per_column_stattest)
data_drift_dataset_report = Report(metrics=[per_column_drift_table])

data_drift_dataset_report.run(current_data=adult_cur, reference_data=adult_ref)
data_drift_dataset_report.show(mode='inline')

## Custom drift detection method 

In [None]:
def _mann_whitney_u(reference_data: pd.Series, current_data: pd.Series, _feature_type: str, threshold: float):
    """Run scipy's Mann-Whitney U test on a reference and a current sample.

    Args:
        reference_data: values of the column in the reference data.
        current_data: values of the same column in the current data.
        _feature_type: not used by this function.
        threshold: p-value cut-off; a p-value strictly below it is reported
            as drift.

    Returns:
        A ``(p_value, drift_detected)`` pair.
    """
    reference_values = np.array(reference_data)
    current_values = np.array(current_data)
    test_result = mannwhitneyu(reference_values, current_values)
    p_value = test_result[1]
    drift_detected = p_value < threshold
    return p_value, drift_detected

# Wrap the function above in a StatTest object so it can be handed to a
# metric's stattest parameter; it is declared for "num" feature types only.
mann_whitney_stat_test = StatTest(
    func=_mann_whitney_u,
    name="mann-whitney-u",
    display_name="mann-whitney-u test",
    allowed_feature_types=["num"],
)

In [None]:
# Drift table that uses the custom Mann-Whitney StatTest as its numerical
# stattest.
custom_drift_table = DataDriftTable(num_stattest=mann_whitney_stat_test)
data_drift_report = Report(metrics=[custom_drift_table])

data_drift_report.run(current_data=adult_cur, reference_data=adult_ref)
data_drift_report.show(mode='inline')