In [None]:
# Install evidently from GitHub only when it is not already importable.
try:
    import evidently
except ImportError:
    # Catch only a missing package: a bare `except:` would also hide
    # KeyboardInterrupt and genuine errors raised while importing evidently.
    # The %pip magic installs into the environment of the running kernel.
    get_ipython().run_line_magic('pip', 'install git+https://github.com/evidentlyai/evidently.git')

In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

In [None]:
# Download the 'adult' dataset (version 2) from OpenML.
# as_frame=True makes the DataFrame requirement explicit: with 'auto',
# data.frame is None when a dataset is stored in sparse format, and the
# failure would only surface later at df.head().
data = fetch_openml(name='adult', version=2, as_frame=True)
df = data.frame
df.head()

Let's add two features to illustrate that the statistical test is chosen based not only on a feature's type, but also on its number of unique values.

Also, we will keep in mind that these features are absolutely random, so we don't expect any drift here.

In [None]:
# Draw the synthetic columns from a dedicated, seeded generator so they are
# identical on every "Restart & Run All". Unseeded np.random.choice produced
# different values on each run, so results for these columns could vary.
rng = np.random.default_rng(0)
# Integer-coded features with 3 and 2 possible values respectively
# (values 0..2 and 0..1), one value per row of df.
df['num_feature_with_3_values'] = rng.choice(3, df.shape[0])
df['num_feature_with_2_values'] = rng.choice(2, df.shape[0])

In [None]:
# Columns passed to evidently as numerical features.
numerical_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'num_feature_with_3_values',
    'num_feature_with_2_values',
]

# Columns passed to evidently as categorical features.
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'class',
]

# Mapping handed to every report run in this notebook.
column_mapping = ColumnMapping(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

## small dataset

### no difference

We draw two small random samples from the same data, so we do not expect to see any drift here.

In [None]:
# Compare two 1000-row random samples drawn from the same frame
# (different seeds), using the default data drift preset.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df.sample(n=1000, random_state=0),
    current_data=df.sample(n=1000, random_state=10),
    column_mapping=column_mapping,
)
# Bare last expression so the notebook renders the report.
data_drift_report

When you're working with small datasets, it's more likely that two samples will have different distributions purely by chance. The result also shows that statistical tests can be quite sensitive.

### data shifted

We split the data into two samples by relationship status (`Husband`/`Wife` vs. everyone else), so we do expect to see some drift here.

In [None]:
# Boolean mask selecting rows whose relationship is 'Husband' or 'Wife'.
is_husband_or_wife = df['relationship'].isin(['Husband', 'Wife'])

# Reference = 1000 rows inside the mask, current = 1000 rows outside it.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df[is_husband_or_wife].sample(n=1000, random_state=0),
    current_data=df[~is_husband_or_wife].sample(n=1000, random_state=10),
    column_mapping=column_mapping,
)
# Bare last expression so the notebook renders the report.
data_drift_report

## big dataset

### no difference

We draw two large random samples (30,000 rows each) from the same data, so we do not expect to see any drift here.

In [None]:
# Compare two 30000-row random samples drawn from the same frame
# (different seeds), using the default data drift preset.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df.sample(n=30000, random_state=0),
    current_data=df.sample(n=30000, random_state=10),
    column_mapping=column_mapping,
)
# Bare last expression so the notebook renders the report.
data_drift_report

### data shifted

We split the data into two samples by relationship status (`Husband`/`Wife` vs. everyone else), so we do expect to see some drift here.

In [None]:
# Boolean mask selecting rows whose relationship is 'Husband' or 'Wife'.
is_husband_or_wife = df['relationship'].isin(['Husband', 'Wife'])

# Reference = 30000 rows inside the mask, current = 30000 rows outside it.
# NOTE(review): replace=True samples with replacement, presumably because
# each subset may hold fewer than 30000 rows -- confirm against the data.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df[is_husband_or_wife].sample(n=30000, random_state=0, replace=True),
    current_data=df[~is_husband_or_wife].sample(n=30000, random_state=10, replace=True),
    column_mapping=column_mapping,
)
# Bare last expression so the notebook renders the report.
data_drift_report