# Using Evidently to Evaluate Data Drift for a Dataset

In [17]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [18]:
import json
import pandas as pd

from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection

This notebook shows how you can use Evidently to check the data for data drift.

Acknowledgments:

The dataset used in the example is from: https://www.kaggle.com/c/bike-sharing-demand/data?select=train.csv
Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg
More information about the dataset can be found in the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

In [19]:
# Read the bike-sharing CSV; the 'datetime' column is parsed as timestamps
# and becomes the index, so rows can later be selected by date range.
raw_data = pd.read_csv(
    'bike_demand_prediction_data.csv',
    header=0,
    sep=',',
    index_col='datetime',
    parse_dates=['datetime'],
)

In [20]:
# Preview the first five rows to check the columns and the datetime index
raw_data.head(n=5)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [21]:
# Column mapping for the Evidently Profile: the features to check for drift,
# split into numerical and categorical groups.
# NOTE(review): 'weather' is mapped as numerical although the data preview only
# shows integer values for it — confirm this is intended.
data_columns = {
    'numerical_features': ['weather', 'temp', 'atemp', 'humidity', 'windspeed'],
    'categorical_features': ['holiday', 'workingday'],
}

In [22]:
#evaluate data drift with Evidently Profile
def detect_dataset_drift(reference, production, column_mapping, confidence=0.95, threshold=0.5, get_ratio=False):
    """
    Detect dataset-level Data Drift between two datasets using an Evidently Profile.

    A feature counts as drifted when its p-value in the Data Drift profile is
    below ``1 - confidence``. The dataset counts as drifted when the share of
    drifted features is greater than or equal to ``threshold``.

    Parameters
    ----------
    reference : dataset passed to ``Profile.calculate`` as the reference data.
    production : dataset passed to ``Profile.calculate`` as the data to compare.
    column_mapping : dict
        Mapping whose 'numerical_features' and 'categorical_features' entries
        list the features to evaluate; a missing or empty entry is treated as
        an empty list.
    confidence : float, default 0.95
        Confidence level used for each individual feature.
    threshold : float, default 0.5
        Minimum share of drifted features for the dataset to count as drifted.
    get_ratio : bool, default False
        If True, return the share of drifted features instead of a boolean.

    Returns
    -------
    bool or float
        True/False for dataset drift, or the share of drifted features when
        ``get_ratio`` is True. If the mapping lists no features, there is
        nothing to test and the result is ``False`` (or ``0.0``).

    Raises
    ------
    KeyError
        If a mapped feature has no entry in the profile's drift metrics.
    """
    data_drift_profile = Profile(sections=[DataDriftProfileSection])
    data_drift_profile.calculate(reference, production, column_mapping=column_mapping)
    json_report = json.loads(data_drift_profile.json())
    metrics = json_report['data_drift']['data']['metrics']

    # `or []` covers both a missing key and an explicit None/empty value.
    features = (list(column_mapping.get('numerical_features') or [])
                + list(column_mapping.get('categorical_features') or []))
    p_values = [metrics[feature]['p_value'] for feature in features]

    if not p_values:
        # No features to evaluate: report "no drift" instead of dividing by zero.
        return 0.0 if get_ratio else False

    significance = 1. - confidence
    drift_share = sum(p_value < significance for p_value in p_values) / len(p_values)

    if get_ratio:
        return drift_share
    return drift_share >= threshold

In [23]:
# Drift check: February 2011 as reference vs. March 2011 as production data
detect_dataset_drift(
    reference=raw_data.loc['2011-02-01 00:00:00':'2011-02-28 23:00:00'],
    production=raw_data.loc['2011-03-01 00:00:00':'2011-03-31 23:00:00'],
    column_mapping=data_columns,
    threshold=0.8,
)

False

In [24]:
# Drift check: April 2011 as reference vs. May 2011 as production data
detect_dataset_drift(
    reference=raw_data.loc['2011-04-01 00:00:00':'2011-04-30 23:00:00'],
    production=raw_data.loc['2011-05-01 00:00:00':'2011-05-31 23:00:00'],
    column_mapping=data_columns,
    threshold=0.8,
)

True

In [25]:
# Drift check: June 2011 as reference vs. July 2011 as production data
detect_dataset_drift(
    reference=raw_data.loc['2011-06-01 00:00:00':'2011-06-30 23:00:00'],
    production=raw_data.loc['2011-07-01 00:00:00':'2011-07-31 23:00:00'],
    column_mapping=data_columns,
    threshold=0.8,
)

False