# How to use column mapping?

## Sections:

1)  Primary Mapping Options
    
    1.1) Dataset structure

    1.2) Categorical and numerical features

2)  Additional Mapping Options

    2.1) Datetime and Datetime features

    2.2) Task parameter for target function
    
    2.3) Prediction column(s) in classification

### Import Libraries

In [207]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection
import requests
import zipfile
import io

from datetime import datetime, time

In [208]:
try:
    import evidently
except:
    !pip install git+https://github.com/evidentlyai/evidently.git

In [209]:
from evidently import ColumnMapping
from evidently.report import Report

from evidently.metric_preset import DataDriftPreset,DataQualityPreset,RegressionPreset,TargetDriftPreset
from evidently.test_suite import TestSuite
from evidently.metrics import *
from evidently.tests import *


# Install blanket "ignore" filters: warnings of every category are suppressed for the
# rest of the kernel session (presumably to keep the cell output uncluttered).
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

### Import Datasets

In [210]:
# Dataset for regression
# `as_frame` is a boolean flag; pass True explicitly rather than the string 'auto'.
housing_data = datasets.fetch_california_housing(as_frame=True)
housing = housing_data.frame

# Simulated model output: the true target plus Gaussian noise (mean 0, std 3).
# A seeded generator keeps the noise identical across Restart & Run All.
housing_noise_rng = np.random.default_rng(0)
housing['pred'] = housing['MedHouseVal'].values + housing_noise_rng.normal(0, 3, housing.shape[0])

# Two independent 5000-row samples. The seeds differ on purpose: reusing one seed
# would make the reference and current samples identical.
housing_ref = housing.sample(n=5000, replace=False, random_state=0)
housing_cur = housing.sample(n=5000, replace=False, random_state=1)

In [211]:
# Timeseries dataset: hourly records from the Bike Sharing archive
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"

# Fail fast on a hung connection or an HTTP error instead of handing a non-zip
# error page to ZipFile (which would surface as a confusing BadZipFile).
response = requests.get(url, timeout=60)
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as arc:
    # Open the archive member in its own context so the handle is closed after parsing.
    with arc.open("hour.csv") as hour_csv:
        raw_data = pd.read_csv(hour_csv, header=0, sep=',', parse_dates=['dteday'], index_col='dteday')

# Shift each date by its hour value so the index becomes a full timestamp.
raw_data.index = raw_data.index + pd.to_timedelta(raw_data['hr'], unit='h')

# Overwrite 'mnth', 'hr' and 'weekday' with datetime values (start of month / hour / day)
# so they can be mapped as datetime features later in the notebook.
# to_timestamp() yields the first instant of each monthly period and avoids casting to
# the month-resolution dtype 'datetime64[M]', which is not a supported pandas datetime unit.
raw_data['mnth'] = raw_data.index.to_period('M').to_timestamp()
raw_data['hr'] = raw_data.index.floor('h')
raw_data['weekday'] = raw_data.index.floor('d')
raw_data['date'] = raw_data.index

# Label-based slices on the timestamp index (both bounds inclusive).
reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

In [212]:
# Dataset for binary and binary probabilistic classification
# `as_frame` is a boolean flag; pass True explicitly rather than the string 'auto'.
bcancer_data = datasets.load_breast_cancer(as_frame=True)
bcancer = bcancer_data.frame
bcancer_features = bcancer_data.feature_names.tolist()

# Seeded samples make the notebook reproducible; the seeds differ so that the
# reference and current rows are not drawn from the same shuffle.
bcancer_ref = bcancer.sample(n=300, replace=False, random_state=0)
bcancer_cur = bcancer.sample(n=200, replace=False, random_state=1)

# Copies for the label-prediction variant, taken before any 'prediction' column is added.
bcancer_label_ref = bcancer_ref.copy(deep=True)
bcancer_label_cur = bcancer_cur.copy(deep=True)

model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
model.fit(bcancer_ref[bcancer_features], bcancer_ref.target)

# Probabilistic variant: keep the second column of predict_proba as the score.
bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_features])[:, 1]
bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_features])[:, 1]

# Label variant: hard class predictions.
bcancer_label_ref['prediction'] = model.predict(bcancer_label_ref[bcancer_features])
bcancer_label_cur['prediction'] = model.predict(bcancer_label_cur[bcancer_features])

In [213]:
# Dataset for multiclass classification
# `as_frame` is a boolean flag; pass True explicitly rather than the string 'auto'.
iris_data = datasets.load_iris(as_frame=True)
iris = iris_data.frame

# Seeded samples without replacement; distinct seeds give the reference and current
# frames different row orders while staying reproducible.
iris_ref = iris.sample(n=150, replace=False, random_state=0)
iris_cur = iris.sample(n=150, replace=False, random_state=1)

# NOTE: this rebinds `model`, replacing the breast-cancer classifier trained earlier.
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=3)
model.fit(iris_ref[iris_data.feature_names], iris_ref.target)

# Encoded class predictions for both frames.
iris_ref['pred'] = model.predict(iris_ref[iris_data.feature_names])
iris_cur['pred'] = model.predict(iris_cur[iris_data.feature_names])

In [214]:
# Multiclass probabilistic classification
iris_ref_prob = model.predict_proba(iris_ref[iris_data.feature_names])
iris_cur_prob = model.predict_proba(iris_cur[iris_data.feature_names])

# Column names for the per-class probabilities, in the order of the encoded classes.
target_classes = ['Setosa', 'Versicolor', 'Virginica']

# Build the probability frames on the SAME index as the sampled source frames.
# Without `index=`, the new frame gets a fresh 0..n-1 index while `iris_ref.target`
# keeps the shuffled sample index; assigning that Series aligns on labels, which
# attaches each true label to the wrong row of probabilities.
iris_ref_prob_df = pd.DataFrame(iris_ref_prob, columns=target_classes, index=iris_ref.index)
iris_cur_prob_df = pd.DataFrame(iris_cur_prob, columns=target_classes, index=iris_cur.index)

# Add the predicted classes (positional NumPy arrays) and the readable true labels
# (index-aligned Series) for comparison.
iris_ref_prob_df['pred'] = model.predict(iris_ref[iris_data.feature_names])
iris_ref_prob_df['target'] = iris_ref.target.apply(lambda x: iris_data.target_names[x].capitalize())
iris_cur_prob_df['pred'] = model.predict(iris_cur[iris_data.feature_names])
iris_cur_prob_df['target'] = iris_cur.target.apply(lambda x: iris_data.target_names[x].capitalize())


## Primary Mapping Options

#### Dataset structure
You can create a ColumnMapping object to map your column names and feature types. Let's see an example of data drift report for a regression problem.

In [215]:
# Mapping for the regression example, declared in a single constructor call.
column_mapping = ColumnMapping(
    target='MedHouseVal',  # name of the column with the target function
    prediction='pred',     # name of the column(s) with model predictions
    id=None,               # there is no ID column in the dataset
    datetime=None,         # there is no 'date' column in the dataset
)

In [216]:
# Regression performance report; both samples are passed in index order.
regression_performance_report = Report(metrics=[RegressionPreset()])
regression_performance_report.run(
    reference_data=housing_ref.sort_index(),
    current_data=housing_cur.sort_index(),
    column_mapping=column_mapping,
)

regression_performance_report

#### Categorical and numerical features
To split the features into numerical and categorical types:

In [217]:
# There are no categorical features in the dataset; the numerical features are listed explicitly.
column_mapping.categorical_features = None
column_mapping.numerical_features = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]

In [218]:
# Data quality report for the housing samples, using the feature-type mapping.
data_quality_report = Report(metrics=[DataQualityPreset()])
data_quality_report.run(
    reference_data=housing_ref.sort_index(),
    current_data=housing_cur.sort_index(),
    column_mapping=column_mapping,
)

data_quality_report


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



## Additional Mapping Options

* Datetime and Datetime features
* Task parameter for target function
* Prediction column(s) in classification

### Datetime and Datetime features

*Default*: Evidently treats columns with DateTime format (np.datetime64) as DateTime features.

*Note*: do not confuse DateTime features with the DateTime column, which is used as the x-axis in some plots. You will typically use the DateTime column as a prediction timestamp.

*Why map them*: if you specify DateTime features, they will be ignored in data drift calculation. Evidently will also calculate appropriate statistics and generate different visualizations for DateTime features in the data quality report.

In [219]:
# Mapping for the bike-sharing data: 'date' is the DateTime column, while
# 'weekday', 'hr' and 'mnth' are declared as DateTime features.
column_mapping = ColumnMapping(
    target='cnt',
    datetime='date',
    datetime_features=['weekday', 'hr', 'mnth'],
    numerical_features=['temp', 'atemp'],
    categorical_features=['holiday'],
)

In [220]:
# Data drift report comparing the current period against the reference period.
data_drift = Report(metrics=[DataDriftPreset()])
data_drift.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)

data_drift

In [221]:
# Data quality report for the same reference/current split and mapping.
data_quality_report = Report(
    metrics=[
        DataQualityPreset(),
    ]
)
data_quality_report.run(reference_data=reference, current_data=current, column_mapping=column_mapping)

data_quality_report


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



### Task parameter for target function

In many cases, it is important to differentiate between continuous and discrete targets. This applies to multiple reports and tests, including Data Quality and Target Drift.

*Default*: If you don't specify the task, Evidently uses a simple strategy: if the target has a numeric type and more than 5 unique values, the task is 'regression'. In all other cases, the task is 'classification'.

This was the case in the notebook's first example of regression.

*Why map it*: If you have a multi-class problem where classes are encoded as numbers, it might look the same way as a regression problem. Thus it is best to explicitly specify it. It will affect how the target (prediction) is visualized and help pick the correct statistical tests for the target (prediction) drift detection. It will also affect the calculation of statistics and tests that differ for numerical and categorical data types.

In [222]:
# Display the column labels of the reference frame; these are the names used in the mapping below.
iris_ref.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target', 'pred'],
      dtype='object')

In [223]:

# Mapping for the iris data with the task stated explicitly.
column_mapping = ColumnMapping(
    target='target',
    prediction='pred',
    numerical_features=[
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
    ],
    # task accepts two values: 'regression' and 'classification'
    task='classification',
)

Now that the task is specified, the statistical tests configured in `TargetDriftPreset` for numerical and categorical columns will each be applied to the matching type of column.

Specifically, the numerical features ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)'] will be tested with the K-S (Kolmogorov–Smirnov) test, and the categorical columns ['target','pred'] will be tested with the PSI (Population Stability Index) test.

In [224]:
# Target drift report: 'ks' is requested for numerical columns and 'psi' for categorical ones.
multiclass_cat_target_drift_report = Report(
    metrics=[TargetDriftPreset(num_stattest='ks', cat_stattest='psi')]
)
multiclass_cat_target_drift_report.run(
    reference_data=iris_ref,
    current_data=iris_cur,
    column_mapping=column_mapping,
)

multiclass_cat_target_drift_report

### Prediction column(s) in classification

To evaluate classification performance, you need both the true labels and the predictions. Depending on the classification type (e.g., binary, multi-class, probabilistic), you have different options for passing the predictions.

Multiclass classification

Option 1

Target: encoded labels

Preds: encoded labels + Optional[target_names].

In [225]:
# Multiclass, option 1: encoded target and predictions plus optional display names.
column_mapping = ColumnMapping(
    target='target',
    prediction='pred',
    target_names=['Setosa', 'Versicolor', 'Virginica'],
    task='classification',
)

In [226]:
# Confusion matrix and per-class quality for the iris label predictions.
classification_report = Report(
    metrics=[ClassificationConfusionMatrix(), ClassificationQualityByClass()]
)
classification_report.run(
    reference_data=iris_ref,
    current_data=iris_cur,
    column_mapping=column_mapping,
)

classification_report

Multiclass classification

Option 2

Target: labels

Preds: labels

In [227]:
# Multiclass, option 2: target and prediction columns only, no display names.
column_mapping = ColumnMapping(
    target='target',
    prediction='pred',
    task='classification',
)

In [228]:
# Same two metrics as before, now evaluated with the mapping that has no target names.
classification_report = Report(
    metrics=[
        ClassificationConfusionMatrix(),
        ClassificationQualityByClass(),
    ]
)
classification_report.run(reference_data=iris_ref, current_data=iris_cur, column_mapping=column_mapping)

classification_report

Multiclass probabilistic classification

Target: labels

Preds: columns named after labels.

In [229]:
# Multiclass probabilistic: the prediction is a list of columns named after the labels.
column_mapping = ColumnMapping(
    target='target',
    prediction=['Setosa', 'Versicolor', 'Virginica'],
)

In [230]:
# Classification metrics computed from the per-class probability frames.
classification_report = Report(
    metrics=[ClassificationConfusionMatrix(), ClassificationQualityByClass()]
)
classification_report.run(
    reference_data=iris_ref_prob_df,
    current_data=iris_cur_prob_df,
    column_mapping=column_mapping,
)

classification_report

Binary classification

Option 1

Target: encoded labels

Preds: encoded labels + pos_label + Optional[target_names]

In [231]:
# Binary, option 1: encoded target and predictions plus display names.
column_mapping = ColumnMapping(
    target='target',
    prediction='prediction',
    target_names=['Malignant', 'Benign'],
)
# NOTE(review): this binds a standalone variable; no visible cell reads it and it is not
# assigned to column_mapping.pos_label. Confirm whether the positive label should be set
# on the mapping, and with which value, given that the data holds encoded labels.
pos_label = 'Malignant'

In [232]:
# Confusion matrix and per-class quality for the breast-cancer label predictions.
classification_report = Report(
    metrics=[ClassificationConfusionMatrix(), ClassificationQualityByClass()]
)
classification_report.run(
    reference_data=bcancer_label_ref,
    current_data=bcancer_label_cur,
    column_mapping=column_mapping,
)

classification_report

In [233]:
# Test suite with accuracy, precision, recall and F1 checks on the label predictions.
prob_classification_performance_dataset_tests = TestSuite(
    tests=[TestAccuracyScore(), TestPrecisionScore(), TestRecallScore(), TestF1Score()]
)
prob_classification_performance_dataset_tests.run(
    reference_data=bcancer_label_ref,
    current_data=bcancer_label_cur,
    column_mapping=column_mapping,
)

prob_classification_performance_dataset_tests

Binary classification

Option 2

Target: labels

Preds: labels + pos_label

In [234]:
# Binary, option 2: target and prediction columns only.
column_mapping = ColumnMapping(
    target='target',
    prediction='prediction',
)
# NOTE(review): this binds a standalone variable; no visible cell reads it and it is not
# assigned to column_mapping.pos_label. It is also the string '1'. Confirm the intended
# value and type against the labels actually stored in the data.
pos_label = '1'

In [235]:
# Same metrics on the label predictions, now with the mapping that has no target names.
classification_report = Report(
    metrics=[
        ClassificationConfusionMatrix(),
        ClassificationQualityByClass(),
    ]
)
classification_report.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur, column_mapping=column_mapping)

classification_report

In [236]:
# Mapping with display names for the two classes.
# NOTE(review): no visible report is run with this mapping before `column_mapping`
# is rebound in a following cell. Confirm whether this cell is still needed.
column_mapping = ColumnMapping(
    target='target',
    prediction='prediction',
    target_names=['Malignant', 'Benign'],
)
# NOTE(review): standalone variable, not assigned to column_mapping.pos_label and not
# read by any visible cell. Confirm the intended value.
pos_label = 'Malignant'

Binary probabilistic classification

Option 3

Target: encoded labels

Preds: one column with any name + pos_label

In [237]:
# Binary probabilistic classification: one prediction column plus the positive label.
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'prediction'
# Set the positive label on the mapping itself; a bare `pos_label = 1` only creates
# an unused variable and never reaches the report.
column_mapping.pos_label = 1
column_mapping.target_names = ['Malignant', 'Benign']


In [238]:
# Confusion matrix and per-class quality for the probabilistic binary example.
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

# Use the frames whose 'prediction' column holds predict_proba scores (bcancer_ref /
# bcancer_cur). The bcancer_label_* frames hold hard class labels and belong to the
# non-probabilistic examples.
classification_report.run(reference_data=bcancer_ref, current_data=bcancer_cur, column_mapping=column_mapping)
classification_report