# How to use column mapping?

Import Libraries

Import Datasets

Column Mapping

1. Target, prediction, ID, datetime
2. Categorical and numerical features
3. Datetime features
4. Task parameter for target function
5. Prediction column(s) in classification
   * 5.1. Multi-class, option 1
   * 5.2. Multi-class, option 2
   * 5.3. Multi-class probabilistic classification
   * 5.4. Binary, option 1
   * 5.5. Binary, option 2
   * 5.6. Binary probabilistic classification, option 1
   * 5.7. Binary probabilistic classification, option 2
   * 5.8. Binary probabilistic classification, option 3
6. Text features
7. Embedding


# Import Libraries

In [11]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection
import requests
import zipfile
import io

from datetime import datetime, time

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [12]:
try:
    import evidently
except:
    !pip install git+https://github.com/evidentlyai/evidently.git

#you might need to install pillow library to use datasets.fetch_lfw_people() from sklearn
try:
    import PIL
except ImportError:
    !pip install pillow

In [151]:
from evidently import ColumnMapping
from evidently.report import Report

from evidently.metric_preset import DataDriftPreset,DataQualityPreset,TargetDriftPreset
from evidently.test_preset import RegressionTestPreset
from evidently.test_suite import TestSuite
from evidently.metrics import *
from evidently.tests import *


# Import Datasets

In [14]:
#Dataset for regression
housing_data = datasets.fetch_california_housing(as_frame='auto')
housing = housing_data.frame

housing['pred'] = housing['MedHouseVal'].values + np.random.normal(0, 3, housing.shape[0])

housing_ref = housing.sample(n=5000, replace=False)
housing_cur = housing.sample(n=5000, replace=False)

In [15]:
#Timeseries dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
with zipfile.ZipFile(io.BytesIO(requests.get(url).content)) as arc:
    raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')

raw_data.index = raw_data.index + pd.to_timedelta(raw_data['hr'], unit='h')
raw_data['mnth'] = raw_data.index.to_period('M').astype('datetime64[M]')
raw_data['hr'] = raw_data.index.floor('h')
raw_data['weekday'] = raw_data.index.floor('d')
raw_data['date'] = raw_data.index

bike_reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
bike_current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

In [35]:
#Dataset for Binary and Binary Probabilistic Classification
bcancer_data = datasets.load_breast_cancer(as_frame='auto')
bcancer = bcancer_data.frame

bcancer_ref = bcancer.sample(n=300, replace=False)
bcancer_cur = bcancer.sample(n=200, replace=False)

bcancer_label_ref = bcancer_ref.copy(deep=True)
bcancer_label_cur = bcancer_cur.copy(deep=True)

target_classes = ['Benign','Malignant']

model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
model.fit(bcancer_ref[bcancer_data.feature_names.tolist()], bcancer_ref.target)

#Reference and current data for Binary classification, option 1 and 2
bcancer_label_ref['prediction'] = model.predict(bcancer_label_ref[bcancer_data.feature_names.tolist()])
bcancer_label_cur['prediction'] = model.predict(bcancer_label_cur[bcancer_data.feature_names.tolist()])

#Reference and current data for Binary probabilistic classification, option 3
bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_data.feature_names.tolist()])[:, 1]
bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_data.feature_names.tolist()])[:, 1]

#Reference and current data for Binary probabilistic classification, option 1 and 2
bcancer_ref12 = bcancer_ref.copy()
bcancer_cur12 = bcancer_cur.copy()

bcancer_ref12 = bcancer_ref12.rename({'prediction': 'Malignant'}, axis=1)
bcancer_cur12 = bcancer_cur12.rename({'prediction': 'Malignant'}, axis=1)

bcancer_ref12['Benign'] = bcancer_ref12.Malignant.apply(lambda x: 1-x)
bcancer_cur12['Benign'] = bcancer_cur12.Malignant.apply(lambda x: 1-x)

bcancer_ref12['target'] = bcancer_ref12.target.apply(lambda x: target_classes[x])
bcancer_cur12['target'] = bcancer_cur12.target.apply(lambda x: target_classes[x])


In [123]:
#Dataset for Multiclass Classification
iris_data = datasets.load_iris(as_frame='auto')
iris = iris_data.frame

iris_ref = iris.sample(n=150, replace=False)
iris_cur = iris.sample(n=150, replace=False)

model = ensemble.RandomForestClassifier(random_state=1, n_estimators=3)
model.fit(iris_ref[iris_data.feature_names], iris_ref.target)

#Reference and current data for Multiclass classification, option 1
iris_ref['pred'] = model.predict(iris_ref[iris_data.feature_names])
iris_cur['pred'] = model.predict(iris_cur[iris_data.feature_names])

#Reference and current data for Multiclass classification, option 2
target_classes = ['Setosa', 'Versicolor', 'Virginica']

iris_ref2 = iris_ref.copy()
iris_cur2 = iris_cur.copy()

iris_ref2['pred'] = iris_ref.pred.apply(lambda x: iris_data.target_names[x].capitalize())
iris_ref2['target'] = iris_ref.target.apply(lambda x: iris_data.target_names[x].capitalize())
iris_cur2['pred'] = iris_cur.pred.apply(lambda x: iris_data.target_names[x].capitalize())
iris_cur2['target'] = iris_cur.target.apply(lambda x: iris_data.target_names[x].capitalize())

In [18]:
#Multiclass Probabilistic Classification
iris_ref_prob = model.predict_proba(iris_ref[iris_data.feature_names])
iris_cur_prob = model.predict_proba(iris_cur[iris_data.feature_names])

target_classes = ['Setosa', 'Versicolor', 'Virginica']

iris_ref_prob_df = pd.DataFrame(iris_ref_prob, columns=target_classes)
iris_cur_prob_df = pd.DataFrame(iris_cur_prob, columns=target_classes)

iris_ref_prob_df['pred'] = model.predict(iris_ref[iris_data.feature_names])
iris_ref_prob_df['target'] = iris_ref.target.apply(lambda x: iris_data.target_names[x].capitalize())
iris_cur_prob_df['pred'] = model.predict(iris_cur[iris_data.feature_names])
iris_cur_prob_df['target'] = iris_cur.target.apply(lambda x: iris_data.target_names[x].capitalize())


In [131]:
#Dataset for Text Features
reviews_data = datasets.fetch_openml(name='Womens-E-Commerce-Clothing-Reviews', version=2, as_frame='auto')
reviews = reviews_data.frame

reviews['prediction'] = reviews['Rating']
reviews_ref = reviews[reviews.Rating > 3].sample(n=5000, replace=True, ignore_index=True, random_state=42)
reviews_cur = reviews[reviews.Rating < 3].sample(n=5000, replace=True, ignore_index=True, random_state=42)

In [142]:
#Dataset for Embeddings
embeddings_data = datasets.fetch_lfw_people()
embeddings_data = pd.DataFrame(embeddings_data['data'])
embeddings_data.columns = ['col_' + str(x) for x in embeddings_data.columns]

embeddings_data = embeddings_data.iloc[:5100, :10]

embeddings_data_shifted = embeddings_data.copy()
embeddings_data_shifted.iloc[2500:5000, :5] = 0

# Column Mapping

## 1. Target, prediction, ID, datetime

In [67]:
column_mapping = ColumnMapping()

column_mapping.target = 'MedHouseVal'
column_mapping.prediction = 'pred'

column_mapping.id = None
column_mapping.datetime = None

In [69]:

regression_performance = TestSuite(tests=[
   RegressionTestPreset()
])
regression_performance.run(reference_data=housing_ref.sort_index(), current_data=housing_cur.sort_index(),column_mapping=column_mapping)
regression_performance

## 2. Categorical and numerical features

In [45]:
column_mapping.numerical_features = ['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup','Latitude','Longitude'] 
column_mapping.categorical_features = None 

In [12]:
data_quality_report = Report(metrics=[
    DataQualityPreset()
])

data_quality_report.run(reference_data=housing_ref.sort_index(), current_data=housing_cur.sort_index(),column_mapping=column_mapping)
data_quality_report

## 3. Datetime features

In [47]:
column_mapping = ColumnMapping()

column_mapping.datetime_features = ['weekday','hr','mnth']
column_mapping.datetime = 'date'
column_mapping.numerical_features = ['temp', 'atemp']
column_mapping.categorical_features = ['holiday']
column_mapping.target = 'cnt'

In [53]:
data_drift = Report(metrics = [DataDriftPreset()])
data_drift.run(current_data = bike_current,
               reference_data = bike_reference,
               column_mapping=column_mapping)

data_drift

In [51]:
data_quality_report = Report(metrics=[DataQualityPreset()])
data_quality_report.run(reference_data=bike_reference, 
                        current_data= bike_current,
                        column_mapping=column_mapping)

data_quality_report

## 4. Task parameter for target function

In [17]:

column_mapping = ColumnMapping()
column_mapping.target = 'target'
column_mapping.prediction = 'pred' 
column_mapping.numerical_features = ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']

column_mapping.task = 'classification'

In [18]:
multiclass_cat_target_drift_report = Report(metrics=[
    TargetDriftPreset(num_stattest='ks', cat_stattest='psi'),
])

multiclass_cat_target_drift_report.run(reference_data=iris_ref, current_data=iris_cur,column_mapping=column_mapping)
multiclass_cat_target_drift_report

## 5. Prediction column(s) in classification

## 5.1 Multi-class, option 1

In [19]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'pred'
column_mapping.target_names = ['Setosa', 'Versicolor', 'Virginica']
column_mapping.task = 'classification'

In [20]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass(),
])

classification_report.run(reference_data=iris_ref, current_data=iris_cur,column_mapping=column_mapping)
classification_report

## 5.2. Multi-class, option 2

In [125]:
iris_cur2[['target', 'pred']].sample(5)

Unnamed: 0,target,pred
112,Virginica,Virginica
83,Versicolor,Virginica
76,Versicolor,Versicolor
23,Setosa,Setosa
18,Setosa,Setosa


In [126]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'pred'
column_mapping.task = 'classification'

In [130]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass(),
])

classification_report.run(reference_data=iris_ref2, current_data=iris_cur2,column_mapping=column_mapping)
classification_report

## 5.3. Multi-class probabilistic classification

In [119]:
iris_cur_prob_df[['target', 'Setosa', 'Versicolor', 'Virginica']].sample(5)

Unnamed: 0,target,Setosa,Versicolor,Virginica
127,Virginica,0.0,1.0,0.0
116,Virginica,0.0,0.0,1.0
136,Virginica,0.0,0.666667,0.333333
17,Setosa,0.0,0.0,1.0
121,Virginica,0.0,1.0,0.0


In [23]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = ['Setosa', 'Versicolor', 'Virginica']

In [24]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass(),
])

classification_report.run(reference_data=iris_ref_prob_df, current_data=iris_cur_prob_df,column_mapping=column_mapping)
classification_report

## 5.4. Binary, option 1

In [84]:
bcancer_label_cur[['target','prediction']].sample(5)

Unnamed: 0,target,prediction
173,1,1
72,0,0
93,1,1
164,0,0
407,1,1


In [74]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'prediction'
column_mapping.target_names = ['Malignant', 'Benign']
pos_label = 'Malignant'

In [75]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

classification_report.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur,column_mapping=column_mapping)
classification_report

In [27]:
prob_classification_performance_dataset_tests = TestSuite(tests=[
    TestAccuracyScore(),
    TestPrecisionScore(),
    TestRecallScore(),
    TestF1Score()

])

prob_classification_performance_dataset_tests.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur,column_mapping=column_mapping)
prob_classification_performance_dataset_tests

## 5.5. Binary, option 2

In [102]:
bcancer_label_cur[['target','prediction']].sample(5)

Unnamed: 0,target,prediction
78,0,0
155,1,1
93,1,1
46,1,1
497,1,1


In [89]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'prediction'
column_mapping.pos_label = 1

In [90]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

classification_report.run(reference_data=bcancer_label_ref, current_data=bcancer_label_cur,column_mapping=column_mapping)
classification_report

## 5.6. Binary probabilistic classification, option 1

In [87]:
bcancer_cur12[['target','Malignant','Benign']].sample(5)

Unnamed: 0,target,Malignant,Benign
173,Malignant,1.0,0.0
43,Benign,0.3,0.7
158,Malignant,1.0,0.0
411,Malignant,1.0,0.0
407,Malignant,1.0,0.0


In [91]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = ['Benign','Malignant']
column_mapping.pos_label = 'Malignant'

In [96]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

classification_report.run(reference_data=bcancer_ref12, current_data=bcancer_cur12, column_mapping=column_mapping)
classification_report

## 5.7. Binary probabilistic classification, option 2

In [98]:
bcancer_cur12[['target','Malignant','Benign']].sample(5)

Unnamed: 0,target,Malignant,Benign
197,Benign,0.2,0.8
187,Malignant,1.0,0.0
502,Malignant,1.0,0.0
389,Benign,0.0,1.0
108,Benign,0.0,1.0


In [93]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'Benign'
column_mapping.pos_label = 'Malignant'

In [94]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

classification_report.run(reference_data=bcancer_ref12, current_data=bcancer_cur12, column_mapping=column_mapping)
classification_report

## 5.8. Binary probabilistic classification, option 3

In [101]:
bcancer_cur[['target','prediction']].sample(5)

Unnamed: 0,target,prediction
216,1,1.0
119,0,0.0
215,0,0.2
155,1,1.0
152,1,0.7


In [67]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'prediction'
pos_label = 1
column_mapping.target_names = ['Malignant', 'Benign']


In [68]:
classification_report = Report(metrics=[
    ClassificationConfusionMatrix(),
    ClassificationQualityByClass()
])

classification_report.run(reference_data=bcancer_ref, current_data=bcancer_cur,column_mapping=column_mapping)
classification_report

## 6. Text features

In [149]:
reviews_cur.sample(2)

Unnamed: 0,Unnamed:_0,Clothing_ID,Age,Title,Review_Text,Rating,Recommended_IND,Positive_Feedback_Count,Division_Name,Department_Name,Class_Name,prediction
1072,13586.0,868.0,37.0,Color not as pictured,I had high hopes for this top based on the pho...,1.0,0.0,0.0,General,Tops,Knits,1.0
1515,2986.0,1082.0,47.0,Average,The dress is too loose and runs too big,2.0,0.0,0.0,General,Dresses,Dresses,2.0


In [138]:
column_mapping = ColumnMapping()

column_mapping.target='Rating'
column_mapping.numerical_features=['Age', 'Positive_Feedback_Count']
column_mapping.categorical_features=['Division_Name', 'Department_Name', 'Class_Name']

column_mapping.text_features=['Review_Text', 'Title']

In [141]:
data_drift_report = Report(metrics=[
    DataDriftPreset(num_stattest='ks', cat_stattest='psi', num_stattest_threshold=0.2, cat_stattest_threshold=0.2),
])

data_drift_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_drift_report

## 7. Embeddings

In [145]:
embeddings_data.sample(5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
5003,0.0,0.0,0.0,0.0,0.0,0.003922,0.003922,0.003922,0.005229,0.002614
3408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00915,0.035294
1349,0.098039,0.11634,0.126797,0.12549,0.12549,0.129412,0.121569,0.129412,0.121569,0.121569
5064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
column_mapping = ColumnMapping()
column_mapping.embeddings= {'small_subset': embeddings_data.columns[:10]}

In [None]:
report = Report(metrics=[
    EmbeddingsDriftMetric('small_subset')
])

report.run(reference_data = embeddings_data[:2500], current_data = embeddings_data[2500:5000], 
           column_mapping = column_mapping)
report