# Run Post-Training Bias Analysis With `smclarify`

https://github.com/aws/amazon-sagemaker-clarify

In [1]:
!pip install -q smclarify==0.1

In [2]:
from smclarify.bias.report import *
from smclarify.util.dataset import Datasets, german_lending_readable_values
from typing import Dict
from collections import defaultdict
import pandas as pd

In [3]:
df = pd.read_csv('./amazon_reviews_us_giftcards_software_videogames_balanced.csv')
df.shape

(81040, 15)

In [4]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43630866,R3K6VPY2NE6262,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,works great,"i like the gift cards, if you need something q...",2013-10-27
1,US,52721479,R2VSBSVBPO5N94,B00CT77B7M,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,"Beautiful card, the person receiving this also...",2015-05-25
2,US,25341626,RC62YY8GHYO67,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,The Amazon Gift Card is the Perfect Gift somet...,2015-05-15
3,US,42898788,R35VSTQDN6CPDG,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,ilove,2014-10-29
4,US,2943280,R3RA8FDVJDSKB5,B00H5BMH44,81025991,Amazon eGift Card - Hoops and Yoyo Cake Face (...,Gift Card,5,0,0,N,Y,Five Stars,I love Hoops and Yoyo. It is fun.,2014-12-10


# Pre-Training Bias Analysis

In [5]:
facet_column = FacetColumn('product_category')
label_column = LabelColumn('star_rating', df['star_rating'], [5])
group_variable = df['product_category']

In [6]:
pre_training_report = bias_report(
    df, 
    facet_column, 
    label_column, 
    stage_type=StageType.PRE_TRAINING, 
    group_variable=group_variable
)

In [7]:
pre_training_report

[{'value_or_threshold': 'Gift Card',
  'metrics': [{'name': 'CDDL',
    'description': 'Conditional Demographic Disparity in Labels (CDDL)',
    'value': 0.0},
   {'name': 'CI',
    'description': 'Class Imbalance (CI)',
    'value': 0.5890671273445213},
   {'name': 'DPL',
    'description': 'Difference in Positive Proportions in Labels (DPL)',
    'value': -0.3679426717770344},
   {'name': 'JS',
    'description': 'Jensen-Shannon Divergence (JS)',
    'value': 0.11632161004661548},
   {'name': 'KL',
    'description': 'Kullback-Liebler Divergence (KL)',
    'value': 0.3061581684888518},
   {'name': 'KS',
    'description': 'Kolmogorov-Smirnov Distance (KS)',
    'value': 0.36794267177703444},
   {'name': 'LP', 'description': 'L-p Norm (LP)', 'value': 0.5203495166028743},
   {'name': 'TVD',
    'description': 'Total Variation Distance (TVD)',
    'value': 0.36794267177703444}]},
 {'value_or_threshold': 'Digital_Video_Games',
  'metrics': [{'name': 'CDDL',
    'description': 'Conditiona

# Post-Training Bias Analysis

In [61]:
data = {
    'star_rating':  [1, 2, 3, 4, 5],
    'review_body': ['Worst ever', 'Expected more', 'Its ok', 'I like it', 'I love it'],
    'product_category': ['Gift Card', 'Gift Card', 'Gift Card', 'Toys', 'Toys'],
    'star_rating_predicted': [1, 2, 3, 4, 5]
}

In [62]:
df = pd.DataFrame (data, columns = ['star_rating','review_body', 'product_category','star_rating_predicted'])
print (df)

   star_rating    review_body product_category  star_rating_predicted
0            1     Worst ever        Gift Card                      1
1            2  Expected more        Gift Card                      2
2            3         Its ok        Gift Card                      3
3            4      I like it             Toys                      4
4            5      I love it             Toys                      5


# Convert data columns into `categorical` data type required for Clarify

In [63]:
df['star_rating'] = df['star_rating'].astype('category')

In [64]:
df['star_rating_predicted'] = df['star_rating_predicted'].astype('category')

In [65]:
df['product_category'] = df['product_category'].astype('category')

# Configure Clarify

In [66]:
facet_column = FacetColumn(
    name='product_category', 
    sensitive_values=['Gift Card']
)

In [67]:
label_column = LabelColumn(
    name='star_rating', 
    data=df['star_rating'], 
    positive_label_values=[5])

In [82]:
value = pd.Categorical(5)
print(type(value))

<class 'pandas.core.arrays.categorical.Categorical'>


In [68]:
predicted_label_column = LabelColumn(
    name='star_rating_predicted', 
    data=df['star_rating_predicted'], 
    positive_label_values=[5])

In [69]:
group_variable = df['product_category']

In [79]:
post_training_report = bias_report(
    df, 
    facet_column=facet_column, 
    label_column=label_column, 
    stage_type=StageType.POST_TRAINING, 
    predicted_label_column=predicted_label_column,
    metrics=['DPPL', 'DI', 'DCA', 'DCR', 'RD', 'DAR', 'DRR', 'AD', 'CDDPL', 'TE'],
    group_variable=group_variable
)

# Show Post-Training Bias Report

In [72]:
from pprint import pprint
pprint(post_training_report)

[{'metrics': [{'description': 'Accuracy Difference (AD)',
               'name': 'AD',
               'value': 0.0},
              {'description': 'Conditional Demographic Disparity in Predicted '
                              'Labels (CDDPL)',
               'name': 'CDDPL',
               'value': 0.6},
              {'description': 'Difference in Acceptance Rates (DAR)',
               'name': 'DAR',
               'value': 1.0},
              {'description': 'Difference in Conditional Acceptance (DCA)',
               'name': 'DCA',
               'value': 1.0},
              {'description': 'Difference in Conditional Rejection (DCR)',
               'name': 'DCR',
               'value': 0.0},
              {'description': 'Disparate Impact (DI)',
               'name': 'DI',
               'value': 0.0},
              {'description': 'Difference in Positive Proportions in Predicted '
                              'Labels (DPPL)',
               'name': 'DPPL',
               'val

# Appendix: Troubleshooting `smclarify` API calls

In [73]:
pos_predicted_index = smclarify.bias.report._positive_predicted_index(
    predicted_label_data=df['star_rating_predicted'],
    label_data=df['star_rating'],
    positive_label_values=[5]
)

In [74]:
print(pos_predicted_index)

0    False
1    False
2    False
3    False
4     True
Name: star_rating_predicted, dtype: bool


In [75]:
print(~pos_predicted_index.all())

True


In [76]:
import numpy as np

In [77]:
if len(df.columns) != len(df.select_dtypes([np.number, bool]).columns):
    print(len(df.columns))
    print(len(df.select_dtypes([np.number, bool]).columns))

4
0


In [23]:
def _my_positive_predicted_index(
    predicted_label_data: pd.Series, label_data: pd.Series, positive_label_values: List[Any]
) -> pd.Series:
    """
    creates a list of bool series for positive predicted label index based on the input data type,
    list of positive label values or intervals
    :param predicted_label_data: input data for predicted label column
    :param label_datatype:  input data for the label column
    :param positive_label_values: list of positive label values
    :return: list of positive predicted label index series
    """
    predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)
    label_datatype = common.series_datatype(label_data, positive_label_values)
    if predicted_label_datatype != label_datatype:
        raise ValueError("Predicted Label Column series datatype is not the same as Label Column series")
    try:
        predicted_label_data = predicted_label_data.astype(label_data.dtype)
    except ValueError as e:
        raise ValueError(
            "Labels and predicted labels cannot have different types (%s, %s)."
            % (label_data.dtype, predicted_label_data.dtype)
        )
    if predicted_label_datatype == common.DataType.CONTINUOUS:
        pass
#        data_interval_indices = _interval_index(label_data.append(predicted_label_data), positive_label_values)
#        positive_predicted_index = _continuous_data_idx(predicted_label_data, data_interval_indices)
    elif predicted_label_datatype == common.DataType.CATEGORICAL and positive_label_values:
        positive_predicted_index = _categorical_data_idx(predicted_label_data, positive_label_values)
    else:
        raise RuntimeError("Predicted Label_column data is invalid or can't be classified")
    # check if positive index boolean series has all False values
    if (~positive_predicted_index).all():
        raise ValueError(
            "No Label values are present in the predicted Label Column,"
            "Positive Predicted Index Series contains all False values"
        )
    return positive_predicted_index

In [25]:
predicted_label_data=df['star_rating_predicted']
positive_label_values=['5']

In [27]:
predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)
print(predicted_label_datatype)

DataType.CATEGORICAL


In [None]:
pos_predicted_index, inversed = _my_positive_predicted_index(
    predicted_label_data=df['star_rating_predicted'],
    label_data=df['star_rating'],
    positive_label_values=[5]
)