# Run Ad-Hoc Model Bias Analysis

## Run Bias Analysis In The Notebook using `smclarify`
https://github.com/aws/amazon-sagemaker-clarify

In [1]:
!pip install -q smclarify==0.1

In [2]:
from smclarify.bias.report import *
from smclarify.util.dataset import Datasets, german_lending_readable_values
from typing import Dict
from collections import defaultdict
from pprint import pprint
import pandas as pd

In [3]:
# Load the reviews CSV used for the pre-training analysis
# (the path is relative to the notebook's working directory).
df_pre_training = pd.read_csv(
    './data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv'
)

# Bare last expression: the cell displays the (rows, columns) tuple.
df_pre_training.shape

(81040, 15)

In [4]:
# Preview the first rows to sanity-check the loaded columns before running the analysis.
df_pre_training.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43630866,R3K6VPY2NE6262,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,works great,"i like the gift cards, if you need something q...",2013-10-27
1,US,52721479,R2VSBSVBPO5N94,B00CT77B7M,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,"Beautiful card, the person receiving this also...",2015-05-25
2,US,25341626,RC62YY8GHYO67,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,The Amazon Gift Card is the Perfect Gift somet...,2015-05-15
3,US,42898788,R35VSTQDN6CPDG,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card,5,0,0,N,Y,Five Stars,ilove,2014-10-29
4,US,2943280,R3RA8FDVJDSKB5,B00H5BMH44,81025991,Amazon eGift Card - Hoops and Yoyo Cake Face (...,Gift Card,5,0,0,N,Y,Five Stars,I love Hoops and Yoyo. It is fun.,2014-12-10


# Pre-Training Bias Analysis

In [5]:
# Facet under analysis: the product category of each review.
facet_column = FacetColumn(name='product_category')

# Ground-truth label: ratings of 5 or 4 stars are treated as the positive outcome.
label_column = LabelColumn(
    name='star_rating',
    data=df_pre_training['star_rating'],
    positive_label_values=[5, 4],
)

# NOTE(review): the grouping variable is the same column as the facet -- confirm this is intended.
group_variable = df_pre_training['product_category']

In [6]:
# Pre-training stage: no predicted-label column is supplied and `metrics`
# is left at the library default.
pre_training_report = bias_report(
    df_pre_training,
    facet_column=facet_column,
    label_column=label_column,
    stage_type=StageType.PRE_TRAINING,
    group_variable=group_variable,
)

In [7]:
# Display the report: a list with one entry per facet value, each carrying its list of metric results.
pre_training_report

[{'value_or_threshold': 'Gift Card',
  'metrics': [{'name': 'CDDL',
    'description': 'Conditional Demographic Disparity in Labels (CDDL)',
    'value': 0.0},
   {'name': 'CI',
    'description': 'Class Imbalance (CI)',
    'value': 0.5890671273445213},
   {'name': 'DPL',
    'description': 'Difference in Positive Proportions in Labels (DPL)',
    'value': -0.3670205096923802},
   {'name': 'JS',
    'description': 'Jensen-Shannon Divergence (JS)',
    'value': 0.09279888452625618},
   {'name': 'KL',
    'description': 'Kullback-Liebler Divergence (KL)',
    'value': 0.2839525028450173},
   {'name': 'KS',
    'description': 'Kolmogorov-Smirnov Distance (KS)',
    'value': 0.3670205096923802},
   {'name': 'LP', 'description': 'L-p Norm (LP)', 'value': 0.51904538247605},
   {'name': 'TVD',
    'description': 'Total Variation Distance (TVD)',
    'value': 0.3670205096923802}]},
 {'value_or_threshold': 'Digital_Video_Games',
  'metrics': [{'name': 'CDDL',
    'description': 'Conditional De

# Post-Training Bias Analysis

## _TODO: Implement Batch Prediction_

In [None]:
# Hand-written sample rows (presumably a stand-in until batch prediction is
# implemented): the predicted rating equals the true rating in every row.
data = {
    'star_rating': [1, 2, 3, 4, 5],
    'review_body': [
        'Worst ever',
        'Expected more',
        'Its ok',
        'I like it',
        'I love it',
    ],
    'product_category': [
        'Gift Card',
        'Gift Card',
        'Gift Card',
        'Digital_Software',
        'Digital_Software',
    ],
    'star_rating_predicted': [1, 2, 3, 4, 5],
}

In [None]:
# Build the evaluation frame from the sample rows, with an explicit column order.
df = pd.DataFrame(
    data,
    columns=['star_rating', 'review_body', 'product_category', 'star_rating_predicted'],
)

# Bare last expression so Jupyter renders the frame as a rich table instead of plain printed text.
df

# Convert data columns to the pandas `category` dtype required by Clarify

In [None]:
# Cast the ground-truth label column to the pandas `category` dtype (overwrites the column on `df`).
df['star_rating'] = df['star_rating'].astype('category')

In [None]:
# Cast the predicted label column to the pandas `category` dtype (overwrites the column on `df`).
df['star_rating_predicted'] = df['star_rating_predicted'].astype('category')

In [None]:
# Cast the facet column to the pandas `category` dtype (overwrites the column on `df`).
df['product_category'] = df['product_category'].astype('category')

# Configure Clarify

In [13]:
# Facet to audit: product category, with 'Gift Card' flagged as the sensitive value.
facet_column = FacetColumn(name='product_category', sensitive_values=['Gift Card'])

In [14]:
# Observed label: ratings of 5 or 4 stars are treated as the positive outcome.
label_column = LabelColumn(
    name='star_rating',
    data=df['star_rating'],
    positive_label_values=[5, 4],
)

In [15]:
# Predicted label: uses the same positive values (5 and 4) as the observed label.
predicted_label_column = LabelColumn(
    name='star_rating_predicted',
    data=df['star_rating_predicted'],
    positive_label_values=[5, 4],
)

In [16]:
# Series passed to bias_report as `group_variable`.
# NOTE(review): this is the same column as the facet ('product_category') -- confirm this is intended.
group_variable = df['product_category']

In [17]:
# Post-training stage: compares observed and predicted labels, restricted to
# an explicit list of metric names.
post_training_report = bias_report(
    df,
    facet_column=facet_column,
    label_column=label_column,
    stage_type=StageType.POST_TRAINING,
    predicted_label_column=predicted_label_column,
    metrics=[
        'DPPL', 'DI', 'DCA', 'DCR', 'RD',
        'DAR', 'DRR', 'AD', 'CDDPL', 'TE',
    ],
    group_variable=group_variable,
)

# Show Post-Training Bias Report

In [18]:
# Pretty-print the nested report (a list of dicts) so each metric entry is readable.
pprint(post_training_report)

[{'metrics': [{'description': 'Accuracy Difference (AD)',
               'name': 'AD',
               'value': 0.0},
              {'description': 'Conditional Demographic Disparity in Predicted '
                              'Labels (CDDPL)',
               'name': 'CDDPL',
               'value': 0.6},
              {'description': 'Difference in Acceptance Rates (DAR)',
               'name': 'DAR',
               'value': 1.0},
              {'description': 'Difference in Conditional Acceptance (DCA)',
               'name': 'DCA',
               'value': 1.0},
              {'description': 'Difference in Conditional Rejection (DCR)',
               'name': 'DCR',
               'value': 1.0},
              {'description': 'Disparate Impact (DI)',
               'name': 'DI',
               'value': 0.0},
              {'description': 'Difference in Positive Proportions in Predicted '
                              'Labels (DPPL)',
               'name': 'DPPL',
               'val

# Release Resources

In [19]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<!-- Hidden button carrying the "kernelmenu:shutdown" command; the script below clicks it programmatically. -->
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
try {
    // Declared with const so the lookup result does not leak into the page's global scope.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Best-effort: any failure (for example, no matching button found) is deliberately ignored.
}
</script>

In [20]:
%%javascript

// Save a checkpoint, then delete the notebook's session through the `Jupyter` global
// (presumably the classic Notebook front-end API -- confirm for other front ends).
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // Deliberately ignored (best-effort cleanup), e.g. when the `Jupyter` global is not defined.
}

<IPython.core.display.Javascript object>