# Run Ad-Hoc Model Bias Analysis

## Run Bias Analysis In The Notebook using `smclarify`
https://github.com/aws/amazon-sagemaker-clarify

In [None]:
!pip install -q smclarify==0.1

In [None]:
from smclarify.bias.report import *
from smclarify.util.dataset import Datasets, german_lending_readable_values
from typing import Dict
from collections import defaultdict
import pandas as pd

In [None]:
# Load the review dataset from the local CSV (per the file name: a balanced sample of
# Amazon US reviews for gift cards, software and video games).
df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv')
# Last expression of the cell: shows (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# Pre-Training Bias Analysis

In [None]:
# Pre-training setup: analyse bias across product categories, treating a 5-star
# rating as the positive outcome.
facet_column = FacetColumn(name='product_category')
label_column = LabelColumn(
    name='star_rating',
    data=df['star_rating'],
    positive_label_values=[5],
)
# Grouping series passed to bias_report as `group_variable`.
group_variable = df['product_category']

In [None]:
# Compute the bias report for the pre-training stage on the loaded dataset.
pre_training_report = bias_report(
    df,
    facet_column=facet_column,
    label_column=label_column,
    stage_type=StageType.PRE_TRAINING,
    group_variable=group_variable,
)

In [None]:
# Display the value returned by bias_report for the pre-training stage.
pre_training_report

# Post-Training Bias Analysis

In [None]:
# Tiny hand-written sample for the post-training analysis: five reviews whose
# predicted rating equals the true rating in every row.
data = dict(
    star_rating=[1, 2, 3, 4, 5],
    review_body=['Worst ever', 'Expected more', 'Its ok', 'I like it', 'I love it'],
    product_category=['Gift Card', 'Gift Card', 'Gift Card', 'Toys', 'Toys'],
    star_rating_predicted=[1, 2, 3, 4, 5],
)

In [None]:
# Replace `df` with a frame built from the hand-written sample, with an explicit
# column order, and print it for inspection.
df = pd.DataFrame(
    data,
    columns=[
        'star_rating',
        'review_body',
        'product_category',
        'star_rating_predicted',
    ],
)
print(df)

# Convert data columns into `categorical` data type required for Clarify

In [None]:
# Cast the true label column to pandas' categorical dtype.
df['star_rating'] = df['star_rating'].astype('category')

In [None]:
# Cast the predicted label column to pandas' categorical dtype.
df['star_rating_predicted'] = df['star_rating_predicted'].astype('category')

In [None]:
# Cast the facet column to pandas' categorical dtype.
df['product_category'] = df['product_category'].astype('category')

# Configure Clarify

In [None]:
# Facet under analysis: the product category, with 'Gift Card' as the sensitive value.
facet_column = FacetColumn(
    name='product_category', 
    sensitive_values=['Gift Card']
)

In [None]:
# Observed label: the true star rating, with 5 as the positive label value.
label_column = LabelColumn(
    name='star_rating', 
    data=df['star_rating'], 
    positive_label_values=[5])

In [None]:
# pd.Categorical expects list-like input; passing a bare scalar is deprecated and
# rejected by newer pandas releases, so wrap the single value in a list.
value = pd.Categorical([5])
print(type(value))

In [None]:
# Predicted label: the model's star rating, using the same positive value (5) as the true label.
predicted_label_column = LabelColumn(
    name='star_rating_predicted', 
    data=df['star_rating_predicted'], 
    positive_label_values=[5])

In [None]:
# Grouping series passed to bias_report as `group_variable` (here: the product category).
group_variable = df['product_category']

In [None]:
# Metric codes requested for the post-training stage.
post_training_metrics = ['DPPL', 'DI', 'DCA', 'DCR', 'RD', 'DAR', 'DRR', 'AD', 'CDDPL', 'TE']

# Compute the bias report comparing true and predicted labels on the sample frame.
post_training_report = bias_report(
    df,
    facet_column=facet_column,
    label_column=label_column,
    stage_type=StageType.POST_TRAINING,
    predicted_label_column=predicted_label_column,
    metrics=post_training_metrics,
    group_variable=group_variable,
)

# Show Post-Training Bias Report

In [None]:
from pprint import pprint
# Pretty-print the post-training report so its nested structure is readable.
pprint(post_training_report)

# Appendix: Troubleshooting `smclarify` API calls

In [None]:
# The notebook only star-imports from smclarify.bias.report, which does not guarantee
# that the name `smclarify` itself is bound; import the module explicitly before
# reaching into its private helper.
import smclarify.bias.report

# Call the library's private helper directly with the same inputs used for the
# post-training report, to inspect the positive-prediction index it builds.
pos_predicted_index = smclarify.bias.report._positive_predicted_index(
    predicted_label_data=df['star_rating_predicted'],
    label_data=df['star_rating'],
    positive_label_values=[5],
)

In [None]:
# Inspect the index returned by the helper.
print(pos_predicted_index)

In [None]:
# Reproduce the "all values are False" check: invert element-wise first, then reduce.
# Without the parentheses, `.all()` binds tighter than `~`, so the expression would
# negate "all True" instead of testing "all False".
print((~pos_predicted_index).all())

In [None]:
import numpy as np

In [None]:
# Compare the total column count with the number of numeric/boolean columns and,
# when they differ, print both counts.
total_column_count = len(df.columns)
numeric_or_bool_column_count = len(df.select_dtypes([np.number, bool]).columns)
if total_column_count != numeric_or_bool_column_count:
    print(total_column_count)
    print(numeric_or_bool_column_count)

In [None]:
def _my_positive_predicted_index(
    predicted_label_data: pd.Series, label_data: pd.Series, positive_label_values: List[Any]
) -> pd.Series:
    """
    Build the boolean index marking rows whose predicted label is a positive label.

    Troubleshooting variant of the helper the notebook calls on
    ``smclarify.bias.report`` (presumably modelled on it -- confirm against the
    installed smclarify version). Only categorical label data is handled here.

    :param predicted_label_data: input data for predicted label column
    :param label_data: input data for the label column
    :param positive_label_values: list of positive label values
    :return: boolean series, True where the predicted label is a positive label
    :raises ValueError: if the two series are classified as different data types,
        if the predicted labels cannot be cast to the label dtype, or if no
        predicted value matches a positive label
    :raises NotImplementedError: if the label data is classified as continuous
    :raises RuntimeError: if the label data cannot be classified, or no positive
        label values were given
    """
    predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)
    label_datatype = common.series_datatype(label_data, positive_label_values)
    if predicted_label_datatype != label_datatype:
        raise ValueError("Predicted Label Column series datatype is not the same as Label Column series")
    try:
        predicted_label_data = predicted_label_data.astype(label_data.dtype)
    except ValueError as e:
        # Keep the original cast failure attached as the cause.
        raise ValueError(
            "Labels and predicted labels cannot have different types (%s, %s)."
            % (label_data.dtype, predicted_label_data.dtype)
        ) from e
    if predicted_label_datatype == common.DataType.CONTINUOUS:
        # The interval-based path is disabled in this copy. Fail explicitly instead of
        # falling through to the all-False check with no index computed.
        # NotImplementedError is a RuntimeError subclass, matching the other
        # "cannot handle this data" branch below.
        raise NotImplementedError(
            "Continuous predicted label data is not supported by this troubleshooting helper"
        )
    if predicted_label_datatype == common.DataType.CATEGORICAL and positive_label_values:
        # A star-import does not bind underscore-prefixed names, so fetch the
        # private helper explicitly.
        from smclarify.bias.report import _categorical_data_idx

        positive_predicted_index = _categorical_data_idx(predicted_label_data, positive_label_values)
    else:
        raise RuntimeError("Predicted Label_column data is invalid or can't be classified")
    # check if positive index boolean series has all False values
    if (~positive_predicted_index).all():
        raise ValueError(
            "No Label values are present in the predicted Label Column, "
            "Positive Predicted Index Series contains all False values"
        )
    return positive_predicted_index

In [None]:
# Inputs for probing how the predicted-label series is classified.
predicted_label_data=df['star_rating_predicted']
# NOTE(review): the positive label is the string '5' here, whereas every other cell
# passes the integer 5 -- confirm this is intentional for the troubleshooting experiment.
positive_label_values=['5']

In [None]:
# Print the data type that `common.series_datatype` assigns to the predicted-label
# series for these positive values.
# NOTE(review): `common` is presumably bound by the star-import from
# smclarify.bias.report -- confirm.
predicted_label_datatype = common.series_datatype(predicted_label_data, positive_label_values)
print(predicted_label_datatype)

In [None]:
# pos_predicted_index, inversed = _my_positive_predicted_index(
#     predicted_label_data=df['star_rating_predicted'],
#     label_data=df['star_rating'],
#     positive_label_values=[5]
# )