# Run Ad-Hoc Data Bias Analysis

## Run Bias Analysis In The Notebook using `smclarify`
https://github.com/aws/amazon-sagemaker-clarify


In [None]:
!pip install -q smclarify==0.1

In [None]:
!pip list | grep smclarify

In [None]:
from smclarify.bias import report
from typing import Dict
from collections import defaultdict
import pandas as pd
import seaborn as sns

# Read Dataset From S3

In [None]:
%store -r bias_data_s3_uri

In [None]:
print(bias_data_s3_uri)

In [None]:
%store -r balanced_bias_data_s3_uri

In [None]:
print(balanced_bias_data_s3_uri)

In [None]:
!aws s3 cp $bias_data_s3_uri ./data-clarify/

In [None]:
!aws s3 cp $balanced_bias_data_s3_uri ./data-clarify/

In [None]:
# Load the unbalanced reviews CSV from the local ./data-clarify/ folder
# (presumably the file downloaded from bias_data_s3_uri -- confirm the name matches).
df = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv')
# Trailing expression: shows (rows, columns) as the cell output.
df.shape

In [None]:
# Load the balanced reviews CSV from the local ./data-clarify/ folder
# (presumably the file downloaded from balanced_bias_data_s3_uri -- confirm the name matches).
df_balanced = pd.read_csv('./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv')
# Trailing expression: shows (rows, columns) as the cell output.
df_balanced.shape

# Visualize Data

In [None]:
# Plot the number of reviews per star rating in the unbalanced dataset.
# The column is named explicitly via x=/data= so the call does not rely on
# positional-argument handling, which has changed across seaborn releases.
sns.countplot(x='star_rating', data=df)

In [None]:
# Plot the number of reviews per star rating in the balanced dataset.
# The column is named explicitly via x=/data= so the call does not rely on
# positional-argument handling, which has changed across seaborn releases.
sns.countplot(x='star_rating', data=df_balanced)

# Show Number of Reviews per Category and Star Rating

In [None]:
df_balanced.shape

In [None]:
# Tally the reviews for each star rating into a 'Count' column
# (sort=False leaves the groups unsorted by key).
num_stars = (
    df_balanced
    .groupby(['star_rating'], sort=False)
    .size()
    .reset_index(name='Count')
)
print(num_stars)

In [None]:
# Tally the reviews for every (product category, star rating) pair into a
# 'Count' column (sort=False leaves the groups unsorted by key).
num = (
    df_balanced
    .groupby(['product_category', 'star_rating'], sort=False)
    .size()
    .reset_index(name='Count')
)
print(num)

# Calculate Pre-Training Bias Metrics

## Define 
* Facet Column (= Product Category), 
* Label Column (= Star Rating), 
* Positive Label Values (= 5 and 4)

In [None]:
# Inputs for the bias report, all taken from the balanced dataset:
# the product-category series is handed to the report as its group variable,
# the facet is the product category and the label is the star rating.
group_variable = df_balanced['product_category']

facet_column = report.FacetColumn(name='product_category')

label_column = report.LabelColumn(
    name='star_rating',
    data=df_balanced['star_rating'],
    positive_label_values=[5, 4],  # this doesn't matter for class-imbalance bias
)


## Run SageMaker Clarify Bias Report

In [None]:
# Run the pre-training bias report on the balanced dataset with the facet,
# label and group variable defined earlier; the returned report is the
# trailing expression of the cell and is therefore displayed as its output.
report.bias_report(
    df_balanced,
    facet_column,
    label_column,
    stage_type=report.StageType.PRE_TRAINING,
    group_variable=group_variable,
)

# EXTRA: Create A Completely Balanced Dataset Across Categories AND Star Ratings

In [None]:
# df_giftcard5 = df_balanced.query('star_rating==5 and product_category=="Gift Card"')
# df_giftcard5=df_giftcard5[:343]
# df_giftcard5.shape

In [None]:
# df_giftcard4 = df_balanced.query('star_rating==4 and product_category=="Gift Card"')
# df_giftcard4=df_giftcard4[:343]
# df_giftcard4.shape

In [None]:
# df_giftcard3 = df_balanced.query('star_rating==3 and product_category=="Gift Card"')
# df_giftcard3=df_giftcard3[:343]
# df_giftcard3.shape

In [None]:
# df_giftcard2 = df_balanced.query('star_rating==2 and product_category=="Gift Card"')
# df_giftcard2=df_giftcard2[:343]
# df_giftcard2.shape

In [None]:
# df_giftcard1 = df_balanced.query('star_rating==1 and product_category=="Gift Card"')
# df_giftcard1=df_giftcard1[:343]
# df_giftcard1.shape

In [None]:
# df_videogames5 = df_balanced.query('star_rating==5 and product_category=="Digital_Video_Games"')
# df_videogames5=df_videogames5[:343]
# df_videogames5.shape

In [None]:
# df_videogames4 = df_balanced.query('star_rating==4 and product_category=="Digital_Video_Games"')
# df_videogames4=df_videogames4[:343]
# df_videogames4.shape

In [None]:
# df_videogames3 = df_balanced.query('star_rating==3 and product_category=="Digital_Video_Games"')
# df_videogames3=df_videogames3[:343]
# df_videogames3.shape

In [None]:
# df_videogames2 = df_balanced.query('star_rating==2 and product_category=="Digital_Video_Games"')
# df_videogames2=df_videogames2[:343]
# df_videogames2.shape

In [None]:
# df_videogames1 = df_balanced.query('star_rating==1 and product_category=="Digital_Video_Games"')
# df_videogames1=df_videogames1[:343]
# df_videogames1.shape

In [None]:
# df_software5 = df_balanced.query('star_rating==5 and product_category=="Digital_Software"')
# df_software5=df_software5[:343]
# df_software5.shape

In [None]:
# df_software4 = df_balanced.query('star_rating==4 and product_category=="Digital_Software"')
# df_software4=df_software4[:343]
# df_software4.shape

In [None]:
# df_software3 = df_balanced.query('star_rating==3 and product_category=="Digital_Software"')
# df_software3=df_software3[:343]
# df_software3.shape

In [None]:
# df_software2 = df_balanced.query('star_rating==2 and product_category=="Digital_Software"')
# df_software2=df_software2[:343]
# df_software2.shape

In [None]:
# df_software1 = df_balanced.query('star_rating==1 and product_category=="Digital_Software"')
# df_software1=df_software1[:343]
# df_software1.shape

## Generate 4th Sample

In [None]:
# df_bla5 = df_balanced.query('star_rating==5 and product_category=="Digital_Software"')
# df_bla5['product_category'] = 'bla' # df_bla5['product_category'].str.replace('Digital_Software', 'bla')
# df_bla5=df_bla5[:343]
# df_bla5.shape

In [None]:
# df_bla4 = df_balanced.query('star_rating==4 and product_category=="Digital_Software"')
# df_bla4['product_category'] = 'bla' # df_bla4['product_category'].str.replace('Digital_Software', 'bla')
# df_bla4=df_bla4[:343]
# df_bla4.shape

In [None]:
# df_bla3 = df_balanced.query('star_rating==3 and product_category=="Digital_Software"')
# df_bla3['product_category'] = 'bla' # df_bla3['product_category'].str.replace('Digital_Software', 'bla')

# df_bla3=df_bla3[:343]
# df_bla3.shape

In [None]:
# df_bla2 = df_balanced.query('star_rating==2 and product_category=="Digital_Software"')
# df_bla2['product_category'] = 'bla' # df_bla2['product_category'].str.replace('Digital_Software', 'bla')

# df_bla2=df_bla2[:343]
# df_bla2.shape

In [None]:
# df_bla1 = df_balanced.query('star_rating==1 and product_category=="Digital_Software"')
# df_bla1['product_category'] = 'bla' # df_bla1['product_category'].str.replace('Digital_Software', 'bla')
# df_bla1=df_bla1[:343]
# df_bla1.shape

## Generate 5th Sample

In [None]:
# df_blah5 = df_balanced.query('star_rating==5 and product_category=="Digital_Software"')
# df_blah5['product_category'] = 'blah' # df_bla5['product_category'].str.replace('Digital_Software', 'bla')
# df_blah5=df_blah5[:343]
# df_blah5.shape

In [None]:
# df_blah4 = df_balanced.query('star_rating==4 and product_category=="Digital_Software"')
# df_blah4['product_category'] = 'blah' # df_bla4['product_category'].str.replace('Digital_Software', 'bla')
# df_blah4=df_blah4[:343]
# df_blah4.shape

In [None]:
# df_blah3 = df_balanced.query('star_rating==3 and product_category=="Digital_Software"')
# df_blah3['product_category'] = 'blah' # df_bla3['product_category'].str.replace('Digital_Software', 'bla')

# df_blah3=df_blah3[:343]
# df_blah3.shape

In [None]:
# df_blah2 = df_balanced.query('star_rating==2 and product_category=="Digital_Software"')
# df_blah2['product_category'] = 'blah' # df_bla2['product_category'].str.replace('Digital_Software', 'bla')

# df_blah2=df_blah2[:343]
# df_blah2.shape

In [None]:
# df_blah1 = df_balanced.query('star_rating==1 and product_category=="Digital_Software"')
# df_blah1['product_category'] = 'blah' # df_bla1['product_category'].str.replace('Digital_Software', 'bla')
# df_blah1=df_blah1[:343]
# df_blah1.shape

# Concatenate All Data Frames Into `Super` Balanced Data Frame

In [None]:
# df_super_balanced = pd.concat([df_giftcard5, 
# #                               df_giftcard4, 
#                                df_giftcard3, 
# #                               df_giftcard2, 
#                                df_giftcard1, 
#                                df_videogames5, 
# #                               df_videogames4, 
#                                df_videogames3, 
# #                               df_videogames2, 
#                                df_videogames1, 
#                               df_software5, 
# #                               df_software4, 
#                                df_software3, 
# #                               df_software2, 
#                               df_software1,
# #                               df_bla5,
# #                                df_bla4,
# #                                df_bla3,
# #                                df_bla2,                               
# #                               df_bla1,
# #                               df_blah5,
# #                                df_blah4,
# #                                df_blah3,
# #                                df_blah2,                               
# #                               df_blah1                               
#                               ], ignore_index=True, sort=False)
# df_super_balanced.shape