# Run Model Explanation with SageMaker Clarify (Post-Training)

## Using SageMaker Processing Jobs

In [None]:
import boto3
import sagemaker
import pandas as pd
import numpy as np

# SageMaker session, default S3 bucket and execution role used throughout the notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session, and a low-level SageMaker client bound to it.
region = boto3.Session().region_name
sm = boto3.Session().client(
    service_name='sagemaker',
    region_name=region,
)

In [None]:
# Restore `training_job_name` from the IPython store (it is expected to have been
# stored by the notebooks in the previous TRAIN section).
%store -r training_job_name

In [None]:
# Verify that `training_job_name` exists in the kernel namespace. When it does
# not, print an error banner instead of raising so the cell itself never fails.
try:
    training_job_name
except NameError:
    print(
        '+++++++++++++++++++++++++++++++',
        '[ERROR] Please run the notebooks in the previous TRAIN section before you continue.',
        '+++++++++++++++++++++++++++++++',
        sep='\n',
    )
else:
    print('[OK]')

In [None]:
# Show the restored training job name.
print(training_job_name)

# Get Data

In [None]:
import pandas as pd

# Read the reviews file as JSON Lines (one record per line) and preview the first rows.
data = pd.read_json(
    'data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.jsonl',
    lines=True,
)
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

### Data inspection
Plotting histograms for the distribution of the different features is a good way to visualize the data. Let's plot a few of the features that can be considered _sensitive_.  
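Here we look at two columns of the review dataset: `star_rating`, the label column used later in the Clarify configuration, and `product_category`, which serves as the facet. The first plot shows the number of reviews per star rating and the second shows the number of reviews per product category, so we can check how evenly the ratings and categories are represented.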

In [None]:
# Bar chart of the number of reviews for each star rating, smallest count first.
# The title names the star rating explicitly so this figure is not confused with
# the per-category chart in the next cell.
data['star_rating'].value_counts().sort_values().plot(kind='bar', title='Count Reviews per Star Rating', rot=0)

In [None]:
# Bar chart of the number of reviews for each product category, smallest count first.
data['product_category'].value_counts().sort_values().plot(
    kind='bar',
    rot=0,
    title='Count Reviews per Category',
)

# Create Model

In [None]:
# Restore `training_job_name` from the IPython store again (it was also restored
# near the top of this notebook), so this section can be run on its own.
%store -r training_job_name

In [None]:
# Show the training job name the model will be created from.
print(training_job_name)

In [None]:
import sagemaker

# Look up the TensorFlow 2.3.1 / py37 inference container image URI for the
# current region and an ml.m5.4xlarge instance, then print it.
inference_image_uri = sagemaker.image_uris.retrieve(
    framework='tensorflow',
    version='2.3.1',
    py_version='py37',
    image_scope='inference',
    instance_type='ml.m5.4xlarge',
    region=region,
)
print(inference_image_uri)

In [None]:
# Create a SageMaker model from the training job, served with the inference
# image retrieved above; the call returns the model name used by Clarify later.
model_name = sess.create_model_from_job(
    image_uri=inference_image_uri,
    training_job_name=training_job_name,
)

In [None]:
# Show the value returned by create_model_from_job.
print(model_name)

# Explaining Predictions with Amazon SageMaker Clarify

There are expanding business needs and legislative regulations that require explanations of _why_ a model made the decision it did. SageMaker Clarify uses SHAP to explain the contribution that each input feature makes to the final decision.

In [None]:
from sagemaker import clarify

# Processor that runs Clarify jobs on a single ml.c5.2xlarge instance
# using the notebook's execution role and SageMaker session.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_type='ml.c5.2xlarge',
    instance_count=1,
    sagemaker_session=sess,
)

## Writing DataConfig and ModelConfig
A `DataConfig` object communicates some basic information about data I/O to Clarify. We specify where to find the input dataset, where to store the output, the target column (`label`), the header names, and the dataset type.

Similarly, the `ModelConfig` object communicates information about your trained model and `ModelPredictedLabelConfig` provides information on the format of your predictions.  

**Note**: To avoid additional traffic to your production models, SageMaker Clarify sets up and tears down a dedicated endpoint when processing. `ModelConfig` specifies the instance type and instance count used to host your model during Clarify's processing.

In [None]:
# Keep only the label, facet and text columns needed for the Clarify analysis.
post_train_dataset = data[
    ['star_rating', 'product_category', 'review_body']
]

In [None]:
# Dimensions of the three-column subset as (rows, columns).
post_train_dataset.shape

In [None]:
# Preview the first rows of the subset.
post_train_dataset.head()

# Select n samples across each category

## _Note: We need to have >1 product_categories (facet values) in our training data._

In [None]:
# Draw one review per product_category so that every facet value is represented.
# A fixed random_state keeps the sample -- and therefore the SHAP baseline built
# from its first row later in the notebook -- identical across Restart & Run All.
# The index is reset via a chained call (no in-place mutation) so re-running the
# cell always rebuilds `bias_data` from `post_train_dataset`.
# TODO:  CHANGE THIS BACK TO .sample(10)
# TODO:  REMOVE THE INDEX (0, 1, etc)
# TODO:  REMOVE THE DUPLICATE product_category IN THIS GROUPBY
# TODO:  WHY ARE WE USING groupby?
bias_data = (
    post_train_dataset
    .groupby('product_category', group_keys=False)
    .apply(lambda s: s.sample(1, random_state=42))
    .reset_index(drop=True)
)
bias_data.shape

In [None]:
# Display the sampled rows (one per product category).
bias_data

# Convert To Categorical dtype

## _Note: Otherwise Clarify converts (int) and (str) to continuous threshold values instead of categorical values._

In [None]:
# Cast product_category to the pandas categorical dtype (mutates bias_data in place).
bias_data['product_category'] = bias_data['product_category'].astype('category')

In [None]:
# Confirm the column's dtype after the cast.
bias_data['product_category'].dtype

In [None]:
# Cast star_rating to the pandas categorical dtype (mutates bias_data in place).
bias_data['star_rating'] = bias_data['star_rating'].astype('category')

In [None]:
# Confirm the column's dtype after the cast.
bias_data['star_rating'].dtype

In [None]:
# Display the sampled rows again now that both columns are categorical.
bias_data

# Create Explainability Data Set Without Label Column

In [None]:
# Copy of the sampled data without the star_rating label column.
explainability_data = bias_data.drop(columns=['star_rating'])

In [None]:
# Dimensions of the label-free frame as (rows, columns).
explainability_data.shape

In [None]:
# Preview the first rows of the label-free frame.
explainability_data.head()

# Convert to `jsonlines` Format and Upload To S3

In [None]:
# Write the sampled frame as JSON Lines (one record per line) for upload to S3.
# `bias_data` is written rather than the raw `data` frame: it is the per-category
# sample prepared above, and its columns (star_rating, product_category,
# review_body) are the ones declared as `headers`/`label` in the Clarify
# DataConfig further down. Writing `data` would upload every row and every column.
path = './data-clarify/post_train_data.jsonl'
bias_data.to_json(path, orient="records", lines=True)

In [None]:
# TODO:  Make this prefix more unique, if possible.  Perhaps use $bucket/$training_job_name/data-clarify/post_train_data.jsonlines
# Upload the local JSON Lines file to the bucket under a key prefix equal to the
# training job name, then display the resulting S3 URI.
post_train_dataset_s3_uri = sess.upload_data(
    path=path,
    bucket=bucket,
    key_prefix=training_job_name,
)
post_train_dataset_s3_uri

# Configure Clarify

In [None]:
from sagemaker import clarify

# Describe how Clarify should host and call the model: one ml.m5.4xlarge
# instance, JSON Lines in and out, with each record's features substituted
# into the `review_body` field of the request template.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    accept_type='application/jsonlines',
    content_type='application/jsonlines',
    content_template='{\"review_body\":$features}',
)

In [None]:
# SHAP settings: the baseline is the first row of the label-free sample,
# with a single sample and mean-absolute aggregation.
# NOTE(review): this baseline row holds both product_category and review_body,
# while the DataConfig below selects only `review_body` as features -- confirm
# that the baseline shape matches what Clarify expects.
shap_config = clarify.SHAPConfig(
    baseline=[explainability_data.iloc[0].values.tolist()],
    agg_method='mean_abs',
    num_samples=1,
)

In [None]:
# S3 prefix that will receive the Clarify explainability output.
explainability_output_path = 's3://{}/clarify-explainability'.format(bucket)

# Tell Clarify where the JSON Lines input lives, where to write results, and
# which fields hold the label and the features.
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=post_train_dataset_s3_uri,
    s3_output_path=explainability_output_path,
    dataset_type='application/jsonlines',
    headers=post_train_dataset.columns.to_list(),
    features='review_body',
    label='star_rating',
)

## _Note: `label` is set to the JSON key for the model prediction results_

In [None]:
#predictions_config = clarify.ModelPredictedLabelConfig(label='predicted_label')

## Run Clarify

In [None]:
from sagemaker import clarify

# Re-create the Clarify processor (same settings as earlier in the notebook):
# one ml.c5.2xlarge instance with the notebook's role and session.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_type='ml.c5.2xlarge',
    instance_count=1,
    sagemaker_session=sess,
)

In [None]:
# Launch the Clarify explainability job with the data, model and SHAP
# configurations defined above; scores are read from the `predicted_label`
# field of the model output.
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
    model_scores='predicted_label',
)

#### Viewing the Explainability Report
As with the bias report, you can view the explainability report in Studio under the Experiments tab.


<img src="img/explainability_detail.gif">

The Model Insights tab contains direct links to the report and model insights.

If you're not a Studio user yet, as with the Bias Report, you can access this report at the following S3 bucket.

# Download Report From S3

In [None]:
# List the objects under the explainability output prefix in S3.
!aws s3 ls $explainability_output_path/

In [None]:
# Copy report.html from the output prefix to the notebook's working directory.
!aws s3 cp $explainability_output_path/report.html ./explainability_report.html