This notebook will only run correctly in an AWS environment.

In [None]:
!pip install --disable-pip-version-check -q sagemaker==2.35.0

In [None]:
import boto3
from   IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import clarify
import seaborn as sns

%config InlineBackend.figure_format='retina'

In [None]:
# AWS context shared by the rest of the notebook: region name, execution role,
# SageMaker session, and that session's default S3 bucket.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
# Copy the transformed reviews CSV from the dlai-practical-data-science bucket
# into the current working directory.
!aws s3 cp s3://dlai-practical-data-science/data/transformed/womens_clothing_ecommerce_reviews_transformed.csv ./

In [None]:
# Location of the CSV copied from S3 in the previous cell.
path = './womens_clothing_ecommerce_reviews_transformed.csv'

# Load the file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Bar chart of row counts per sentiment value, coloured by product category.
sns.countplot(x='sentiment', hue='product_category', data=df)
# Anchor the legend's upper-right corner at axes coordinates (1.3, 1.1);
# the trailing ';' suppresses the textual repr of the legend object.
plt.legend(bbox_to_anchor=(1.3, 1.1), loc='upper right');

In [None]:
# Upload the local CSV to the bucket under the 'bias/unbalanced' key prefix
# and show the value returned by the upload call.
data_s3_uri_unbalanced = sess.upload_data(
    path=path,
    bucket=bucket,
    key_prefix='bias/unbalanced',
)
data_s3_uri_unbalanced

In [None]:
# Render a clickable link to the S3 console for the current region.
s3_console_url = f'https://s3.console.aws.amazon.com/s3/home?region={region}#'
display(HTML(
    f'<b>Review <a target="top" href="{s3_console_url}">'
    'Amazon S3 bucket</a></b>'))

In [None]:
# S3 prefix where the bias report for the unbalanced dataset will be written.
# (The closing parenthesis was missing here, which made the cell a SyntaxError.)
bias_report_unbalanced_output_path = (
    f's3://{bucket}/bias/generated_bias_report/unbalanced')

# Describe the dataset to analyse. The input must be the S3 URI returned by
# `sess.upload_data(...)`, not the local file path: `s3_data_input_path` is
# documented as an S3 location.
data_config_unbalanced = clarify.DataConfig(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    s3_data_input_path=data_s3_uri_unbalanced, # Replace None
    s3_output_path=bias_report_unbalanced_output_path, # Replace None
    label='sentiment', # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    headers=df.columns.to_list(),
    dataset_type='text/csv')

In [None]:
# Bias settings: which column is the facet and which label value(s) are the
# outcome of interest.
bias_config_unbalanced = clarify.BiasConfig(
    facet_name='product_category',  # sensitive column (facet)
    label_values_or_threshold=[1],  # desired sentiment
)

In [None]:
# Clarify processor for the unbalanced dataset: a single ml.m5.large
# instance, using the notebook's role and SageMaker session.
clarify_processor_unbalanced = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sess,
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

In [None]:
# Launch the pre-training bias analysis for the unbalanced dataset with the
# data and bias configurations built above and the listed metric codes.
# wait=False / logs=False are passed here; a later cell calls
# `running_processor.wait(...)` on the job instead.
clarify_processor_unbalanced.run_pre_training_bias(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    data_config=data_config_unbalanced, # Replace None
    data_bias_config=bias_config_unbalanced, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    methods=['CI', 'DPL', 'KL', 'JS', 'LP', 'TVD', 'KS'],
    wait=False,
    logs=False)

In [None]:
# Keep the name of the processor's latest job; later cells use it to build
# console links and to re-attach to the job.
latest_unbalanced_job = clarify_processor_unbalanced.latest_job
run_unbalanced_bias_processing_job_name = latest_unbalanced_job.job_name
print(run_unbalanced_bias_processing_job_name)

In [None]:
# Render a clickable link to the processing job in the SageMaker console.
unbalanced_job_url = (
    f'https://console.aws.amazon.com/sagemaker/home?region={region}'
    f'#/processing-jobs/{run_unbalanced_bias_processing_job_name}')
display(HTML(
    f'<b>Review <a target="blank" href="{unbalanced_job_url}">'
    'processing job</a></b>'))

In [None]:
# Render a clickable link to the job's CloudWatch log streams, filtered by
# the job-name prefix.
unbalanced_logs_url = (
    f'https://console.aws.amazon.com/cloudwatch/home?region={region}'
    '#logStream:group=/aws/sagemaker/ProcessingJobs;'
    f'prefix={run_unbalanced_bias_processing_job_name};'
    'streamFilter=typeLogStreamPrefix')
display(HTML(
    f'<b>Review <a target="blank" href="{unbalanced_logs_url}">'
    'CloudWatch logs</a> after about 5 minutes</b>'))

In [None]:
# Obtain a ProcessingJob handle for the unbalanced run by looking it up by
# its job name.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sess,
    processing_job_name=run_unbalanced_bias_processing_job_name,
)

In [None]:
%%time
# Wait on the unbalanced bias job (logs=False is passed to wait()); %%time
# reports how long this cell took. The cell magic must stay on the first line.
running_processor.wait(logs=False)

In [None]:
# List the objects under the unbalanced bias-report output prefix in S3.
!aws s3 ls $bias_report_unbalanced_output_path/

In [None]:
# Recursively copy the unbalanced report output from S3 into
# ./generated_bias_report/unbalanced/.
!aws s3 cp --recursive $bias_report_unbalanced_output_path ./generated_bias_report/unbalanced/

In [None]:
# Render a clickable link to the locally downloaded unbalanced report.
unbalanced_report_link = (
    '<b>Review <a target="blank" '
    'href="./generated_bias_report/unbalanced/report.html">'
    'unbalanced bias report</a></b>')
display(HTML(unbalanced_report_link))

In [None]:
# Balance the dataset by down-sampling every (product_category, sentiment)
# group to the size of the smallest group.
df_grouped_by = df.groupby(['product_category', 'sentiment'])

# Size of the smallest group, computed once instead of on every apply() call.
min_group_size = df_grouped_by.size().min()

# A fixed random_state makes the sampled rows reproducible across runs.
df_balanced = df_grouped_by.apply(
    lambda x: x.sample(min_group_size, random_state=42)
    .reset_index(drop=True))
df_balanced

In [None]:
# Bar chart of row counts per sentiment value in the balanced frame,
# coloured by product category.
sns.countplot(x='sentiment', hue='product_category', data=df_balanced)
# Anchor the legend's upper-right corner at axes coordinates (1.3, 1.1);
# the trailing ';' suppresses the textual repr of the legend object.
plt.legend(bbox_to_anchor=(1.3, 1.1), loc='upper right');

In [None]:
# Write the balanced frame to a local CSV (with header row, without index).
path_balanced = './womens_clothing_ecommerce_reviews_balanced.csv'
df_balanced.to_csv(path_balanced, header=True, index=False)

# Upload that file under the 'bias/balanced' key prefix and show the value
# returned by the upload call.
data_s3_uri_balanced = sess.upload_data(
    path=path_balanced,
    bucket=bucket,
    key_prefix='bias/balanced',
)
data_s3_uri_balanced

In [None]:
# Render a clickable link to the S3 console for the current region.
s3_console_url = f'https://s3.console.aws.amazon.com/s3/home?region={region}#'
display(HTML(
    f'<b>Review <a target="top" href="{s3_console_url}">'
    'Amazon S3 bucket</a></b>'))

In [None]:
# S3 prefix where the bias report for the balanced dataset will be written.
bias_report_balanced_output_path = (
    f's3://{bucket}/bias/generated_bias_report/balanced')

# Describe the dataset to analyse. The input must be the S3 URI returned by
# `sess.upload_data(...)`, not the local file path: `s3_data_input_path` is
# documented as an S3 location.
data_config_balanced = clarify.DataConfig(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    s3_data_input_path=data_s3_uri_balanced, # Replace None
    s3_output_path=bias_report_balanced_output_path, # Replace None
    label='sentiment', # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    headers=df_balanced.columns.to_list(),
    dataset_type='text/csv')

In [None]:
# Bias settings: which column is the facet and which label value(s) are the
# outcome of interest.
bias_config_balanced = clarify.BiasConfig(
    facet_name='product_category',  # sensitive column (facet)
    label_values_or_threshold=[1],  # desired sentiment
)

In [None]:
# Clarify processor for the balanced dataset: a single ml.m5.large instance,
# using the notebook's role and SageMaker session.
clarify_processor_balanced = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sess,
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

In [None]:
# Launch the pre-training bias analysis for the balanced dataset with the
# data and bias configurations built above and the listed metric codes.
# wait=False / logs=False are passed here; a later cell calls
# `running_processor.wait(...)` on the job instead.
clarify_processor_balanced.run_pre_training_bias(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    data_config=data_config_balanced, # Replace None
    data_bias_config=bias_config_balanced, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    methods=['CI', 'DPL', 'KL', 'JS', 'LP', 'TVD', 'KS'],
    wait=False,
    logs=False)

In [None]:
# Keep the name of the processor's latest job; later cells use it to build
# console links and to re-attach to the job.
latest_balanced_job = clarify_processor_balanced.latest_job
run_balanced_bias_processing_job_name = latest_balanced_job.job_name
print(run_balanced_bias_processing_job_name)

In [None]:
# Link to the balanced processing job in the SageMaker console.
balanced_job_url = (
    f'https://console.aws.amazon.com/sagemaker/home?region={region}'
    f'#/processing-jobs/{run_balanced_bias_processing_job_name}')
display(HTML(
    f'<b>Review <a target="blank" href="{balanced_job_url}">'
    'processing job</a></b>'))

# Link to the job's CloudWatch log streams, filtered by the job-name prefix.
balanced_logs_url = (
    f'https://console.aws.amazon.com/cloudwatch/home?region={region}'
    '#logStream:group=/aws/sagemaker/ProcessingJobs;'
    f'prefix={run_balanced_bias_processing_job_name};'
    'streamFilter=typeLogStreamPrefix')
display(HTML(
    f'<b>Review <a target="blank" href="{balanced_logs_url}">'
    'CloudWatch logs</a> after about 5 minutes</b>'))

In [None]:
# Obtain a ProcessingJob handle for the balanced run by looking it up by its
# job name.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sess,
    processing_job_name=run_balanced_bias_processing_job_name,
)

In [None]:
%%time
# Wait on the balanced bias job (logs=False is passed to wait()); %%time
# reports how long this cell took. The cell magic must stay on the first line.
running_processor.wait(logs=False)

In [None]:
# List the objects under the balanced bias-report output prefix in S3.
!aws s3 ls $bias_report_balanced_output_path/

In [None]:
# Recursively copy the balanced report output from S3 into
# ./generated_bias_report/balanced/.
!aws s3 cp --recursive $bias_report_balanced_output_path ./generated_bias_report/balanced/

In [None]:
# Render a clickable link to the locally downloaded balanced report.
balanced_report_link = (
    '<b>Review <a target="blank" '
    'href="./generated_bias_report/balanced/report.html">'
    'balanced bias report</a></b>')
display(HTML(balanced_report_link))

In [None]:
# Copy the local C1_W2_Assignment.ipynb file to the session's default bucket
# under the name C1_W2_Assignment_Learner.ipynb.
!aws s3 cp ./C1_W2_Assignment.ipynb s3://$bucket/C1_W2_Assignment_Learner.ipynb