# Bias Evaluation for TF JavaScript Model

Based on the [FAT* Tutorial Measuring Unintended Bias in Text Classification Models with Real Data](https://github.com/conversationai/unintended-ml-bias-analysis/blob/master/presentations/FAT_star_tutorial.md).

Copyright 2019 Google LLC.
SPDX-License-Identifier: Apache-2.0

In [0]:
!pip3 install --quiet "tensorflow>=1.11"
!pip3 install --quiet sentencepiece

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import sentencepiece
from google.colab import auth
from IPython.display import HTML, display

from sklearn import metrics

%matplotlib inline

# autoreload makes it easier to interactively work on code in imported libraries
%load_ext autoreload
%autoreload 2

# Set pandas display options so we can read more of the comment text.
pd.set_option('max_colwidth', 300)

# Seed for Pandas sampling, to get consistent sampling results
RANDOM_STATE = 123456789

In [0]:
auth.authenticate_user()

In [0]:
# Download and unzip files used in this colab
!curl -O -J -L https://storage.googleapis.com/civil_comments/fat_star_tutorial/fat-star.zip
!unzip -o fat-star.zip

In [0]:
!mkdir -p tfjs_model
!gsutil -m cp -R gs://conversationai-public/public_models/tfjs/v1/* tfjs_model

In [0]:
test_df_float = pd.read_csv('public_test.csv')
print('test data has %d rows' % len(test_df_float))


We will need to convert toxicity and identity columns to booleans, in order to work with our neural net and metrics calculations.  For this tutorial, we will consider any value >= 0.5 as True (i.e. a comment should be considered toxic if 50% or more crowd raters labeled it as toxic).  Note that this code also converts missing identity fields to False.

In [0]:
# List all identities
identity_columns = [
    'male', 'female', 'transgender', 'other_gender', 'heterosexual',
    'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian',
    'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black',
    'white', 'asian', 'latino', 'other_race_or_ethnicity',
    'physical_disability', 'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness', 'other_disability']

def convert_to_bool(df, col_name):
  """Binarizes one column of `df` in place using a 0.5 threshold.

  Values >= 0.5 become True and everything else becomes False. Missing
  values (NaN) also become False, because `NaN >= 0.5` evaluates to False.

  Args:
    df: DataFrame to modify in place.
    col_name: Name of the column to convert.
  """
  meets_threshold = df[col_name] >= 0.5
  df[col_name] = np.where(meets_threshold, True, False)

def convert_dataframe_to_bool(df):
  """Returns a copy of `df` with the toxicity and identity columns binarized.

  The input frame is left untouched; the 'toxicity' column and every column
  listed in `identity_columns` are converted on the copy via
  `convert_to_bool`.

  Args:
    df: DataFrame containing 'toxicity' and all of `identity_columns`.

  Returns:
    A new DataFrame with those columns converted to booleans.
  """
  converted = df.copy()
  columns_to_convert = ['toxicity'] + identity_columns
  for column in columns_to_convert:
    convert_to_bool(converted, column)
  return converted

test_df = convert_dataframe_to_bool(test_df_float)

## Score test set with our text classification model

Using our new model, we can score the set of test comments for toxicity.


In [0]:
TOXICITY_COLUMN = 'toxicity'
TEXT_COLUMN = 'comment_text'

In [0]:
predict_fn = tf.contrib.predictor.from_saved_model(
  'tfjs_model', signature_def_key='predict')

In [0]:
sp = sentencepiece.SentencePieceProcessor()
sp.Load('tfjs_model/assets/universal_encoder_8k_spm.model')

In [0]:
def progress(value, max=100):
    """Builds an HTML `<progress>` element for display in the notebook.

    Args:
      value: Current progress value; also used as the fallback text inside
        the element.
      max: Value at which the bar is full. The name shadows the builtin but
        is kept so existing keyword callers keep working.

    Returns:
      An `HTML` display object wrapping the progress element.
    """
    # HTML attributes are separated by whitespace only; a comma after the
    # `max` attribute would make the markup malformed.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [0]:
# Score every test comment with the model, one comment per prediction call.
tox_scores = []
nrows = test_df.shape[0]
out = display(progress(0, nrows), display_id=True)
# Iterate over the column values directly rather than looking rows up by
# integer label, so scoring does not depend on the frame having a default
# 0..n-1 index.
for num_scored, text in enumerate(test_df[TEXT_COLUMN], start=1):
  values = sp.EncodeAsIds(text)
  # The model input is a single-row sparse tensor: one (0, i) index per
  # token id, with dense shape [1, number of tokens].
  prediction = predict_fn({
      'values': values,
      'indices': [(0, i) for i in range(len(values))],
      'dense_shape': [1, len(values)]})
  tox_scores.append(prediction['toxicity/probabilities'][0, 1])
  # Report progress after the comment is scored so the bar reaches
  # nrows/nrows once the last comment is done.
  out.update(progress(num_scored, nrows))

In [0]:
MODEL_NAME = 'fat_star_tutorial'
test_df[MODEL_NAME] = tox_scores

# Evaluate the overall ROC-AUC

This calculates the model's performance on the entire test set using the ROC-AUC metric.

In [0]:
def calculate_overall_auc(df, model_name):
    """Returns the ROC-AUC of one model's scores over every row of `df`.

    Args:
      df: DataFrame holding the true labels in `TOXICITY_COLUMN` and the
        model's scores in the `model_name` column.
      model_name: Name of the column containing the model's scores.

    Returns:
      The value of `metrics.roc_auc_score` for labels versus scores.
    """
    return metrics.roc_auc_score(df[TOXICITY_COLUMN], df[model_name])

calculate_overall_auc(test_df, MODEL_NAME)

In [0]:
# Keep only the identity columns that have >= 100 True records.  This drops
# groups such as "other_disability" which do not have enough records to
# calculate meaningful metrics.
identities_with_over_100_records = [
    identity for identity in identity_columns
    if len(test_df.query(identity + '==True')) >= 100
]

SUBGROUP_AUC = 'subgroup_auc'
BACKGROUND_POSITIVE_SUBGROUP_NEGATIVE_AUC = 'background_positive_subgroup_negative_auc'
BACKGROUND_NEGATIVE_SUBGROUP_POSITIVE_AUC = 'background_negative_subgroup_positive_auc'

def compute_auc(y_true, y_pred):
  """Returns the ROC-AUC of `y_pred` against `y_true`, or NaN if undefined.

  Args:
    y_true: True binary labels.
    y_pred: Model scores for the same examples.

  Returns:
    The ROC-AUC, or `np.nan` when `metrics.roc_auc_score` raises a
    ValueError (presumably when `y_true` does not contain both classes).
  """
  try:
    auc = metrics.roc_auc_score(y_true, y_pred)
  except ValueError:
    auc = np.nan
  return auc


def compute_subgroup_auc(df, subgroup, label, model_name):
  """Computes the AUC restricted to the rows that belong to `subgroup`.

  Args:
    df: DataFrame of examples; `df[subgroup]` is used as a boolean row mask.
    subgroup: Name of the identity column selecting the subgroup.
    label: Name of the true-label column.
    model_name: Name of the column holding the model's scores.

  Returns:
    The result of `compute_auc` on the subgroup's labels and scores.
  """
  in_subgroup = df[subgroup]
  subgroup_rows = df[in_subgroup]
  return compute_auc(subgroup_rows[label], subgroup_rows[model_name])


def compute_background_positive_subgroup_negative_auc(df, subgroup, label, model_name):
  """Computes the AUC of the within-subgroup negative examples and the background positive examples.

  Args:
    df: DataFrame of examples. `df[subgroup]` and `df[label]` are combined
      with `&` and `~` as row masks, so both are expected to be boolean.
    subgroup: Name of the identity column selecting the subgroup.
    label: Name of the true-label column.
    model_name: Name of the column holding the model's scores.

  Returns:
    The result of `compute_auc` over the union of the two example sets.
  """
  subgroup_negative_examples = df[df[subgroup] & ~df[label]]
  non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
  # pd.concat replaces DataFrame.append, which is deprecated and no longer
  # available in current pandas releases.
  examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
  return compute_auc(examples[label], examples[model_name])


def compute_background_negative_subgroup_positive_auc(df, subgroup, label, model_name):
  """Computes the AUC of the within-subgroup positive examples and the background negative examples.

  Args:
    df: DataFrame of examples. `df[subgroup]` and `df[label]` are combined
      with `&` and `~` as row masks, so both are expected to be boolean.
    subgroup: Name of the identity column selecting the subgroup.
    label: Name of the true-label column.
    model_name: Name of the column holding the model's scores.

  Returns:
    The result of `compute_auc` over the union of the two example sets.
  """
  subgroup_positive_examples = df[df[subgroup] & df[label]]
  non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
  # pd.concat replaces DataFrame.append, which is deprecated and no longer
  # available in current pandas releases.
  examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
  return compute_auc(examples[label], examples[model_name])


def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
  """Computes per-subgroup metrics for all subgroups and one model.

  Args:
    dataset: DataFrame of examples; each subgroup column is used as a
      boolean row mask.
    subgroups: Iterable of identity column names to evaluate.
    model: Name of the column holding the model's scores.
    label_col: Name of the true-label column.
    include_asegs: Unused by this function; kept so existing callers that
      pass it keep working.

  Returns:
    A DataFrame with one row per subgroup and the columns 'subgroup',
    'subgroup_size', SUBGROUP_AUC, BACKGROUND_POSITIVE_SUBGROUP_NEGATIVE_AUC
    and BACKGROUND_NEGATIVE_SUBGROUP_POSITIVE_AUC, sorted by ascending
    subgroup AUC. When `subgroups` is empty the frame is empty but still has
    those columns.
  """
  records = []
  for subgroup in subgroups:
    records.append({
        'subgroup': subgroup,
        'subgroup_size': len(dataset[dataset[subgroup]]),
        SUBGROUP_AUC: compute_subgroup_auc(
            dataset, subgroup, label_col, model),
        BACKGROUND_POSITIVE_SUBGROUP_NEGATIVE_AUC:
            compute_background_positive_subgroup_negative_auc(
                dataset, subgroup, label_col, model),
        BACKGROUND_NEGATIVE_SUBGROUP_POSITIVE_AUC:
            compute_background_negative_subgroup_positive_auc(
                dataset, subgroup, label_col, model),
    })
  # Naming the columns explicitly keeps the sort below valid even when there
  # are no records to infer the columns from.
  columns = [
      'subgroup',
      'subgroup_size',
      SUBGROUP_AUC,
      BACKGROUND_POSITIVE_SUBGROUP_NEGATIVE_AUC,
      BACKGROUND_NEGATIVE_SUBGROUP_POSITIVE_AUC,
  ]
  # Sort by the same constant used to write the column, so the two cannot
  # drift apart.
  return pd.DataFrame(records, columns=columns).sort_values(
      SUBGROUP_AUC, ascending=True)

bias_metrics_df = compute_bias_metrics_for_model(test_df, identities_with_over_100_records, MODEL_NAME, TOXICITY_COLUMN)

# Plot a heatmap of bias metrics

Plot a heatmap of the bias metrics.  Higher scores indicate better results.
* Subgroup AUC (`subgroup_auc`) measures the ability to separate toxic and non-toxic comments for this identity.
* Negative cross AUC (`background_positive_subgroup_negative_auc`) measures the ability to separate non-toxic comments for this identity from toxic comments from the background distribution.
* Positive cross AUC (`background_negative_subgroup_positive_auc`) measures the ability to separate toxic comments for this identity from non-toxic comments from the background distribution.

In [0]:
def plot_auc_heatmap(bias_metrics_results, models):
  """Draws a heatmap of the three bias AUC metrics, one row per subgroup.

  Args:
    bias_metrics_results: DataFrame with a 'subgroup' column and one column
      per bias metric.
    models: List of model names; each metric column is repeated once per
      entry, and the figure width and separator lines scale with its length.

  Returns:
    The matplotlib Axes returned by `sns.heatmap`.
  """
  metrics_list = [
      SUBGROUP_AUC,
      BACKGROUND_POSITIVE_SUBGROUP_NEGATIVE_AUC,
      BACKGROUND_NEGATIVE_SUBGROUP_POSITIVE_AUC,
  ]
  by_subgroup = bias_metrics_results.set_index('subgroup')
  num_models = len(models)
  # One heatmap column per (metric, model) pair, grouped by metric.
  columns = [metric for metric in metrics_list for _ in models]
  # A vertical line at the left edge of each metric's group of columns.
  separator_positions = [num_models * i for i in range(len(metrics_list))]
  plt.figure(figsize=(len(columns), 0.5 * len(by_subgroup)))
  ax = sns.heatmap(
      by_subgroup[columns],
      annot=True,
      fmt='.2',
      cbar=True,
      cmap='Reds_r',
      vmin=0.5,
      vmax=1.0)
  ax.xaxis.tick_top()
  plt.xticks(rotation=90)
  ax.vlines(separator_positions, *ax.get_ylim())
  return ax

plot_auc_heatmap(bias_metrics_df, [MODEL_NAME])

# Plot histograms showing comment scores

We can graph a histogram of comment scores for each identity.  In the following graphs, the X axis represents the toxicity score given by our new model, and the Y axis represents the comment count.  Blue values are comments whose true label is non-toxic, while red values are those whose true label is toxic.

In [0]:
def plot_histogram(non_toxic_scores, toxic_scores, description):
  """Overlays normalized histograms of non-toxic and toxic comment scores.

  Args:
    non_toxic_scores: Model scores of comments whose true label is non-toxic
      (drawn in sky blue).
    toxic_scores: Model scores of comments whose true label is toxic (drawn
      in red).
    description: Text appended to both legend labels.
  """
  # NOTE(review): sns.distplot is deprecated in newer seaborn releases -
  # confirm the installed version still provides it.
  shared_kwargs = dict(norm_hist=True, bins=10, kde=False)
  sns.distplot(
      non_toxic_scores,
      color="skyblue",
      label='non-toxic ' + description,
      **shared_kwargs)
  ax = sns.distplot(
      toxic_scores,
      color="red",
      label='toxic ' + description,
      **shared_kwargs)
  ax.set(
      xlabel='model toxicity score',
      ylabel='relative % of comments',
      yticklabels=[])
  plt.legend()
  # Start a new figure; presumably so the next plot does not draw on top of
  # this one.
  plt.figure()

# Plot toxicity distributions of different identities to visualize bias.
def plot_histogram_for_identity(df, identity):
  """Plots the score histogram for comments labeled with one identity.

  Args:
    df: DataFrame with the `identity` column, a 'toxicity' column and the
      `MODEL_NAME` score column.
    identity: Name of the identity column to filter on.
  """
  toxic_query = '{} == True & toxicity == True'.format(identity)
  non_toxic_query = '{} == True & toxicity == False'.format(identity)
  toxic_scores = df.query(toxic_query)[MODEL_NAME]
  non_toxic_scores = df.query(non_toxic_query)[MODEL_NAME]
  plot_histogram(
      non_toxic_scores, toxic_scores, 'labeled for {}'.format(identity))

def plot_background_histogram(df):
  """Plots the score histogram for all comments in `df`.

  Args:
    df: DataFrame with a 'toxicity' column and the `MODEL_NAME` score column.
  """
  toxic = df.query('toxicity == True')[MODEL_NAME]
  non_toxic = df.query('toxicity == False')[MODEL_NAME]
  plot_histogram(
      non_toxic_scores=non_toxic,
      toxic_scores=toxic,
      description='for all test data')

# Plot the histogram for the background data first, then one histogram per
# identity in the order listed.
plot_background_histogram(test_df)
for identity_to_plot in (
    'heterosexual',
    'transgender',
    'homosexual_gay_or_lesbian',
    'atheist',
    'christian',
    'asian',
):
  plot_histogram_for_identity(test_df, identity_to_plot)