Copyright 2018 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Evaluation code


__Disclaimer__
*   This notebook contains experimental code, which may be changed without notice.
*   The ideas presented here are relevant to fairness, but they are not the whole story!



# Notebook summary

This notebook evaluates a list of models along two dimensions:
- "Performance": How well the model classifies the data (intended bias). Currently, we use the AUC.
- "Bias": How much unintended bias the model contains. Currently, we use the pinned AUC.

This script takes the following steps:

- Write input functions to generate 2 datasets:
    - a "performance dataset" which will be used for the first set of metrics. This dataset is supposed to have a format similar to the training data (it contains a piece of text and a label).
    - a "bias dataset" which will be used for the second set of metrics. This dataset contains a piece of text, a label, and also some subgroup information on which to evaluate the unintended bias.
- Run predictions with the `utils_export` library.
- Evaluate metrics.

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import getpass
import json
import nltk
import numpy as np
import pandas as pd
import pkg_resources
import os
import random
import re
import seaborn as sns

import tensorflow as tf
from tensorflow.python.lib.io import file_io

In [None]:
from utils_export.dataset import Dataset, Model
from utils_export import utils_cloudml
from utils_export import utils_tfrecords

In [None]:
# Set the GCS read cache max size to 0 MB: faster access to GCS files.
# See https://github.com/tensorflow/tensorflow/issues/15530
os.environ['GCS_READ_CACHE_MAX_SIZE_MB'] = '0'

In [None]:
# Seed Python's `random` module so that the random.random() draws made later in
# this notebook (the background sampling in filter_fn_civil) are repeatable.
random.seed(2018)

# Settings

### Global variables

In [None]:
# User inputs
# Passed to Model(project_name=...) below.
PROJECT_NAME = 'wikidetox'

# Information about deployed model.
# Names of the models to evaluate; passed to Model(model_names=...) and reused
# as the per-model score column names in the evaluation cells.
MODEL_NAMES = ['tf_gru_attention_unbiasN_trainY:v_1537919376']
TEXT_FEATURE_NAME = 'comment_text' #Input text
SENTENCE_KEY = 'comment_key' #Input key
LABEL_NAME_PREDICTION_MODEL = 'frac_neg/logistic' # Output prediction

# Part 1: Creating input_fn

In [None]:
def tokenizer(text, lowercase=True):
  """Converts text to a list of words.

  Args:
    text: piece of text to tokenize. Either a UTF-8 encoded byte string
      (decoded before tokenization) or an already-decoded text string.
    lowercase: whether to include lowercasing in preprocessing (boolean).

  Returns:
    A list of strings (words).
  """
  # Only byte strings need decoding: tf-record values arrive as bytes, while
  # text read through pandas is already decoded (and a Python 3 `str` has no
  # `.decode` method).
  if isinstance(text, bytes):
    text = text.decode('utf-8')
  words = nltk.word_tokenize(text)
  if lowercase:
    words = [w.lower() for w in words]
  return words

### Performance dataset

In [None]:
# User inputs
# tf-record file decoded by input_fn_performance.
PERFORMANCE_DATASET = 'gs://kaggle-model-experiments/resources/toxicity_q42017_test.tfrecord'
LABEL_NAME_TEST_FILE = 'frac_neg' #Name of the label in the performance dataset

In [None]:
# Define features

# DECODING
# Feature spec handed to utils_tfrecords.decode_tf_records_to_pandas when
# reading PERFORMANCE_DATASET: one scalar string (the text) and one scalar
# float (the label) per record.
decoding_input_features = {
  TEXT_FEATURE_NAME: tf.FixedLenFeature([], dtype=tf.string),
  LABEL_NAME_TEST_FILE: tf.FixedLenFeature([], dtype=tf.float32)
}

def input_fn_performance(max_n_examples=None, random_filter_keep_rate=1.0):
    """Decodes the performance dataset and tokenizes its text column.

    Args:
        max_n_examples: forwarded unchanged to
            `utils_tfrecords.decode_tf_records_to_pandas`.
        random_filter_keep_rate: forwarded unchanged to
            `utils_tfrecords.decode_tf_records_to_pandas`.

    Returns:
        The decoded data, with the `TEXT_FEATURE_NAME` column replaced by the
        output of `tokenizer` for each row.
    """
    performance_df = utils_tfrecords.decode_tf_records_to_pandas(
        decoding_input_features,
        PERFORMANCE_DATASET,
        max_n_examples,
        random_filter_keep_rate)
    performance_df[TEXT_FEATURE_NAME] = [
        tokenizer(raw_comment)
        for raw_comment in performance_df[TEXT_FEATURE_NAME]
    ]
    return performance_df

### Synthetic comment bias dataset

In [None]:
!pip install -U -q git+https://github.com/conversationai/unintended-ml-bias-analysis@1de676a31de9e43892964f71d1e38e90fc8b331e

In [None]:
from unintended_ml_bias import model_bias_analysis

In [None]:
# Load the madlibs evaluation set shipped with the unintended_ml_bias package.
entire_test_bias_df = pd.read_csv(
    pkg_resources.resource_stream(
        "unintended_ml_bias", "eval_datasets/bias_madlibs_77k.csv"))
entire_test_bias_df['raw_text'] = entire_test_bias_df['Text']
# Boolean label: True when the template sentence is labelled 'BAD'.
entire_test_bias_df['label'] = entire_test_bias_df['Label'] == 'BAD'
entire_test_bias_df = entire_test_bias_df[['raw_text', 'label']].copy()
# One identity term per line of the packaged adjectives file.
terms = [
    line.strip()
    for line in pkg_resources.resource_stream(
        "unintended_ml_bias", "bias_madlibs_data/adjectives_people.txt")
]
model_bias_analysis.add_subgroup_columns_from_text(entire_test_bias_df, 'raw_text', terms)
# Add preprocessing: keep the tokenized form alongside the raw text.
entire_test_bias_df['text'] = [
    tokenizer(line) for line in entire_test_bias_df['raw_text']
]

In [None]:
def input_fn_bias(max_n_examples):
    """Returns a copy of the synthetic (madlibs) bias dataset.

    Args:
        max_n_examples: number of rows to sample (with a fixed random_state of
            2018). A falsy value (None or 0) returns every row.

    Returns:
        A new DataFrame in which the 'raw_text' column is renamed to
        `TEXT_FEATURE_NAME`; `entire_test_bias_df` itself is left untouched.
    """
    # NOTE(review): the renamed column holds the raw string, not the tokenized
    # 'text' column -- confirm this is what the model input expects.
    selected = (
        entire_test_bias_df.sample(n=max_n_examples, random_state=2018)
        if max_n_examples
        else entire_test_bias_df
    )
    return selected.copy(deep=True).rename(
        columns={'raw_text': TEXT_FEATURE_NAME})

### Civil comment bias dataset

This dataset is constructed as follows:
    - we keep only examples that have identity labels (with rule: male >= 0)
    - we apply the 'threshold_bias_civil' to each identity field
    - we select x% of the "background", i.e. examples that are 0 for each identity.
        Indeed, as the background is dominant, we want to reduce the size of the test set.

In [None]:
civil_path = 'gs://kaggle-model-experiments/resources/civil_comments_data/train.tfrecord'
# An identity score at or above this value counts as "mentions the identity".
threshold_bias_civil = 0.5

# Identity (subgroup) columns of the civil comments dataset.
identity_terms_civil = [
    'male',
    'female',
    'transgender',
    'other_gender',
    'heterosexual',
    'homosexual_gay_or_lesbian',
    'bisexual',
    'other_sexual_orientation',
    'christian',
    'jewish',
    'muslim',
    'hindu',
    'buddhist',
    'atheist',
    'other_religion',
    'black',
    'white',
    'asian',
    'latino',
    'other_race_or_ethnicity',
    'physical_disability',
    'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness',
    'other_disability',
]

# Decoding spec for the civil comments tf-records: text, id, toxicity-related
# scores and annotator counts first, then one float feature per identity.
civil_comments_spec = {
    'comment_text': tf.FixedLenFeature([], dtype=tf.string),
    'id': tf.FixedLenFeature([], dtype=tf.string),
    'toxicity': tf.FixedLenFeature([], dtype=tf.float32),
    'severe_toxicity': tf.FixedLenFeature([], dtype=tf.float32),
    'obscene': tf.FixedLenFeature([], dtype=tf.float32),
    'sexual_explicit': tf.FixedLenFeature([], dtype=tf.float32),
    'identity_attack': tf.FixedLenFeature([], dtype=tf.float32),
    'insult': tf.FixedLenFeature([], dtype=tf.float32),
    'threat': tf.FixedLenFeature([], dtype=tf.float32),
    'toxicity_annotator_count': tf.FixedLenFeature([], dtype=tf.int64),
    'identity_annotator_count': tf.FixedLenFeature([], dtype=tf.int64),
}
# Every identity feature is a float with a default of -1.
civil_comments_spec.update({
    _identity: tf.FixedLenFeature([], dtype=tf.float32, default_value=-1.)
    for _identity in identity_terms_civil
})

def filter_fn_civil(example, background_filter_keep_rate=0.1):
    """Decides whether a civil comments example is kept in the bias dataset.

    Args:
        example: mapping from feature name to value; must contain 'male' and
            every entry of `identity_terms_civil`.
        background_filter_keep_rate: probability of keeping an example in
            which no identity reaches `threshold_bias_civil`.

    Returns:
        False when example['male'] is negative (the rule used to detect
        examples without identity labels); True when at least one identity
        value is >= `threshold_bias_civil`; otherwise True with probability
        `background_filter_keep_rate`, drawn from `random.random()`.
    """
    if example['male'] < 0.:
        return False
    mentions_identity = any(
        example[identity] >= threshold_bias_civil
        for identity in identity_terms_civil)
    if mentions_identity:
        return True
    # Background example: keep only a random fraction.
    return random.random() < background_filter_keep_rate

def input_fn_bias_civil(max_n_examples=None, random_filter_keep_rate=1.0):
    """Loads the civil comments bias dataset as a DataFrame.

    Records are decoded with `civil_comments_spec` and filtered with
    `filter_fn_civil`. The text is tokenized, each identity column is turned
    into a boolean (value >= `threshold_bias_civil`) and the toxicity score is
    rounded to a boolean label.

    Args:
        max_n_examples: forwarded to
            `utils_tfrecords.decode_tf_records_to_pandas`.
        random_filter_keep_rate: forwarded to
            `utils_tfrecords.decode_tf_records_to_pandas` (previously this
            argument was accepted but silently ignored).

    Returns:
        A new DataFrame whose text column is named `TEXT_FEATURE_NAME` and
        whose boolean toxicity column is named 'label'.
    """
    civil_df_raw = utils_tfrecords.decode_tf_records_to_pandas(
        civil_comments_spec,
        civil_path,
        max_n_examples=max_n_examples,
        random_filter_keep_rate=random_filter_keep_rate,
        filter_fn=filter_fn_civil,
    )
    # The decoded text column is keyed 'comment_text' in civil_comments_spec;
    # it only becomes TEXT_FEATURE_NAME through the rename below, so it must
    # be addressed by its spec name here.
    civil_df_raw['comment_text'] = [
        tokenizer(comment) for comment in civil_df_raw['comment_text']
    ]
    for _term in identity_terms_civil:
        civil_df_raw[_term] = civil_df_raw[_term] >= threshold_bias_civil
    # NOTE(review): round() at exactly 0.5 differs between Python 2 and 3;
    # an explicit `>= 0.5` may be what is intended -- confirm.
    civil_df_raw['toxicity'] = [
        bool(round(score)) for score in civil_df_raw['toxicity']
    ]
    # rename() returns a new DataFrame, so callers get an independent copy.
    return civil_df_raw.rename(columns={
        'comment_text': TEXT_FEATURE_NAME,
        'toxicity': 'label'})

# Part 2: Running prediction

### Defining the model

In [None]:
# User inputs.
# Maps each model input feature to its encoding type; the text feature is
# declared as a list of strings (the input functions tokenize it).
model_input_spec = {
    TEXT_FEATURE_NAME: utils_tfrecords.EncodingFeatureSpec.LIST_STRING} #library will use this automatically

# Description of the deployed model(s) to query: input features, prediction
# key, example key, model names and project.
model = Model(
    feature_keys_spec=model_input_spec,
    prediction_keys=LABEL_NAME_PREDICTION_MODEL,
    example_key=SENTENCE_KEY,
    model_names=MODEL_NAMES,
    project_name=PROJECT_NAME)

### Performance dataset

In [None]:
# User inputs
# Number of examples requested when loading the performance dataset.
SIZE_PERFORMANCE_DATA_SET = 10000

# Pattern for path of tf_records
# Per-user GCS directory (keyed on the local username) given to Dataset.
PERFORMANCE_DATASET_DIR = os.path.join(
    'gs://kaggle-model-experiments/',
    getpass.getuser(),
    'tfrecords/test_performance')

In [None]:
# Build the performance dataset from input_fn_performance and load it,
# requesting SIZE_PERFORMANCE_DATA_SET examples with a keep rate of 0.5.
dataset_performance = Dataset(input_fn_performance, PERFORMANCE_DATASET_DIR)
dataset_performance.load_data(SIZE_PERFORMANCE_DATA_SET, random_filter_keep_rate=0.5)

In [None]:
# Add the predictions of `model` to the performance dataset.
# NOTE(review): presumably one score column per entry of MODEL_NAMES -- the
# AUC cell below reads the scores that way; confirm against utils_export.
dataset_performance.add_model_prediction_to_data(model)

### Bias dataset

In [None]:
# User inputs
# Number of examples requested when loading the bias dataset.
SIZE_BIAS_DATA_SET = 20000

# Pattern for path of tf_records
# Per-user GCS directory (keyed on the local username) given to Dataset.
BIAS_DATASET_DIR = os.path.join(
    'gs://kaggle-model-experiments/',
    getpass.getuser(),
    'tfrecords/civil_bias_performance')

In [None]:
# Build the bias dataset from the civil comments input function and load it,
# requesting SIZE_BIAS_DATA_SET examples.
dataset_bias = Dataset(input_fn_bias_civil, BIAS_DATASET_DIR)
dataset_bias.load_data(SIZE_BIAS_DATA_SET)

In [None]:
# Add the predictions of `model` to the bias dataset.
dataset_bias.add_model_prediction_to_data(model)

### Post processing

In [None]:
# Setting the table to match the required format: text column named
# 'raw_text' and a boolean 'label' column.
test_performance_df = dataset_performance.show_data().rename(
    columns={
        TEXT_FEATURE_NAME: 'raw_text',
        LABEL_NAME_TEST_FILE: 'label',
    })
# Binarize the fractional label by rounding it.
test_performance_df['label'] = [
    bool(round(x)) for x in test_performance_df['label']
]

In [None]:
# Bias data used by the unintended-bias metrics below.
test_bias_df = dataset_bias.show_data()

### Analyzing final results

In [None]:
# Preview the first rows of the performance data.
test_performance_df.head()

In [None]:
# Preview the first rows of the bias data.
test_bias_df.head()

# Part 3: Run evaluation metrics

In [None]:
# List of model families, each family being a list of model names. Here there
# is a single family made of all MODEL_NAMES.
MODEL_FAMILIES = [MODEL_NAMES]

## Performance metrics

### Data Format

At this point, our performance data is in the DataFrame `test_performance_df`, with columns:

*   raw_text: Text of the comment.
*   label: True if the comment is Toxic, False otherwise.
*   < model name >: One column per model, cells contain the score from that model.

You can run the analysis below on any data in this format. Subgroup labels can be generated via words in the text as done above, or come from human labels if you have them.

### Run AUC

In [None]:
import sklearn.metrics as metrics

In [None]:
for model_family in MODEL_FAMILIES:
  # One AUC per model of the family, computed on the performance dataset.
  auc_list = []
  # The loop variable is deliberately not called `model`: that name holds the
  # `Model` instance used by the prediction cells, and overwriting it with a
  # string would break those cells when they are re-run.
  for model_name in model_family:
    fpr, tpr, thresholds = metrics.roc_curve(
        test_performance_df['label'],
        test_performance_df[model_name])
    auc_list.append(metrics.auc(fpr, tpr))
  # The reported value is the mean over the whole family, so the label lists
  # every model that contributed (identical output for one-model families).
  print('Auc for model {}: {}'.format(', '.join(model_family), np.mean(auc_list)))

## Unintended Bias Metrics

### Data Format
At this point, our bias data is in the DataFrame `test_bias_df`, with columns:

*   label: True if the comment is Toxic, False otherwise.
*   < model name >: One column per model, cells contain the score from that model.
*   < subgroup >: One column per identity, True if the comment mentions this identity.

You can run the analysis below on any data in this format. Subgroup labels can be 
generated via words in the text as done above, or come from human labels if you have them.


### Pinned AUC
Pinned AUC measures the extent of unintended bias of a real-value score function
by measuring each sub-group's divergence from the general distribution.

Let $D$ represent the full data set and $D_g$ be the set of examples in subgroup
$g$. Then:


$$ Pinned \ dataset \ for \ group \ g = pD_g = s(D_g) + s(D), |s(D_g)| = |s(D)| $$

$$ Pinned \ AUC \ for \ group \ g = pAUC_g = AUC(pD_g) $$

$$ Pinned \ AUC \ Squared \ Equality \ Difference = \Sigma_{g \in G}(AUC - pAUC_g)^2 $$


### Pinned AUC Equality Difference
The table below shows the pinned AUC equality difference for each model family.
Lower scores (lighter red) represent more similarity between each group's pinned AUC, which means
less unintended bias.

On this set, the wiki_debias_cnn model demonstrates least unintended bias. 

In [None]:
# Light-to-red colormap used to shade the equality-difference table below.
cm = sns.light_palette("red", as_cmap=True)

In [None]:
# Keep only the identity terms flagged on at least 20 rows of the bias data.
identity_terms_civil_included = []
for _term in identity_terms_civil:
    if test_bias_df[_term].sum() < 20:
        continue
    print('keeping {}'.format(_term))
    identity_terms_civil_included.append(_term)

In [None]:
# Make sure the text and label columns carry the names expected below.
# NOTE(review): input_fn_bias_civil already applies this same rename, so this
# is presumably a no-op -- confirm against what show_data() returns.
test_bias_df = test_bias_df.rename(columns =
    {
        'comment_text': TEXT_FEATURE_NAME,
        'toxicity': 'label'})

In [None]:
# Per-family equality difference over the retained identity subgroups, with
# squared error enabled.
eq_diff = model_bias_analysis.per_subgroup_auc_diff_from_overall(
    test_bias_df, identity_terms_civil_included, MODEL_FAMILIES, squared_error=True) 
# sort to guarantee deterministic output
eq_diff.sort_values(by=['model_family'], inplace=True)
eq_diff.reset_index(drop=True, inplace=True)
# Displayed with the red gradient defined in `cm`.
eq_diff.style.background_gradient(cmap=cm)

### Pinned AUC Graphs
The graphs below show per-group Pinned AUC for each subgroup and each model. Each
identity group shows 3 points, each representing the pinned AUC for one training 
version of the model. More consistency among the values represents less unintended bias.

In [None]:
# Per-subgroup AUCs for every model family, using 'label' as the label column.
pinned_auc_results = model_bias_analysis.per_subgroup_aucs(test_bias_df, identity_terms_civil_included, MODEL_FAMILIES, 'label')
# One scatterplot per model family, with the y axis fixed to [0, 1].
for family in MODEL_FAMILIES:
  name = model_bias_analysis.model_family_name(family)
  model_bias_analysis.per_subgroup_scatterplots(
      pinned_auc_results,
      'subgroup',
      name + '_aucs',
      name + ' Pinned AUC',
      y_lim=(0., 1.0))