Copyright 2018 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Evaluation code


__Disclaimer__
*   This notebook contains experimental code, which may be changed without notice.
*   The ideas presented here are relevant to fairness, but they are not the whole story!



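# Notebook summary

This notebook evaluates several versions of a toxicity model. It builds input functions for a performance dataset and a bias (madlibs) dataset, adds each model version's predictions to both, and then computes AUC and pinned-AUC bias metrics, which are plotted per model version.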

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import getpass
import json
import nltk
import numpy as np
import pandas as pd
import pkg_resources
import os
import random
import re
import seaborn as sns

import tensorflow as tf
from tensorflow.python.lib.io import file_io

In [None]:
from utils_export.dataset import Dataset, Model
from utils_export import utils_cloudml
from utils_export import utils_tfrecords

In [None]:
# Set the GCS read cache size to 0 MB: faster access to GCS files, see
# https://github.com/tensorflow/tensorflow/issues/15530
os.environ['GCS_READ_CACHE_MAX_SIZE_MB'] = '0'

# Settings

### Global variables

In [None]:
# User inputs
# Project name; passed to `Model` as `project_name` further down.
PROJECT_NAME = 'wikidetox'

TEXT_FEATURE_NAME = 'comment_text'  # Name of the input text feature/column.
SENTENCE_KEY = 'comment_key'  # Input key; passed to `Model` as `example_key`.
LABEL_NAME_PREDICTION_MODEL = 'frac_neg/logistic'  # Output prediction; passed to `Model` as `prediction_keys`.

# Part 1: Creating input_fn

In [None]:
def tokenizer(text, lowercase=True):
  """Converts text to a list of words.

  Args:
    text: piece of text to tokenize. Either a UTF-8 encoded byte string or an
      already-decoded text string.
    lowercase: whether to include lowercasing in preprocessing (boolean).

  Returns:
    A list of strings (words).
  """
  # Only byte strings need decoding. Text strings have no `decode` method on
  # Python 3 (and would be implicitly ASCII-encoded first on Python 2), so
  # they are tokenized as-is.
  if isinstance(text, bytes):
    text = text.decode('utf-8')
  words = nltk.word_tokenize(text)
  if lowercase:
    words = [w.lower() for w in words]
  return words

### Performance dataset

In [None]:
# User inputs
# Path of the TFRecord file holding the performance dataset.
PERFORMANCE_DATASET = 'gs://kaggle-model-experiments/resources/toxicity_q42017_test.tfrecord'
LABEL_NAME_TEST_FILE = 'frac_neg'  # Name of the label in the performance dataset

In [None]:
# Define features

# DECODING
# Feature spec handed to `utils_tfrecords.decode_tf_records_to_pandas` by
# `input_fn_performance`: per example, one scalar string (the text) and one
# scalar float32 (the label).
decoding_input_features = {
  TEXT_FEATURE_NAME: tf.FixedLenFeature([], dtype=tf.string),
  LABEL_NAME_TEST_FILE: tf.FixedLenFeature([], dtype=tf.float32)
}

def input_fn_performance(max_n_examples=None, random_filter_keep_rate=1.0):
    """Loads the performance dataset as a DataFrame with tokenized text.

    Args:
      max_n_examples: forwarded to `decode_tf_records_to_pandas`.
      random_filter_keep_rate: forwarded to `decode_tf_records_to_pandas`.

    Returns:
      The decoded DataFrame, whose TEXT_FEATURE_NAME column has been replaced
      by the output of `tokenizer` for each row.
    """
    performance_df = utils_tfrecords.decode_tf_records_to_pandas(
        decoding_input_features, PERFORMANCE_DATASET,
        max_n_examples, random_filter_keep_rate)
    tokenized_texts = [tokenizer(raw) for raw in performance_df[TEXT_FEATURE_NAME]]
    performance_df[TEXT_FEATURE_NAME] = tokenized_texts
    return performance_df

### Bias dataset

In [None]:
!pip install -U -q git+https://github.com/conversationai/unintended-ml-bias-analysis@1de676a31de9e43892964f71d1e38e90fc8b331e

In [None]:
from unintended_ml_bias import model_bias_analysis

In [None]:
# Load the madlibs bias dataset that ships with the unintended_ml_bias package
# (installed from GitHub above).
entire_test_bias_df = pd.read_csv(
    pkg_resources.resource_stream("unintended_ml_bias", "eval_datasets/bias_madlibs_77k.csv"))
entire_test_bias_df['raw_text'] = entire_test_bias_df['Text']
# The label is True exactly when the dataset's 'Label' column is 'BAD'.
entire_test_bias_df['label'] = entire_test_bias_df['Label'] == 'BAD'
entire_test_bias_df = entire_test_bias_df[['raw_text', 'label']].copy()

# One subgroup term per line of the adjectives file.
terms = []
for line in pkg_resources.resource_stream("unintended_ml_bias", "bias_madlibs_data/adjectives_people.txt"):
    term = line.strip()
    # resource_stream is a binary stream, so on Python 3 each line is `bytes`.
    # Convert to the native `str` type (a no-op on Python 2, where the line is
    # already `str`) so the terms can be used as column names and in text
    # matching.
    if not isinstance(term, str):
        term = term.decode('utf-8')
    terms.append(term)
model_bias_analysis.add_subgroup_columns_from_text(entire_test_bias_df, 'raw_text', terms)
# Add preprocessing
entire_test_bias_df['text'] = list(map(tokenizer, entire_test_bias_df['raw_text']))

In [None]:
def input_fn_bias(max_n_examples):
    """Returns a copy of the bias dataset, ready to be fed to the model.

    Args:
      max_n_examples: number of rows to sample (with a fixed random_state of
        2018). When falsy (e.g. None), every row is returned.

    Returns:
      A new DataFrame in which the 'text' column is named TEXT_FEATURE_NAME.
    """
    source_df = (
        entire_test_bias_df.sample(n=max_n_examples, random_state=2018)
        if max_n_examples else entire_test_bias_df)
    # Deep copy so that callers never modify the module-level dataset.
    bias_df = source_df.copy(deep=True)
    return bias_df.rename(columns={'text': TEXT_FEATURE_NAME})

# Part 2: Running prediction

### Defining the model

In [None]:
# Model versions to evaluate, each written as '<model name>:v_<timestamp>'.
# The part after ':' (minus the 'v_' prefix) is extracted as the timestamp
# when the metrics are collected further down.
MODEL_NAMES = [
    'tf_gru_attention_continuous:v_1537828514',
    'tf_gru_attention_continuous:v_1537828537',
    'tf_gru_attention_continuous:v_1537828585',
    'tf_gru_attention_continuous:v_1537828630',
    'tf_gru_attention_continuous:v_1537828675',
    'tf_gru_attention_continuous:v_1537828722',
    'tf_gru_attention_continuous:v_1537828745',
]
               

In [None]:
# User inputs.
# Input features of the model and how each one is encoded; the text feature
# is declared as LIST_STRING.
model_input_spec = {
    TEXT_FEATURE_NAME: utils_tfrecords.EncodingFeatureSpec.LIST_STRING}  # The library will use this automatically.

# Describes the models to evaluate: their inputs, which prediction key to
# read, the example key, the model versions and the project they belong to.
model = Model(
    feature_keys_spec=model_input_spec,
    prediction_keys=LABEL_NAME_PREDICTION_MODEL,
    example_key=SENTENCE_KEY,
    model_names=MODEL_NAMES,
    project_name=PROJECT_NAME)

### Performance dataset

In [None]:
# User inputs
# Number of examples requested from the performance dataset.
SIZE_PERFORMANCE_DATA_SET = 10000

# Pattern for path of tf_records (one sub-folder per user).
# GCS paths always use '/', so the path is built with plain string formatting
# rather than os.path.join, which inserts the platform-specific separator.
TF_RECORD_PERFORMANCE_PATTERN = (
    'gs://kaggle-model-experiments/{}/tfrecords/test_performance'.format(
        getpass.getuser()))

In [None]:
# Wrap the performance input_fn in a Dataset and load the data.
# NOTE(review): presumably load_data forwards its arguments to
# input_fn_performance (max_n_examples, random_filter_keep_rate) — confirm in
# utils_export.dataset.
dataset_performance = Dataset(input_fn_performance)
dataset_performance.load_data(SIZE_PERFORMANCE_DATA_SET, random_filter_keep_rate=0.5)

In [None]:
# Add the predictions of `model` to the performance data.
# NOTE(review): presumably tf_records are written under the given path
# pattern as part of this call — confirm in utils_export.dataset.
dataset_performance.add_model_prediction_to_data(model, tf_record_path_pattern=TF_RECORD_PERFORMANCE_PATTERN)

### Bias dataset

In [None]:
# User inputs
SIZE_BIAS_DATA_SET = None

# Pattern for path of tf_records
TF_RECORD_BIAS_PATTERN = os.path.join(
    'gs://kaggle-model-experiments/',
    getpass.getuser(),
    'tfrecords/bias_performance')

In [None]:
# Wrap the bias input_fn in a Dataset and load the data. SIZE_BIAS_DATA_SET is
# None, for which input_fn_bias returns the full dataset rather than a sample.
# NOTE(review): presumably load_data passes its argument to input_fn_bias as
# max_n_examples — confirm in utils_export.dataset.
dataset_bias = Dataset(input_fn_bias)
dataset_bias.load_data(SIZE_BIAS_DATA_SET)

In [None]:
# Add the predictions of `model` to the bias data.
# NOTE(review): presumably tf_records are written under the given path
# pattern as part of this call — confirm in utils_export.dataset.
dataset_bias.add_model_prediction_to_data(model, tf_record_path_pattern=TF_RECORD_BIAS_PATTERN)

In [None]:
# Preview the first rows of the bias dataset.
dataset_bias.show_data().head()

In [None]:
# Preview the first rows of the performance dataset.
dataset_performance.show_data().head()

### Post processing

In [None]:
# Setting the table to match the required format.
test_performance_df = dataset_performance.show_data()
test_performance_df = test_performance_df.rename(
    columns={LABEL_NAME_TEST_FILE: 'label'})
# Binarize the label with an explicit threshold instead of bool(round(x)):
# round() rounds exact halves to even on Python 3 but away from zero on
# Python 2, so a label of exactly 0.5 would flip between versions. The
# threshold keeps the Python 2 result (0.5 counts as True).
# Assumes the label is a fraction in [0, 1] — TODO confirm.
test_performance_df['label'] = test_performance_df['label'] >= 0.5

In [None]:
# Bias data used by the metric cells below; it is read through its 'label'
# column, one column per model name, and the subgroup term columns.
test_bias_df = dataset_bias.show_data()

### Analyzing final results

# Part 3: Run evaluation metrics

## Performance metrics

### Data Format

At this point, our performance data is in the DataFrame `test_performance_df` (and the bias data in `test_bias_df`), with columns:

*   `comment_text`: Text of the comment.
*   `label`: True if the comment is Toxic, False otherwise.
*   `< model name >`: One column per model; cells contain the score from that model.

You can run the analysis below on any data in this format. Subgroup labels can be generated from words in the text, as done above, or come from human labels if you have them.

### Run AUC

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ROC AUC of every model on the performance dataset.
for model_family in [MODEL_NAMES]:
  auc_list = []
  for _model in model_family:
    true_labels = test_performance_df['label']
    model_scores = test_performance_df[_model]
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, model_scores)
    auc_model = metrics.auc(fpr, tpr)
    auc_list += [auc_model]
    print('Auc for model {}: {}'.format(_model, auc_model))

In [None]:
# ROC AUC of every model on the bias dataset.
for model_family in [MODEL_NAMES]:
  auc_list = []
  for _model in model_family:
    true_labels = test_bias_df['label']
    model_scores = test_bias_df[_model]
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, model_scores)
    auc_model = metrics.auc(fpr, tpr)
    auc_list += [auc_model]
    print('Auc for model {}: {}'.format(_model, auc_model))

## Unintended Bias Metrics

In [None]:
# `cm` is a light-to-red seaborn colormap; it is not referenced again in the
# cells visible below.
cm = sns.light_palette("red", as_cmap=True)
# Use matplotlib's 'ggplot' style for the figures that follow.
plt.style.use(u'ggplot')

In [None]:
# Per-model metrics, collected in MODEL_NAMES order (one entry per model).
timestamps = []
performance_aucs = []
pinned_auc_equality_differences = []
pinned_auc_subgroup_males = []
pinned_auc_subgroup_females = []
pinned_auc_subgroup_males_vs_females = []
for _model in MODEL_NAMES:

    # Get timestamp: '<name>:v_<timestamp>' -> '<timestamp>'
    timestamp = _model.split(':')[1].replace('v_', '')
    timestamps.append(timestamp)

    # Auc on the performance dataset
    fpr, tpr, thresholds = metrics.roc_curve(
        test_performance_df['label'],
        test_performance_df[_model]
    )
    auc_model = metrics.auc(fpr, tpr)
    performance_aucs.append(auc_model)

    # Pinned AUC equality difference over the subgroups listed in `terms`
    pinned_auc_equality_difference = model_bias_analysis.per_subgroup_auc_diff_from_overall(
        test_bias_df, terms, [[_model]], squared_error=False)
    pinned_auc_equality_differences.append(
        pinned_auc_equality_difference['pinned_auc_equality_difference'].values[0])

    # Pinned Auc for subgroups
    pinned_auc_subgroup_male = model_bias_analysis.per_subgroup_aucs(
        test_bias_df, ['male'], [[_model]], 'label')
    male_auc = pinned_auc_subgroup_male[_model + '_aucs'].values[0][0]
    pinned_auc_subgroup_males.append(male_auc)
    pinned_auc_subgroup_female = model_bias_analysis.per_subgroup_aucs(
        test_bias_df, ['female'], [[_model]], 'label')
    female_auc = pinned_auc_subgroup_female[_model + '_aucs'].values[0][0]
    pinned_auc_subgroup_females.append(female_auc)

    # Male vs. female gap: difference between the two subgroup AUCs. (This
    # used to subtract the female AUC from the pinned AUC equality difference,
    # which is not a male/female comparison.)
    pinned_auc_subgroup_males_vs_female = male_auc - female_auc
    pinned_auc_subgroup_males_vs_females.append(pinned_auc_subgroup_males_vs_female)

In [None]:
# One row per model with all collected metrics, ordered by timestamp.
metrics_by_column = {
    'timestamp': timestamps,
    'auc': performance_aucs,
    'pinned_auc_eq_diff': pinned_auc_equality_differences,
    'pinned_auc_sub_males': pinned_auc_subgroup_males,
    'pinned_auc_sub_females': pinned_auc_subgroup_females,
    'pinned_auc_subg_males_vs_female': pinned_auc_subgroup_males_vs_females,
}
visualization_dataframe = pd.DataFrame(metrics_by_column).sort_values(
    by=['timestamp'], ascending=True)

In [None]:
# One small line chart per metric, each in its own figure: the metric column
# is plotted against the timestamp, labelled with the column name.
for metric_name, line_color in [
    ('auc', 'k'),
    ('pinned_auc_eq_diff', 'r'),
    ('pinned_auc_sub_males', 'g'),
    ('pinned_auc_sub_females', 'b'),
    ('pinned_auc_subg_males_vs_female', 'c'),
]:
    plt.figure(figsize=(13, 2))
    plt.plot(
        visualization_dataframe['timestamp'],
        visualization_dataframe[metric_name],
        label=metric_name,
        color=line_color)
    plt.legend(prop={'size': 10}, loc=4)
    plt.show()