Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of the
License at

```
 http://www.apache.org/licenses/LICENSE-2.0
```

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

# Code to reproduce primary analyses of Pfohl and Cole-Lewis et al. "A toolbox for surfacing health equity biases and harms in large language models".

This Python code produces the point estimates and confidence intervals that
underpin Figures 2, 3, and 4. This includes the estimates of the reported rates
for the three rubrics (independent, pairwise, and counterfactual) for each rater
group. This notebook does not reproduce the analyses reflected in Extended Data
or Supplementary figures and tables.

Before executing this notebook, please extract the supplementary data file(s)
from the links provided in the article. This notebook can be executed using
either (1) the xlsx workbook (`equitymedqa_ratings.xlsx`) or (2) the csv files
(`ratings_independent.csv`, `ratings_pairwise.csv`, and
`ratings_counterfactual.csv`).

Disclaimer: The bootstrap confidence intervals generated in this code are
sensitive to the random seed (or reshuffling of the row order), but deviations
are small (generally $<0.01$). Increasing the number of bootstrap iterates
reduces variability. Minor deviations from the confidence interval bounds
reported in the paper are expected due to differences in the row order in the
extract released as supplementary data. This issue does not affect point
estimates.

In [None]:
import os
from typing import Any, Callable
import numpy as np
import pandas as pd
import scipy

### User specified parameters

1.  `CSV_MODE`: Set to False if using the xlsx workbook and to True if using the
    csv files.
2.  `DATA_PATH`: Path to the directory where the data files are stored.
3.  `N_RESAMPLES`: The number of bootstrap iterations to use for confidence
    interval computations.

In [None]:
# Input format switch: False reads the xlsx workbook, True reads the csv files.
CSV_MODE = False  # @param
# Directory that contains the supplementary data file(s).
DATA_PATH = './'  # @param {type: "string"}
# Number of bootstrap resamples used for every confidence interval.
N_RESAMPLES = 1000  # @param

In [None]:
# Load the three ratings tables into `data_dict`, keyed by rubric name, from
# either the csv files or the sheets of the xlsx workbook.
if CSV_MODE:
  _csv_files = {
      'independent': 'ratings_independent.csv',
      'pairwise': 'ratings_pairwise.csv',
      'counterfactual': 'ratings_counterfactual.csv',
  }
  data_dict = {
      rubric: pd.read_csv(os.path.join(DATA_PATH, csv_name))
      for rubric, csv_name in _csv_files.items()
  }
else:
  _workbook_path = os.path.join(DATA_PATH, 'equitymedqa_ratings.xlsx')
  _sheet_names = {
      'independent': 'Independent ratings',
      'pairwise': 'Paired ratings',
      'counterfactual': 'Counterfactual Paired ratings',
  }
  data_dict = {
      rubric: pd.read_excel(_workbook_path, sheet_name=sheet)
      for rubric, sheet in _sheet_names.items()
  }

In [None]:
# @title Stat functions
def bootstrap_metric(
    x: np.ndarray,
    metric_fn: Callable[[Any], float] = np.mean,
    n_resamples: int = 1000,
    return_string: str = False,
    vectorized: bool = True,
    boot_method: str = 'bca',
    random_state=101,
) -> str | tuple[float, float]:
  """Bootstraps a metric with the option to return a formatted string.

  Args:
    x: A sequence of array-like; matching input spec for scipy.stats.bootstrap
    metric_fn: A statistical function to bootstrap. Matches the input spec for
      the `statistic` argument to scipy.stats.bootstrap.
    n_resamples: The number of bootstrap iterations.
    return_string: If true, the confidence interval is returned as a string; if
      false, it is returned as a tuple.
    vectorized: Argument passed to scipy.stats.bootstrap.
    boot_method: The bootstrap method to use. Passes directly to the `method`
      argument of scipy.stats.bootstrap. Valid values are 'bca, 'percentile',
      and 'basic'.
    random_state: The random seed. Matches the input spec for the random_state
      argument to scipy.stats.bootstrap.

  Returns:
    The confidence interval bounds as a string or tuple.
  """
  result = scipy.stats.bootstrap(
      (x,),
      statistic=metric_fn,
      n_resamples=n_resamples,
      method=boot_method,
      vectorized=vectorized,
      random_state=random_state,
  )
  if return_string:
    if not np.isnan(result.confidence_interval[0]):
      return (
          f'({result.confidence_interval[0]:.3f},'
          f' {result.confidence_interval[1]:.3f})'
      )
    else:
      return ''
  else:
    return (result.confidence_interval[0], result.confidence_interval[1])


def combine_point_estimates_and_cis(estimate_df, ci_df):
  """Combines a dataframe of point estimates with a dataframe of CIs produced by

    a call to bootstrap_metric.

  Args:
    estimate_df: A pd.DataFrame containing point estimates.
    ci_df: A pd.DataFrame of the same size and semantics of estimate_df
      containing confidence intervals. Canonically, this is an output of
      bootstrap_metric with `return_string`=True.

  Returns:
    A combined dataframe.
  """
  return estimate_df.map(lambda x: f'{x:.3f}').combine(
      ci_df, lambda x, y: x + ' ' + y
  )

In [None]:
# String constants used later to select datasets and to order table rows.

# Names of the individual datasets that make up EquityMedQA.
equitymedqa_names = [
    'Open-ended Medical Adversarial Queries (OMAQ)',
    'Equity in Health AI (EHAI)',
    'Failure-Based Red Teaming - Manual (FBRT-Manual)',
    'Failure-Based Red Teaming - LLM (FBRT-LLM)',
    'TRopical and INfectious DiseaseS (TRINDS)',
    'Counterfactual Context - Manual (CC-Manual)',
    'Counterfactual Context - LLM (CC-LLM)',
]

# Order in which datasets are displayed in tables.
dataset_order = [
    'HealthSearchQA',
    'EquityMedQA',
    *equitymedqa_names,
    'Mixed MMQA-OMAQ',
    'Filtered Mixed MMQA-OMAQ',
    'Omiye et al.',
]

# Order in which rater groups are displayed in tables.
rater_type_order = ['Physician', 'Health equity expert']

## Independent Rubric: reproduce Figure 2 results

Here, we compute the rate at which each rater group reported bias for each
dataset under the independent assessment rubric.

In [None]:
# Keep only the independent ratings whose rater_type is not "Consumer".
independent_df = data_dict['independent'].query('rater_type != "Consumer"')

In [None]:
# Pool the ratings of the individual EquityMedQA datasets and relabel the
# pooled copy as a single 'EquityMedQA' dataset.
equitymedqa_df = independent_df.query('dataset in @equitymedqa_names').assign(
    dataset='EquityMedQA'
)

In [None]:
# Create a filtered copy of the Mixed MMQA-OMAQ dataset that removes cases without three ratings.
mmqa_omaq_df = independent_df.query('dataset == "Mixed MMQA-OMAQ"')
# Count the non-null rater_id values per (question_id, dataset, rater_type)
# group and keep only the groups that have exactly three.
mmqa_omaq_count_df = (
    mmqa_omaq_df.groupby(['question_id', 'dataset', 'rater_type'])['rater_id']
    .count()
    .rename('rater_count')
    .to_frame()
    .query('rater_count == 3')
    .reset_index()
)
# merge() is called without `on`, so it joins on the columns the two frames
# share (question_id, dataset and rater_type, assuming the ratings table has no
# rater_count column of its own). The default inner join drops the ratings of
# groups that were filtered out above; the copy is then relabelled.
filtered_mmqa_omaq_df = mmqa_omaq_df.merge(mmqa_omaq_count_df).assign(
    dataset='Filtered Mixed MMQA-OMAQ'
)

In [None]:
# Stack the original ratings with the two derived copies (pooled EquityMedQA
# and Filtered Mixed MMQA-OMAQ) so all datasets can be grouped in one pass.
# The original row labels are kept, so index values repeat across the parts.
independent_df_concat = pd.concat(
    [independent_df, equitymedqa_df, filtered_mmqa_omaq_df]
)

In [None]:
# Set fields to categorical to control display order.
# The category lists fix the row order of grouped tables; pd.Categorical maps
# any value that is not in the supplied categories to NaN.
independent_df_concat['dataset'] = pd.Categorical(
    independent_df_concat['dataset'], dataset_order
)
independent_df_concat['rater_type'] = pd.Categorical(
    independent_df_concat['rater_type'], rater_type_order
)

In [None]:
# Create a bias_presence_binary column that combines every bias_presence value
# other than 'No bias' into a single True category.
independent_df_concat['bias_presence_binary'] = independent_df_concat[
    'bias_presence'
].ne('No bias')

In [None]:
# Define a list of columns corresponding to rubric dimensions.
# The first entry is the overall presence-of-bias flag; the remaining entries
# are the individual dimension columns. Rates are computed for each of them.
bias_columns = [
    'bias_presence_binary',
    'inaccuracy_for_some_axes',
    'not_inclusive_for_some_axes',
    'stereotypical_language_characterization',
    'omits_systemic_explanation',
    'failure_to_challenge',
    'potential_for_disproportionate',
    'other_bias',
]

In [None]:
# Point estimates: the mean of each rubric column within every
# (dataset, rater_type) group.
# NOTE(review): `observed` is left at the pandas default here, whereas the
# pairwise section passes observed=True -- confirm the difference is intended.
_independent_groups = independent_df_concat.groupby(['dataset', 'rater_type'])
independent_results_mean = _independent_groups[bias_columns].mean()

In [None]:
# Confidence intervals: bootstrap each rubric column within every
# (dataset, rater_type) group and keep the interval as a formatted string.
independent_results_boot_string = (
    independent_df_concat.groupby(['dataset', 'rater_type'])[bias_columns]
    .agg(
        lambda ratings: bootstrap_metric(
            ratings, n_resamples=N_RESAMPLES, return_string=True
        )
    )
)

In [None]:
# Combine the rates with the confidence intervals and display the results.
# Each cell becomes '<rate> <confidence interval>'.
independent_results_combined = combine_point_estimates_and_cis(
    independent_results_mean, independent_results_boot_string
)
# A bare expression on the last line makes the notebook render the table.
independent_results_combined

## Pairwise Rubric: Reproduce Figure 3 results

Here, we compute the rate at which raters preferred the outputs of either model
with respect to the presence of bias and its dimensions.

In [None]:
# Pairwise ratings table. This is the same object that is stored in data_dict,
# not a copy.
pairwise_df = data_dict['pairwise']

In [None]:
# Make `dataset` categorical so that tables follow dataset_order. The column is
# assigned in place on pairwise_df.
pairwise_df['dataset'] = pd.Categorical(pairwise_df['dataset'], dataset_order)

In [None]:
# Pool the pairwise ratings of the individual EquityMedQA datasets and relabel
# the pooled copy as a single 'EquityMedQA' dataset.
pairwise_equitymedqa_df = pairwise_df.query(
    'dataset in @equitymedqa_names'
).assign(dataset='EquityMedQA')

In [None]:
# Create columns corresponding to the preferred source.
pairwise_df_concat = pd.concat([pairwise_df, pairwise_equitymedqa_df])
# Start every row at 'No preference' and overwrite it where a model's answer
# was flagged as more biased: the preferred source is the *other* model.
pairwise_df_concat['preferred_source'] = 'No preference'
pairwise_df_concat.loc[
    pairwise_df_concat['Med-PaLM_answer_more_bias'], 'preferred_source'
] = 'Med-PaLM 2'
# This assignment runs second, so it wins if both flags are set on a row.
pairwise_df_concat.loc[
    pairwise_df_concat['Med-PaLM-2_answer_more_bias'], 'preferred_source'
] = 'Med-PaLM'
# True whenever either model was preferred. It is listed first in
# bias_columns_pairwise so that it is reshaped like the real dimensions.
pairwise_df_concat['bias_presence'] = (
    pairwise_df_concat['preferred_source'] != 'No preference'
)  # Dummy dimension of bias

In [None]:
# Define a list containing the names of relevant columns.
# 'bias_presence' is the dummy dimension created with the preferred source;
# the remaining entries are the rubric dimension columns.
bias_columns_pairwise = [
    'bias_presence',
    'inaccuracy_for_some_axes',
    'not_inclusive_for_some_axes',
    'stereotypical_language_characterization',
    'omits_systemic_explanation',
    'failure_to_challenge',
    'potential_for_disproportionate',
    'other_bias',
]

In [None]:
# Reshape to long format: one row per (rating, bias dimension). The string
# column `dimension_of_bias` names the dimension and the boolean column
# `dimension_present` records whether that dimension was flagged for the
# pairwise comparison.
dimension_df = pd.melt(
    pairwise_df_concat,
    id_vars=['question_id', 'rater_type', 'dataset', 'preferred_source'],
    value_vars=bias_columns_pairwise,
    var_name='dimension_of_bias',
    value_name='dimension_present',
)

# `preferred_source_dimension` repeats `preferred_source` where the dimension
# is present and is 'No preference' everywhere else.
_dimension_flagged = dimension_df['dimension_present'].eq(True)
dimension_df = dimension_df.assign(
    preferred_source_dimension=dimension_df['preferred_source'].where(
        _dimension_flagged, other='No preference'
    )
)

In [None]:
# Reshape the dataframe so that values of `preferred_source_dimension` are mapped to columns
# With an empty prefix and separator, each indicator column is named exactly
# after the value it encodes ('Med-PaLM 2', 'No preference', 'Med-PaLM').
dimension_df_wide = pd.get_dummies(
    dimension_df,
    columns=['preferred_source_dimension'],
    prefix='',
    prefix_sep='',
)

# Re-apply the display order to both grouping columns.
dimension_df_wide['dataset'] = pd.Categorical(
    dimension_df_wide['dataset'], dataset_order
)
dimension_df_wide['rater_type'] = pd.Categorical(
    dimension_df_wide['rater_type'], rater_type_order
)

In [None]:
# Compute the estimates of the rates.

# The mean of each indicator column is the share of ratings in the group that
# fall under that preferred source.
pairwise_result_mean = dimension_df_wide.groupby(
    ['dataset', 'rater_type', 'dimension_of_bias'], observed=True
)[['Med-PaLM 2', 'No preference', 'Med-PaLM']].agg('mean')
# stack() moves the three source columns into a new innermost index level;
# unstack(level=-2) then pivots dimension_of_bias into the columns. The final
# selection puts the dimension columns in bias_columns_pairwise order.
pairwise_result_mean = pairwise_result_mean.stack().unstack(level=-2)[
    bias_columns_pairwise
]

In [None]:
# Compute the confidence intervals.
# Bootstrap each indicator column within every
# (dataset, rater_type, dimension_of_bias) group, as formatted strings.
pairwise_result_boot = dimension_df_wide.groupby(
    ['dataset', 'rater_type', 'dimension_of_bias'], observed=True
)[['Med-PaLM 2', 'No preference', 'Med-PaLM']].agg(
    lambda *args: bootstrap_metric(
        *args, return_string=True, n_resamples=N_RESAMPLES
    )
)
# Same reshape as for the point estimates, so that both tables line up: sources
# move into the index and bias dimensions into the columns.
pairwise_result_boot = pairwise_result_boot.stack().unstack(level=-2)[
    bias_columns_pairwise
]

In [None]:
# Combine and display the results.
# Each cell becomes '<rate> <confidence interval>'.
pairwise_results_combined = combine_point_estimates_and_cis(
    pairwise_result_mean, pairwise_result_boot
)


# A bare expression on the last line makes the notebook render the table.
pairwise_results_combined

## Counterfactual Rubric: reproduce Figure 4 results

Here, we compute the rates at which counterfactual pairs were reported to
contain bias under the counterfactual rubric (corresponding to the
"Counterfactual rubric" row of Figure 4). Then, we use the ratings under the
independent rubric to compute the rates at which one, one or more, or both
answers were reported as containing bias (corresponding to the subsequent rows).

In [None]:
# Ratings collected under the counterfactual rubric.
counterfactual_df = data_dict['counterfactual']

In [None]:
# Modify column names to match those used for the independent rubric.
# rename() returns a new frame, so the table stored in data_dict keeps its
# original column names.
counterfactual_df = counterfactual_df.rename(
    columns={
        'bias_presence': 'bias_presence_binary',
        'omits_systemic_explanations': 'omits_systemic_explanation',
    }
)

In [None]:
# Define relevant column names.
# This rebinds `bias_columns` to the same values used in the independent
# section, so this part of the notebook reads on its own.
bias_columns = [
    'bias_presence_binary',
    'inaccuracy_for_some_axes',
    'not_inclusive_for_some_axes',
    'stereotypical_language_characterization',
    'omits_systemic_explanation',
    'failure_to_challenge',
    'potential_for_disproportionate',
    'other_bias',
]

In [None]:
# Point estimates under the counterfactual rubric: the mean of each rubric
# column within every (dataset, rater_type) group.
_counterfactual_groups = counterfactual_df.groupby(['dataset', 'rater_type'])
counterfactual_results_mean = _counterfactual_groups[bias_columns].mean()

In [None]:
# Confidence intervals under the counterfactual rubric: bootstrap each rubric
# column within every (dataset, rater_type) group, as formatted strings.
counterfactual_results_boot_string = (
    counterfactual_df.groupby(['dataset', 'rater_type'])[bias_columns]
    .agg(
        lambda ratings: bootstrap_metric(
            ratings, n_resamples=N_RESAMPLES, return_string=True
        )
    )
)

In [None]:
# Combine the dataframes.
# Each cell becomes '<rate> <confidence interval>'.
counterfactual_results_combined = combine_point_estimates_and_cis(
    counterfactual_results_mean, counterfactual_results_boot_string
)

In [None]:
# Map pairs of ratings from the independent rubric using the pairs defined in the counterfactual ratings.

# Datasets whose questions come as counterfactual pairs.
counterfactual_datasets = [
    'Counterfactual Context - Manual (CC-Manual)',
    'Counterfactual Context - LLM (CC-LLM)',
]

# Get the ratings for question_1_id.
# The left frame holds one row per distinct (question_1_id, question_2_id,
# rater_type) combination. The inner join keeps only pairs whose first
# question has independent ratings.
# NOTE(review): the join keys do not include rater_id, so if several raters of
# the same rater_type rated a question, every one of their ratings is paired
# with the question -- confirm this is intended.
counterfactual_merged_1 = pd.merge(
    counterfactual_df[
        ['question_1_id', 'question_2_id', 'rater_type']
    ].drop_duplicates(),
    independent_df_concat.query('dataset in @counterfactual_datasets'),
    how='inner',
    left_on=['question_1_id', 'rater_type'],
    right_on=['question_id', 'rater_type'],
    suffixes=(None, '_1'),
)
# Rename columns for clarity: the rubric columns of question 1 get a `_1` suffix.
counterfactual_merged_1 = counterfactual_merged_1.rename(
    columns={col: f'{col}_1' for col in bias_columns}
)

# Get the ratings for question_2_id.
# This is a left join, so pairs whose second question has no independent rating
# are kept with missing values. Columns that exist on both sides (for example
# `dataset`) keep their name for question 1 and get a `_2` suffix for
# question 2.
counterfactual_merged_df = pd.merge(
    counterfactual_merged_1,
    independent_df_concat.query('dataset in @counterfactual_datasets'),
    how='left',
    left_on=['question_2_id', 'rater_type'],
    right_on=['question_id', 'rater_type'],
    suffixes=(None, '_2'),
)
# Rename columns for clarity: the rubric columns of question 2 get a `_2` suffix.
merged_df = counterfactual_merged_df.rename(
    columns={col: f'{col}_2' for col in bias_columns}
)

In [None]:
# Compute rates and CIs for the "one answer biased" statistics (second row of Figure 4).
counterfactual_df_one = merged_df.copy(deep=True)
for col in bias_columns:
  _first = counterfactual_df_one[f'{col}_1']
  _second = counterfactual_df_one[f'{col}_2']
  # Exactly one of the two answers was rated as biased. `.eq(True)` yields a
  # plain boolean Series whatever the column dtype, so the XOR is well defined.
  # The previous `(a + b) == True` form depended on the dtype: adding two
  # bool-dtype Series is a logical OR, which also counts pairs where both
  # answers are biased.
  _exactly_one = _first.eq(True) ^ _second.eq(True)
  # As before, a pair with a missing rating on either side counts as False.
  counterfactual_df_one[col] = (
      _exactly_one & _first.notna() & _second.notna()
  )

# Point estimates per (dataset, rater_type) group.
counterfactual_results_one = counterfactual_df_one.groupby(
    ['dataset', 'rater_type'], observed=True
)[bias_columns].agg('mean')

# Bootstrap confidence intervals for the same groups, as formatted strings.
counterfactual_results_one_boot_string = counterfactual_df_one.groupby(
    ['dataset', 'rater_type'], observed=True
)[bias_columns].agg(
    lambda *args: bootstrap_metric(
        *args, return_string=True, n_resamples=N_RESAMPLES
    )
)
counterfactual_results_one_combined = combine_point_estimates_and_cis(
    counterfactual_results_one, counterfactual_results_one_boot_string
)

In [None]:
# Compute rates and CIs for the "one or more answers biased" statistics (third row of Figure 4).
counterfactual_df_one_or_more = merged_df.copy(deep=True)
for col in bias_columns:
  # Element-wise OR of the ratings for the two questions of the pair.
  counterfactual_df_one_or_more[col] = (
      counterfactual_df_one_or_more[f'{col}_1']
      | counterfactual_df_one_or_more[f'{col}_2']
  )

# Point estimates per (dataset, rater_type) group.
counterfactual_results_one_or_more = counterfactual_df_one_or_more.groupby(
    ['dataset', 'rater_type'], observed=True
)[bias_columns].agg('mean')
# Bootstrap confidence intervals for the same groups, as formatted strings.
counterfactual_results_one_or_more_boot_string = (
    counterfactual_df_one_or_more.groupby(
        ['dataset', 'rater_type'], observed=True
    )[bias_columns].agg(
        lambda *args: bootstrap_metric(
            *args, return_string=True, n_resamples=N_RESAMPLES
        )
    )
)
counterfactual_results_one_or_more_combined = combine_point_estimates_and_cis(
    counterfactual_results_one_or_more,
    counterfactual_results_one_or_more_boot_string,
)

In [None]:
# Compute rates and CIs for the "both answers biased" statistics (fourth row of Figure 4).
counterfactual_df_both = merged_df.copy(deep=True)
for col in bias_columns:
  # Both answers of the pair were rated as biased: a logical AND. `.eq(True)`
  # yields a plain boolean Series whatever the column dtype, and a missing
  # rating compares as False. The previous expression was a copy of the
  # "one answer biased" computation and never tested for both.
  counterfactual_df_both[col] = counterfactual_df_both[f'{col}_1'].eq(
      True
  ) & counterfactual_df_both[f'{col}_2'].eq(True)

# Point estimates per (dataset, rater_type) group.
counterfactual_results_both = counterfactual_df_both.groupby(
    ['dataset', 'rater_type'], observed=True
)[bias_columns].agg('mean')
# Bootstrap confidence intervals for the same groups, as formatted strings.
counterfactual_results_both_boot_string = counterfactual_df_both.groupby(
    ['dataset', 'rater_type'], observed=True
)[bias_columns].agg(
    lambda *args: bootstrap_metric(
        *args, return_string=True, n_resamples=N_RESAMPLES
    )
)
counterfactual_results_both_combined = combine_point_estimates_and_cis(
    counterfactual_results_both, counterfactual_results_both_boot_string
)

In [None]:
# Get the independent results from the earlier part of this notebook to create the final row of Figure 4
# reset_index() turns the (dataset, rater_type) index levels into columns so
# that query() can select the rows of the two counterfactual datasets.
counterfactual_results_independent_df = (
    independent_results_combined.reset_index().query(
        'dataset in @counterfactual_datasets'
    )
)

In [None]:
# Concatenate each of the counterfactual dataframes together for ease of viewing.
# Every part is flattened so that dataset and rater_type are ordinary columns,
# and is tagged with the Figure 4 row ("condition") it belongs to.
counterfactual_concat_df = pd.concat([
    counterfactual_results_combined.reset_index().assign(
        condition='Counterfactual rubric'
    ),
    counterfactual_results_one_combined.reset_index().assign(
        condition='One answer biased'
    ),
    counterfactual_results_one_or_more_combined.reset_index().assign(
        condition='One or more answer biased'
    ),
    counterfactual_results_both_combined.reset_index().assign(
        condition='Both answers biased'
    ),
    # This frame already had its index reset when it was built, so what is left
    # is a meaningless positional index. drop=True discards it; a plain
    # reset_index() would add a spurious 'index' column to the final table.
    counterfactual_results_independent_df.reset_index(drop=True).assign(
        condition='Independent evaluation'
    ),
])

# Set index viewing order
counterfactual_concat_df['rater_type'] = pd.Categorical(
    counterfactual_concat_df['rater_type'], rater_type_order
)
counterfactual_concat_df['dataset'] = pd.Categorical(
    counterfactual_concat_df['dataset'], counterfactual_datasets
)
counterfactual_concat_df['condition'] = pd.Categorical(
    counterfactual_concat_df['condition'],
    [
        'Counterfactual rubric',
        'One answer biased',
        'One or more answer biased',
        'Both answers biased',
        'Independent evaluation',
    ],
)
# Sorting on the categorical index levels orders rows by condition, then
# dataset, then rater type, following the category orders given above.
counterfactual_concat_df = counterfactual_concat_df.set_index(
    ['condition', 'dataset', 'rater_type']
).sort_index()
# A bare expression on the last line makes the notebook render the table.
counterfactual_concat_df