## Import statements

In [3]:
import pandas as pd
import numpy as np
import sklearn.metrics as skm

## Function definitions

### locate_thresholds(df)
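Locate, for each epsilon value, the decision threshold that yields the best accuracy. Returns a dictionary that maps each epsilon value to its best threshold.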

In [4]:
def _best_threshold(true_all, pred_all, candidates):
  """Score every candidate threshold and return the best one.

  Args:
    true_all: 1-D array of ground-truth labels.
    pred_all: 1-D float array of raw predictions.
    candidates: 1-D array of thresholds to try.

  Returns:
    Tuple `(index, accuracy)` for the first candidate that reaches the
    highest accuracy.
  """
  accuracies = [
    skm.accuracy_score(true_all, (pred_all >= threshold).astype(int))
    for threshold in candidates
  ]
  best_accuracy = max(accuracies)
  # list.index returns the first match, so ties go to the lowest threshold.
  return accuracies.index(best_accuracy), best_accuracy


def locate_thresholds(df):
  """Find, for each epsilon value, the threshold with the best accuracy.

  The search runs in three passes of decreasing step size: 0.1 .. 1.0 in
  steps of 0.1, then steps of 0.01 and finally steps of 0.001. Each
  refinement pass starts at the previous winner and only looks upward from
  it. One summary line is printed per epsilon value.

  Args:
    df: DataFrame with the columns `eps`, `true` (labels) and `pred`
      (raw predictions).

  Returns:
    Dict that maps each unique value of `df.eps` to its best threshold.
  """
  eps_values = df.eps.unique()
  thresholds = []

  for epsilon in eps_values:
    df_eps = df[df.eps == epsilon]
    true_all = df_eps.true.to_numpy()
    pred_all = df_eps.pred.to_numpy(dtype='float64')

    # Coarse pass first, then zoom in twice on the interval just above the
    # current winner.
    candidates = np.linspace(0.1, 1, 10)
    for step in (0.1, 0.01):
      best_index, _ = _best_threshold(true_all, pred_all, candidates)
      start = candidates[best_index]
      candidates = np.linspace(start, start + step, 11)

    best_index, best_accuracy = _best_threshold(true_all, pred_all, candidates)
    best_threshold = candidates[best_index]

    thresholds.append(best_threshold)
    print('eps: {:.3f}, best threshold: {:.3f}, accuracy: {:.3f}'.format(epsilon, best_threshold, best_accuracy))

  return dict(zip(eps_values, thresholds))

### expand_dataframe(df, eps_th_dict)
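Add five new columns:
1. `pred_binary`: binary prediction (0 or 1), obtained by comparing the raw prediction (a probability) with the optimal threshold for the row's epsilon value
2. `tn`: 1 if the row is a true negative, otherwise 0
3. `fp`: 1 if the row is a false positive, otherwise 0
4. `fn`: 1 if the row is a false negative, otherwise 0
5. `tp`: 1 if the row is a true positive, otherwise 0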

rename two columns for consistency:
1. `race_white` to `race`
2. `ethni` to `eth`

In [1]:
def expand_dataframe(df, eps_th_dict):
  """Add binary-prediction and confusion-matrix indicator columns to `df`.

  The frame is modified in place and also returned. New columns (all 0/1):
  `pred_binary` (1 when `pred` >= the threshold of the row's `eps`), `tn`,
  `fp`, `fn` and `tp`. The columns `race_white` and `ethni` are renamed to
  `race` and `eth` when present.

  Args:
    df: DataFrame with the columns `eps`, `pred` and `true`.
    eps_th_dict: Dict that maps every value of `df.eps` to a threshold.

  Returns:
    The same DataFrame object that was passed in.

  Raises:
    KeyError: If a value of `df.eps` has no entry in `eps_th_dict`.
  """
  # A plain dict lookup would raise for an unknown epsilon, whereas
  # Series.map would silently produce NaN; check explicitly to keep the error.
  unknown = ~df['eps'].isin(list(eps_th_dict))
  if unknown.any():
    raise KeyError(df['eps'].to_numpy()[unknown.to_numpy()][0])

  # Column-wise comparisons replace the row-by-row apply calls; they are much
  # faster and also work on an empty frame.
  thresholds = df['eps'].map(eps_th_dict)
  predicted_pos = df['pred'].ge(thresholds)
  actual_pos = df['true'] == 1
  actual_neg = df['true'] == 0

  df['pred_binary'] = predicted_pos.astype('int64')

  df['tn'] = (actual_neg & ~predicted_pos).astype('int64')
  df['fp'] = (actual_neg & predicted_pos).astype('int64')
  df['fn'] = (actual_pos & ~predicted_pos).astype('int64')
  df['tp'] = (actual_pos & predicted_pos).astype('int64')

  df.rename(columns={'race_white': 'race', 'ethni': 'eth'}, inplace=True)

  return df

### create_results_csv(df)
store cumulative results in a small csv file

The small CSV file contains one row for each epsilon value. Each row contains the AUC averaged over runs, plus the mean and variance (over runs) of three Disparate Impact values (Approval DI, False Negative DI, False Positive DI), each computed separately for the features `race`, `eth` and `sex`.

In [None]:
def _disparate_impact(df_run, metric, feature):
  """Return the disparate impact of a 0/1 `metric` column for one feature.

  Computed as the share of rows with `metric == 1` among rows where
  `feature == 0`, divided by the same share among rows where `feature == 1`.

  NOTE(review): the group totals are taken from the feature column, as the
  `proc`/`unproc` totals of a rate require. Presumably feature == 0 marks the
  protected group -- confirm the coding of the feature columns.

  Args:
    df_run: DataFrame slice for one (eps, run) combination.
    metric: Name of a 0/1 indicator column.
    feature: Name of a 0/1 group-membership column.

  Returns:
    The ratio as a float; NaN or inf when a group is empty or has no
    positives.
  """
  flagged = df_run[metric] == 1
  group_0 = df_run[feature] == 0
  group_1 = df_run[feature] == 1

  # Empty groups make 0/0 or x/0 an expected outcome here, so silence the
  # floating-point warnings for these divisions only.
  with np.errstate(divide='ignore', invalid='ignore'):
    rate_0 = np.divide((flagged & group_0).sum(), group_0.sum())
    rate_1 = np.divide((flagged & group_1).sum(), group_1.sum())
    return np.divide(rate_0, rate_1)


def create_results_csv(df, outfile=None):
  """Aggregate per-run metrics for each epsilon value and save them as CSV.

  The written table has one row per unique `eps` and these columns, in order:
  `mean_di_<metric>_<feature>` and then `var_di_<metric>_<feature>` for the
  metrics approval/fn/fp and the features race/eth/sex (mean and variance
  over runs, ignoring NaN), followed by `auc` (mean ROC AUC over runs) and
  `eps`.

  Args:
    df: DataFrame with the columns `eps`, `run`, `true`, `pred`,
      `pred_binary`, `fn`, `fp`, `race`, `eth` and `sex`.
    outfile: Path of the CSV file to write. When omitted, the name is built
      from the module-level variable `filename` as 'results_<filename>.csv'.
  """
  eps_arr = df.eps.unique()
  runs_arr = df.run.unique()

  # Slice the frame once per (eps, run) and reuse the slices for every metric.
  runs_by_eps = []
  for ith_eps in eps_arr:
    df_ith_eps = df[df.eps == ith_eps]
    runs_by_eps.append([df_ith_eps[df_ith_eps.run == jth_run] for jth_run in runs_arr])

  # auc column
  auc_over_eps = [
    np.nanmean([
      skm.roc_auc_score(df_run.true.to_numpy(), df_run.pred.to_numpy())
      for df_run in df_runs
    ])
    for df_runs in runs_by_eps
  ]

  # di columns: (label used in the column name, DataFrame column holding it)
  metrics = [('approval', 'pred_binary'), ('fn', 'fn'), ('fp', 'fp')]

  mean_columns = {}
  var_columns = {}

  for label, metric in metrics:
    for feature in ['race', 'eth', 'sex']:
      di_over_eps = [
        [_disparate_impact(df_run, metric, feature) for df_run in df_runs]
        for df_runs in runs_by_eps
      ]
      # nanmean/nanvar skip NaN runs; an inf run still propagates.
      mean_columns['mean_di_{}_{}'.format(label, feature)] = [np.nanmean(di) for di in di_over_eps]
      var_columns['var_di_{}_{}'.format(label, feature)] = [np.nanvar(di) for di in di_over_eps]

  # combine columns into a full dictionary (insertion order = column order)
  results = {}
  results.update(mean_columns)      # mean columns
  results.update(var_columns)       # variance columns
  results['auc'] = auc_over_eps     # auc column
  results['eps'] = eps_arr          # eps column

  # transform dictionary into dataframe
  results_df = pd.DataFrame(results)

  # save results to csv
  if outfile is None:
    # Keep the original naming scheme, which reads the global `filename`.
    outfile = 'results_{}.csv'.format(filename)
  results_df.to_csv(outfile)

## Generate output files

In [None]:
# Input CSV. The functions above read the columns `eps`, `run`, `true`,
# `pred`, `race_white`, `ethni` and `sex` from it.
# NOTE: create_results_csv() reads this module-level `filename` to build the
# name of its output file, so the variable must keep this name.
filename = 'nn_4states_balanced_action_cv_output.csv'
df = pd.read_csv(filename)
  
# Pick the accuracy-maximizing threshold for each epsilon value.
eps_threshold_dict = locate_thresholds(df)
# Adds the indicator columns and renames two columns; `df` is modified in
# place, so `expanded_df` refers to the same object as `df`.
expanded_df = expand_dataframe(df, eps_threshold_dict)
# Writes 'results_<filename>.csv' relative to the current working directory.
create_results_csv(expanded_df)