In this notebook we apply the PROMIS Approximation method to the CRIME dataset, using a total of 8 audit regions generated with KMeans.

Through visualizations, we show the initial spatial bias and the results of the mitigation process, in which the PROMIS approach adjusts the decision boundaries.

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join("..")))

from methods.models.optimization_model import SpatialOptimFairnessModel
from utils.data_utils import read_scanned_regs, get_y, get_pos_info_regions
from utils.results_names_utils import combine_world_info, get_train_val_test_paths
import pandas as pd
from utils.scores import get_mlr, get_fair_stat_ratios
from utils.stats_utils import get_signif_threshold
from utils.plot_utils import plot_fairness_map, plot_thresholds_adjustments
import numpy as np
from utils.geo_utils import compute_polygons, filterbbox 

## Read Data

In [2]:
# Experiment configuration: these values select which stored regions,
# predictions and labels are read from disk.
base_path = "../data/"
clf_name = "xgb"
dataset_name = "crime"
partioning_type_name = "non_overlap_k_8"
fairness_notion = "equal_opportunity"

results = {}
res_desc_label, partioning_name, prediction_name = combine_world_info(
    dataset_name, partioning_type_name, clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, dataset_name
)


def _load_split(path_info):
    """Load one data split described by `path_info`.

    Reads the files stored under the "regions", "predictions" and "labels"
    keys and returns, in order: the regions frame, the predictions frame,
    the labels frame, the "pred" / "prob" / "label" values extracted with
    `get_y`, and the regions' "points" column as a plain list.
    """
    regions_df = read_scanned_regs(path_info["regions"])
    pred_df = pd.read_csv(path_info["predictions"])
    labels_df = pd.read_csv(path_info["labels"])
    y_pred = get_y(pred_df, "pred")
    y_pred_probs = get_y(pred_df, "prob")
    y_true = get_y(labels_df, "label")
    return (
        regions_df,
        pred_df,
        labels_df,
        y_pred,
        y_pred_probs,
        y_true,
        regions_df["points"].tolist(),
    )


# Validation split: used further down to fit the mitigation model.
(
    val_regions_df,
    val_pred_df,
    val_labels_df,
    y_pred_val,
    y_pred_probs_val,
    y_true_val,
    val_points_per_region,
) = _load_split(val_path_info)

# Test split: used for the fairness audit and the plots.
(
    test_regions_df,
    test_pred_df,
    test_labels_df,
    y_pred_test,
    y_pred_probs_test,
    y_true_test,
    test_points_per_region,
) = _load_split(test_path_info)

In [3]:
# Equal opportunity is audited on the instances with a positive true label,
# so keep only those instances and the matching per-region point lists.
pos_y_true_indices_test, pos_points_per_region_test = get_pos_info_regions(
    y_true_test, test_points_per_region
)
y_pred_pos_test = y_pred_test[pos_y_true_indices_test]

# Copy of the test regions whose 'points' column holds the positive-only lists.
pos_test_regions_df = test_regions_df.assign(points=pos_points_per_region_test)

In [None]:
# Compute the initial (pre-mitigation) statistics on the test instances that
# have a positive true label.

N = len(y_pred_pos_test)
P = np.sum(y_pred_pos_test)
print(f'N={N} points')
# Positive class (1) = 'serious crimes', negative class (0) = 'non-serious'
# crimes, as predicted by the XGBoost classifier (clf_name = "xgb").
print(f'P={P} positives')

# Per-region statistics and their maximum (MLR) for the original predictions.
mlr_test, stats_test = get_mlr(y_pred_pos_test, pos_points_per_region_test, True)
test_regions_df['stat'] = stats_test

# NOTE(review): 0.005 and 400 look like the significance level and the number
# of simulated worlds used to derive the threshold -- confirm against
# get_signif_threshold.
signif_thresh_test = get_signif_threshold(
    0.005,
    400,
    [{"points": pts} for pts in pos_points_per_region_test],
    N,
    P,
    seed=42,
)
# Indices of the regions whose statistic reaches the significance threshold.
signif_regs_indices_test = [
    i for i, stat in enumerate(stats_test) if stat >= signif_thresh_test
]
print(f"Test MLR (Equal Opportunity): {mlr_test:.3f}")
print(f"Total Significant Regions: {len(signif_regs_indices_test)}")

In [5]:
# Bounding box (longitude / latitude) used for display only, so that outlying
# points are not plotted.
bbox_min_lon, bbox_min_lat = -118.6673, 33.707
bbox_max_lon, bbox_max_lat = -118.16, 34.3374

In [6]:
# Per-region mean prediction among the positive-label instances, and the
# normalized statistic ("fair_stat_ratio") derived from it.

# Rebuilt from test_regions_df so that the 'stat' column read below is present.
pos_test_regions_df = test_regions_df.copy()
pos_test_regions_df['points'] = pos_points_per_region_test


def _region_pos_rate(pts):
    """Return the mean prediction over the points `pts`, or 0 for an empty region."""
    if len(pts) > 0:
        return sum(y_pred_pos_test[pts]) / len(pts)
    return 0


pos_test_regions_df['pos_pr'] = pos_test_regions_df['points'].apply(_region_pos_rate)

# Overall mean prediction over all positive-label test instances.
PR_test = sum(y_pred_pos_test) / len(y_pred_pos_test)

region_stats = pos_test_regions_df["stat"].to_numpy()
region_pos_rates = pos_test_regions_df["pos_pr"].to_numpy()
# max_stat_test is kept so the post-mitigation ratios can be computed with it.
pos_test_regions_df["fair_stat_ratio"], max_stat_test = get_fair_stat_ratios(
    region_stats, region_pos_rates, PR_test
)

In [7]:
# Restrict the positive-label instances to the display bounding box and
# renumber every region's points so that they index the filtered frame.
is_positive_row = test_pred_df.index.isin(pos_y_true_indices_test)
sub_df = test_pred_df[is_positive_row].reset_index(drop=True)
sub_df = filterbbox(sub_df, bbox_min_lon, bbox_min_lat, bbox_max_lon, bbox_max_lat)

# Row labels that survived the filter, and their position in the filtered
# frame (assumes filterbbox keeps the row labels of its input -- confirm).
set_new_pts = set(sub_df.index.tolist())
old_2_new_idx = {ind: i for i, ind in enumerate(sub_df.index)}
sub_df = sub_df.reset_index(drop=True)

# Drop the points outside the box and translate the remaining ones.
sub_test_pos_regions_df = pos_test_regions_df.copy()
sub_test_pos_regions_df['points'] = sub_test_pos_regions_df['points'].apply(
    lambda pts: [old_2_new_idx[p] for p in set(pts) & set_new_pts]
)
sub_test_pos_regions_df = compute_polygons(sub_test_pos_regions_df, sub_df)

y_pred_test_sub = get_y(sub_df, "pred")

In [None]:
# Show the normalized LR ("fair_stat_ratio") of each region for the original
# predictions.
plot_fairness_map(
    score_label="fair_stat_ratio",
    title="XGBoost Predictions - Normalized LR",
    regs_df_list=[sub_test_pos_regions_df],
)

In [None]:
# Before mitigation every region uses the same 0.5 classification threshold;
# plot those thresholds together with each region's normalized LR.
figsize = (12, 6)
uniform_thresholds = [0.5 for _ in test_points_per_region]
normalized_lr = sub_test_pos_regions_df['fair_stat_ratio'].to_numpy()
plot_thresholds_adjustments(
    thresholds=uniform_thresholds,
    region_sizes=normalized_lr,
    title="Classification Threshold per Region of the XGBoost Model",
    display_title=True,
    figsize=figsize,
)

In [None]:
# Fit the PROMIS Approximation mitigation model (equal opportunity) on the
# validation split.
budget = 5000
no_of_threads = 0
fair_model = SpatialOptimFairnessModel("promis_app")
fair_model.multi_fit(
    # validation data
    y_true=y_true_val,
    y_pred=y_pred_val,
    y_pred_probs=y_pred_probs_val,
    points_per_region=val_points_per_region,
    # flip budget: start and end coincide, so only `budget` is requested
    n_flips_start=budget,
    n_flips=budget,
    step=10,
    # fairness notion and solver settings
    fair_notion=fairness_notion,
    overlap=True,
    wlimit=300,
    no_of_threads=no_of_threads,
    verbose=1,
)

In [11]:
# Per-region thresholds of the solution found for the chosen budget.
solution_info = fair_model.budget_to_solution_info[budget]
thresholds = solution_info["new_regions_thresh"]

# Predictions obtained with the 0.5 cut-off on the test probabilities.
test_orig_preds = y_pred_probs_test > 0.5

# multi_predict is queried with a list of budgets and its result is indexed
# by budget; keep the predictions for the one requested here.
preds_by_budget = fair_model.multi_predict(
    test_points_per_region, y_pred_probs_test, [budget]
)
test_new_preds = preds_by_budget[budget]

In [None]:
# Recompute the statistics on the positive-label test instances, this time
# with the mitigated predictions.
test_new_preds_pos = test_new_preds[pos_y_true_indices_test]
N_test_new = len(test_new_preds_pos)
P_test_new = np.sum(test_new_preds_pos)
print(f'N={N_test_new} points')
# Positive class (1) = 'serious crimes', negative class (0) = 'non-serious' crimes.
print(f'P={P_test_new} positives')

mlr_test, stats_test = get_mlr(test_new_preds_pos, pos_points_per_region_test, True)
pos_test_regions_df['new_stat'] = stats_test

# Same threshold settings as for the original predictions (0.005, 400, seed 42).
regions_for_threshold = [
    {"points": region_pts} for region_pts in pos_points_per_region_test
]
signif_thresh_test = get_signif_threshold(
    0.005, 400, regions_for_threshold, N_test_new, P_test_new, seed=42
)
signif_regs_indices_test = [
    reg_idx
    for reg_idx, reg_stat in enumerate(stats_test)
    if reg_stat >= signif_thresh_test
]
print(f"Test MLR (Equal Opportunity): {mlr_test:.3f}")
print(f"Total Significant Regions: {len(signif_regs_indices_test)}")

In [13]:
# Compute the normalized LR after mitigation and copy the new columns to the
# frame restricted to the display bounding box.


def _new_region_pos_rate(pts):
    """Return the mean mitigated prediction over `pts`, or 0 for an empty region.

    The empty-region guard mirrors the pre-mitigation 'pos_pr' computation;
    without it a region with no positive-label points raises ZeroDivisionError.
    """
    if len(pts) > 0:
        return sum(test_new_preds_pos[pts]) / len(pts)
    return 0


pos_test_regions_df['new_pos_pr'] = pos_test_regions_df['points'].apply(_new_region_pos_rate)
PR_test_new = sum(test_new_preds_pos) / len(test_new_preds_pos)

# max_stat_test comes from the pre-mitigation call; presumably it keeps both
# sets of ratios on the same scale -- confirm in get_fair_stat_ratios.
pos_test_regions_df["new_fair_stat_ratio"], _ = get_fair_stat_ratios(
    pos_test_regions_df["new_stat"].to_numpy(),
    pos_test_regions_df["new_pos_pr"].to_numpy(),
    PR_test_new,
    max_stat_test
)

# Column assignment from another frame's Series aligns on the row index.
sub_test_pos_regions_df['new_fair_stat_ratio'] = pos_test_regions_df["new_fair_stat_ratio"]
sub_test_pos_regions_df['new_stat'] = pos_test_regions_df["new_stat"]
sub_test_pos_regions_df['new_pos_pr'] = pos_test_regions_df["new_pos_pr"]

In [None]:
# Per-region positive rate and normalized LR, before and after mitigation.
comparison_columns = ["pos_pr", "new_pos_pr", "fair_stat_ratio", "new_fair_stat_ratio"]
sub_test_pos_regions_df[comparison_columns]

In [None]:
# Plot the adjusted per-region thresholds together with each region's
# normalized LR after mitigation.
figsize = (12, 6)
new_normalized_lr = sub_test_pos_regions_df['new_fair_stat_ratio'].to_numpy()
plot_thresholds_adjustments(
    thresholds=thresholds,
    region_sizes=new_normalized_lr,
    title="Classification Thresholds per Region After Mitigation",
    display_title=True,
    figsize=figsize,
)


In [None]:
# Show the normalized LR ("new_fair_stat_ratio") of each region for the
# mitigated predictions.
plot_fairness_map(
    score_label="new_fair_stat_ratio",
    title="Mitigated Predictions - Normalized LR",
    regs_df_list=[sub_test_pos_regions_df],
)