In this notebook we show the application of PROMIS Approximation to an unfair-by-design, semi-synthetic dataset, where the 8 audit regions are generated with KMeans.

We show, through visualizations, the initial spatial bias and the results of the mitigation process, which uses the PROMIS approach to apply direct flips.

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so that the
# project-local `methods` and `utils` packages imported below can be resolved.
sys.path.append(os.path.abspath(os.path.join("..")))

from methods.models.optimization_model import SpatialOptimFairnessModel
from utils.data_utils import read_scanned_regs, get_y
from utils.results_names_utils import combine_world_info, get_train_val_test_paths
import pandas as pd
from utils.scores import get_mlr, get_fair_stat_ratios
from utils.stats_utils import  get_signif_threshold
from utils.plot_utils import plot_map_with_polygons
from utils.plot_utils import plot_fairness_map
import numpy as np
from utils.geo_utils import compute_polygons, filterbbox

import matplotlib
# Font type 42 makes matplotlib embed TrueType fonts in saved PDF/PS figures
# (instead of the default Type 3 fonts), which keeps text editable in the output.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
# Load the test-set audit regions together with two prediction sets:
# the unfair-by-design predictions and the fair-by-design predictions.
base_path = "../data/"
clf_name = "semi_synthetic_regions_non_overlap_k_8"
dataset_name = "crime"
partioning_type_name = "non_overlap_k_8"
fairness_notion = "statistical_parity"

results = {}

# Resolve the naming scheme and the file locations for this configuration.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    dataset_name,
    partioning_type_name,
    clf_name,
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path,
    partioning_name,
    prediction_name,
    dataset_name,
)

# Audit regions; each row carries the list of point indices it contains.
test_regions_df = read_scanned_regs(test_path_info["regions"])
test_points_per_region = test_regions_df["points"].tolist()

# Unfair-by-design predictions.
test_pred_df = pd.read_csv(test_path_info["predictions"])
y_pred_test = get_y(test_pred_df, "pred")

# Fair-by-design predictions.
fair_pred_path = f"{base_path}predictions/test_fair_pred_semi_synthetic_{partioning_name}.csv"
test_fair_pred_df = pd.read_csv(fair_pred_path)
y_pred_fair_test = get_y(test_fair_pred_df, "pred")

In [None]:
# Global statistics of the fair-by-design predictions
# (generated from a binomial with positive rate 0.8).
init_N = len(y_pred_fair_test)
init_P = np.sum(y_pred_fair_test)
init_PR = init_P / init_N
# Positive class (1) = 'serious crimes', negative class (0) = 'non-serious' crimes.
print(f'N={init_N} points')
print(f'P={init_P} positives')
print(f'PR={init_PR:.3f}')
test_pred_df.head()

# get_mlr returns an overall MLR score plus one statistic per region;
# the per-region values are stored on the regions frame for later plotting.
init_mlr_test, init_stats_test = get_mlr(
    y_pred_fair_test,
    test_points_per_region,
    True,
)
test_regions_df['init_stat'] = init_stats_test

# Regions whose statistic reaches the significance threshold.
# NOTE(review): 0.005 and 400 are presumably the significance level and the
# number of simulated worlds - confirm against get_signif_threshold.
signif_thresh_test_init = get_signif_threshold(
    0.005,
    400,
    [{"points": region_pts} for region_pts in test_points_per_region],
    init_N,
    init_P,
    seed=42,
)
signif_regs_indices_test_init = [
    reg_idx
    for reg_idx, reg_stat in enumerate(init_stats_test)
    if reg_stat >= signif_thresh_test_init
]
print(f"Test MLR (Equal Opportunity): {init_mlr_test:.3f}")
print(f"Total Significant Regions: {len(signif_regs_indices_test_init)}")

In [None]:
# Global statistics of the unfair-by-design predictions.
N = len(y_pred_test)
P = np.sum(y_pred_test)
# Positive rate of these predictions; it must be derived from N and P of this
# model, not from the fair model's init_N / init_P.
PR = P / N
# Positive class (1) = 'serious crimes', negative class (0) = 'non-serious' crimes.
print(f'N={N} points')
print(f'P={P} positives')
print(f'PR={PR:.3f}')
test_pred_df.head()

# Overall MLR score plus one statistic per region; the per-region values are
# stored on the regions frame for later plotting.
mlr_test, stats_test = get_mlr(y_pred_test, test_points_per_region, True)
test_regions_df['stat'] = stats_test

# Regions whose statistic reaches the significance threshold.
signif_thresh_test = get_signif_threshold(
    0.005,
    400,
    [{"points": pts} for pts in test_points_per_region],
    N,
    P,
    seed=42,
)
signif_regs_indices_test = [i for i, stat in enumerate(stats_test) if stat >= signif_thresh_test]
# NOTE(review): the label below says "Equal Opportunity" while `fairness_notion`
# is set to "statistical_parity" - confirm which one is intended.
print(f"Test MLR (Equal Opportunity): {mlr_test:.3f}")
print(f"Total Significant Regions: {len(signif_regs_indices_test)}")

In [5]:
# Display bounding box (longitude/latitude); points outside it are treated as
# outliers and left out of the plots.
bbox_min_lon, bbox_min_lat = -118.6673, 33.707
bbox_max_lon, bbox_max_lat = -118.16, 34.3374

In [6]:
# Per-region and global positive rate of the unfair-by-design predictions,
# followed by the per-region fairness ratio derived from them.
def _unfair_region_pr(pts):
    # Empty regions get a positive rate of 0 rather than a division by zero.
    return sum(y_pred_test[pts]) / len(pts) if len(pts) > 0 else 0


test_regions_df['pr'] = test_regions_df['points'].apply(_unfair_region_pr)
PR_test = sum(y_pred_test) / len(y_pred_test)

# The second return value is kept so the other prediction sets can be
# normalized with the same maximum statistic.
unfair_ratios, max_stat_test = get_fair_stat_ratios(
    test_regions_df["stat"].to_numpy(),
    test_regions_df["pr"].to_numpy(),
    PR_test,
)
test_regions_df["fair_stat_ratio"] = unfair_ratios

In [7]:
# Per-region and global positive rate of the fair-by-design predictions,
# followed by the per-region fairness ratio derived from them.
def _fair_region_pr(pts):
    # Empty regions get a positive rate of 0 rather than a division by zero.
    return sum(y_pred_fair_test[pts]) / len(pts) if len(pts) > 0 else 0


test_regions_df['init_pr'] = test_regions_df['points'].apply(_fair_region_pr)
PR_test_init = sum(y_pred_fair_test) / len(y_pred_fair_test)

# Reuse max_stat_test from the unfair predictions so both ratio columns share
# the same normalization.
init_ratios, _ = get_fair_stat_ratios(
    test_regions_df["init_stat"].to_numpy(),
    test_regions_df["init_pr"].to_numpy(),
    PR_test_init,
    max_stat=max_stat_test,
)
test_regions_df["init_fair_stat_ratio"] = init_ratios

In [8]:
# Restrict the points and the regions to the display bounding box.
sub_df = filterbbox(
    test_pred_df.copy(),
    bbox_min_lon,
    bbox_min_lat,
    bbox_max_lon,
    bbox_max_lat,
)

# Row labels that survived the filter, and their position in the filtered frame.
set_new_pts = set(sub_df.index.tolist())
old_2_new_idx = {ind: i for i, ind in enumerate(sub_df.index)}

# Keep, per region, only the surviving points, expressed as positions in sub_df.
sub_test_regions_df = test_regions_df.copy()
sub_test_regions_df['points'] = sub_test_regions_df['points'].apply(
    lambda pts: [old_2_new_idx[p] for p in set(pts) & set_new_pts]
)

# After reset_index the original row labels live in the 'index' column.
sub_df = sub_df.reset_index()
sub_test_regions_df = compute_polygons(sub_test_regions_df, sub_df)

y_pred_test_sub = get_y(sub_df, "pred")

In [None]:
# Fairness map (normalized LR per region) of the fair-by-design predictions.
plot_fairness_map(
    score_label="init_fair_stat_ratio",
    title="Initial Fair by Design Predictions - Normalized LR",
    regs_df_list=[sub_test_regions_df],
)

In [10]:
# Flips that turned the fair-by-design predictions into the unfair-by-design ones:
# points whose label differs, split by their label in the fair predictions.
flipped_mask = y_pred_test != y_pred_fair_test
pos_to_neg_pts = np.where(flipped_mask & (y_pred_fair_test == 1))[0]
neg_to_pos_pts = np.where(flipped_mask & (y_pred_fair_test == 0))[0]

In [11]:
# Keep only the flips that fall inside the bounding box and translate them to row
# positions in sub_df. sub_df['index'] holds the original row labels (it was
# created by reset_index), so a one-off label -> position dictionary replaces the
# full-column scan that was previously done for every single flipped point.
old_label_to_sub_pos = {}
for sub_pos, old_label in zip(sub_df.index, sub_df['index'].to_numpy()):
    # setdefault keeps the first matching row, mirroring the former `.index[0]`.
    old_label_to_sub_pos.setdefault(old_label, sub_pos)

sub_pos_to_neg_pts = [
    old_label_to_sub_pos[pt] for pt in pos_to_neg_pts if pt in old_label_to_sub_pos
]
sub_neg_to_pos_pts = [
    old_label_to_sub_pos[pt] for pt in neg_to_pos_pts if pt in old_label_to_sub_pos
]

In [None]:
# Map of the flips that produced the unfair-by-design predictions while keeping
# the initial positive rate. The two index lists are passed alongside two colors
# (presumably paired positionally: positive->negative, then negative->positive).
plot_map_with_polygons(
    df=sub_df,
    regs_df_list=[sub_test_regions_df],
    regs_color_list=["#0000FF"],
    other_idxs=[sub_pos_to_neg_pts, sub_neg_to_pos_pts],
    other_colors=["orange", "green"],
    title="Flips to Produce Unfair by Design Predictions",
)

In [None]:
# Fairness map (normalized LR per region) of the unfair-by-design predictions.
plot_fairness_map(
    score_label="fair_stat_ratio",
    title="Unfair by Design Predictions - Normalized LR",
    regs_df_list=[sub_test_regions_df],
)

In [None]:
# Regions and points of the unfair-by-design predictions, restricted to the
# display bounding box.
plot_map_with_polygons(
    df=sub_df,
    y_pred=y_pred_test_sub,
    title="Regions - Points - Unfair by Design Predictions",
    regs_df_list=[sub_test_regions_df],
    regs_color_list=["#0000FF"],
)

In [None]:
# Flip budget: the same number of flips that was used to make the initially fair
# world unfair.
budget = 29610
no_of_threads = 0

# Fit the PROMIS Approximation model on the unfair-by-design predictions.
# n_flips_start and n_flips are both set to the budget.
fair_model = SpatialOptimFairnessModel("promis_app")
fair_model.multi_fit(
    # data
    points_per_region=test_points_per_region,
    y_pred=y_pred_test,
    # budget sweep
    n_flips_start=budget,
    n_flips=budget,
    step=10,
    # fairness / solver settings
    fair_notion=fairness_notion,
    overlap=True,
    init_threshold=None,
    max_pr_shift=0.1,
    wlimit=300,
    no_of_threads=no_of_threads,
    verbose=1,
)

In [16]:
# Extract the points to flip and their flip directions for the chosen budget.
solution_info = fair_model.budget_to_solution_info[budget]
pts_to_change_sol = solution_info["pts_to_change_sol"]
pts_to_change = solution_info["pts_to_change"]

# Keep only the points inside the bounding box, translated to row positions in
# sub_df. sub_df['index'] holds the original row labels (created by reset_index),
# so a one-off label -> position dictionary replaces the full-column scan that
# was previously done for every single point.
old_label_to_sub_pos = {}
for sub_pos, old_label in zip(sub_df.index, sub_df['index'].to_numpy()):
    # setdefault keeps the first matching row, mirroring the former `.index[0]`.
    old_label_to_sub_pos.setdefault(old_label, sub_pos)

sub_pts_to_change_sol_indices = []
sub_pts_to_change = []
for i, pt in enumerate(pts_to_change):
    if pt in old_label_to_sub_pos:
        sub_pts_to_change.append(old_label_to_sub_pos[pt])
        sub_pts_to_change_sol_indices.append(i)

# An explicit integer dtype keeps the fancy indexing below valid even when no
# flipped point falls inside the bounding box (an empty list would otherwise
# become a float array, which cannot be used as an index).
sub_pts_to_change_sol_indices = np.array(sub_pts_to_change_sol_indices, dtype=int)
sub_pts_to_change_sol = np.array(pts_to_change_sol)[sub_pts_to_change_sol_indices]

sub_pts_to_change = np.array(sub_pts_to_change, dtype=int)

# Split the in-box points by the value of their solution entry (-1 or 1).
pts_to_change_sol_neg = np.where(sub_pts_to_change_sol == -1)[0]
pts_to_change_sol_pos = np.where(sub_pts_to_change_sol == 1)[0]

pts_to_change_neg = sub_pts_to_change[pts_to_change_sol_neg]
pts_to_change_pos = sub_pts_to_change[pts_to_change_sol_pos]

In [None]:
# Map of the flips chosen to mitigate the spatial bias. The two index lists
# (solution entries -1 and 1) are passed alongside two colors, presumably paired
# positionally.
plot_map_with_polygons(
    df=sub_df,
    regs_df_list=[sub_test_regions_df],
    regs_color_list=["#0000FF"],
    other_idxs=[pts_to_change_neg, pts_to_change_pos],
    other_colors=["orange", "green"],
    title="Flips to Mitigate Spatial Bias",
)

In [18]:
# Apply the fitted flips to the unfair-by-design predictions. multi_predict's
# result is indexed by budget; only the entry for our budget is kept.
preds_by_budget = fair_model.multi_predict(
    test_points_per_region,
    y_pred_test,
    apply_fit_flips=True,
)
test_new_preds = preds_by_budget[budget]

In [None]:
# Global statistics of the mitigated predictions.
N_test_new = len(test_new_preds)
P_test_new = np.sum(test_new_preds)
PR_test_new = P_test_new / N_test_new
# Positive class (1) = 'serious crimes', negative class (0) = 'non-serious' crimes.
print(f'N={N_test_new} points')
print(f'P={P_test_new} positives')
print(f'PR={PR_test_new} positives rate')

# The mitigated results get their own names so that the unfair model's
# mlr_test / stats_test / signif_regs_indices_test and the fair model's
# signif_thresh_test_init are not silently overwritten. Overwriting them makes
# re-running earlier cells or comparing the three worlds error-prone.
new_mlr_test, new_stats_test = get_mlr(test_new_preds, test_points_per_region, True)
test_regions_df['new_stat'] = new_stats_test

# Regions whose statistic reaches the significance threshold.
signif_thresh_test_new = get_signif_threshold(
    0.005,
    400,
    [{"points": pts} for pts in test_points_per_region],
    N_test_new,
    P_test_new,
    seed=42,
)
signif_regs_indices_test_new = [
    i for i, stat in enumerate(new_stats_test) if stat >= signif_thresh_test_new
]
# NOTE(review): the label below says "Equal Opportunity" while `fairness_notion`
# is set to "statistical_parity" - confirm which one is intended.
print(f"Test MLR (Equal Opportunity): {new_mlr_test:.3f}")
print(f"Total Significant Regions: {len(signif_regs_indices_test_new)}")

In [20]:
# Per-region and global positive rate of the mitigated predictions, followed by
# the new per-region fairness ratio.
# Empty regions get a positive rate of 0 (same guard as for the 'pr' and
# 'init_pr' columns) instead of raising ZeroDivisionError.
test_regions_df['new_pr'] = test_regions_df['points'].apply(
    lambda pts: sum(test_new_preds[pts]) / len(pts) if len(pts) > 0 else 0
)
PR_test_new = sum(test_new_preds) / len(test_new_preds)

# Normalize with the unfair model's max_stat_test so all ratio columns are
# comparable; it is passed by keyword, as in the 'init_fair_stat_ratio' call.
test_regions_df["new_fair_stat_ratio"], _ = get_fair_stat_ratios(
    test_regions_df["new_stat"].to_numpy(),
    test_regions_df["new_pr"].to_numpy(),
    PR_test_new,
    max_stat=max_stat_test,
)
# Copy the new ratio to the bounding-box regions frame used for plotting.
# NOTE(review): this assignment aligns on the index - it assumes
# compute_polygons preserved the regions' index; confirm.
sub_test_regions_df['new_fair_stat_ratio'] = test_regions_df["new_fair_stat_ratio"]

In [None]:
# Per-region comparison of positive rates and fairness ratios across the
# fair-by-design, unfair-by-design and mitigated predictions.
comparison_cols = [
    'init_pr',
    'pr',
    "new_pr",
    "init_fair_stat_ratio",
    "fair_stat_ratio",
    "new_fair_stat_ratio",
]
test_regions_df[comparison_cols]

In [22]:
# Mitigated predictions of the points inside the display bounding box.
# sub_df['index'] holds each row's original position (created by reset_index), so
# indexing with it yields predictions in exactly sub_df's row order. The plot
# pairs them positionally with sub_df. Taking the indices from a set
# intersection, as before, gave no guarantee about their order.
sub_test_new_preds_indices = sub_df['index'].to_numpy()
sub_test_new_preds = test_new_preds[sub_test_new_preds_indices]

In [None]:
# Regions and points of the mitigated predictions, restricted to the display
# bounding box.
plot_map_with_polygons(
    df=sub_df,
    y_pred=sub_test_new_preds,
    title="Regions - Points - Mitigated Predictions",
    regs_df_list=[sub_test_regions_df],
    regs_color_list=["#0000FF"],
)

In [None]:
# Fairness map (normalized LR per region) of the mitigated predictions.
plot_fairness_map(
    score_label="new_fair_stat_ratio",
    title="Mitigated Predictions - Normalized LR",
    regs_df_list=[sub_test_regions_df],
)