This notebook audits every combination of model and audit-region setting using the scan statistic approach, in which Monte Carlo simulations are performed to define the significance thresholds. We use the original scan statistic method from the audit paper, which defines the statistic from the maximum likelihoods of the inside=outside hypothesis and the inside!=outside hypothesis. We treat its outputs, i.e. which regions are classified as significant/non-significant, as ground truth, so the ground truth is 0/1 for each of the audit regions. Additionally, we perform the audit with the statistic formula replaced by the PROMIS approximation formula. Finally, we compute the accuracy of the PROMIS approximation for each experiment.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))
from utils.stats_utils import get_signif_threshold, scan_regions
from utils.data_utils import  get_y
from utils.results_names_utils import combine_world_info, get_train_val_test_paths
from utils.data_utils import read_scanned_regs
from tqdm import tqdm
from utils.data_utils import get_pos_info_regions
from utils.stats_utils import get_random_types

In [2]:
# Root directory of the data files (relative path); passed to
# get_train_val_test_paths and used to locate preprocess/lar.csv.
base_path = "../../data/"
# Classifier identifiers passed to combine_world_info.
xgb_clf_name = "xgb"
dnn_clf_name = "dnn"
unfair_clf_name = "semi_synthetic"
# Dataset identifiers.
crime_dataset_name = "crime"
lar_dataset_name = "lar"
# Partitioning identifiers used with the crime dataset (the grid one is also
# used with LAR).
non_over_partioning_type_name = "non_overlap_k_8"
over_partioning_type_name = "overlap_k_10_radii_4"
grid_partitioning_type_name = "5_x_5"
# Partitioning identifiers used with the LAR dataset.
non_over_partioning_type_name_lar = "non_overlap_k_100"
over_partioning_type_name_lar = "overlap_k_100_radii_30"

In [3]:
# `seed` is read as a global by run_scan_methods; it also seeds NumPy's
# global random state here.
seed = 42
np.random.seed(seed)
# Significance level and number of Monte Carlo worlds passed to every
# run_scan_methods call in this notebook.
signif_level = 0.005
n_alt_worlds = 1000

##  Approximation Audit Functions

In [4]:
def get_regs_norm_counts(points_per_region):
    """
    Compute an overlap-normalized size for every region.

    Every occurrence of a point contributes ``1 / k`` to its region, where
    ``k`` is the number of times that point appears across all regions'
    point lists. A region's value is the sum of these contributions.

    Args:
        points_per_region (list of lists):
            One list of point indices per region.

    Returns:
        list: One total per region, in input order. A region without
              points gets 0.
    """
    # Pass 1: count how often every point occurs over all regions.
    membership_counts = {}
    for region_points in points_per_region:
        for pt in region_points:
            membership_counts[pt] = membership_counts.get(pt, 0) + 1

    # Pass 2: accumulate the inverse occurrence counts region by region.
    normalized_counts = []
    for region_points in points_per_region:
        total = 0
        for pt in region_points:
            total += 1 / membership_counts[pt]
        normalized_counts.append(total)

    return normalized_counts

def compute_promis_app(w, n, p, N, P):
    """
    Compute the weighted inside-vs-outside statistic of one region.

    The statistic is ``|w * (p / n - (P - p) / (N - n))|``: the absolute
    difference between the positive rate inside the region and the positive
    rate outside it, scaled by the region weight.

    Args:
        w (float): Weight factor of the region.
        n (int): Number of points inside the region.
        p (int): Number of positive points inside the region.
        N (int): Total number of points.
        P (int): Total number of positive points.

    Returns:
        float: The weighted absolute rate difference, or 0 when the region
               is empty or covers every point (no inside/outside split).
    """
    outside_total = N - n

    # Without points on both sides of the split a rate is undefined.
    if not (n != 0 and outside_total != 0):
        return 0

    inside_rate = p / n
    outside_rate = (P - p) / outside_total

    return np.abs(w * (inside_rate - outside_rate))
              
def get_signif_thresh_scanned_regions(
    signif_level, n_alt_worlds, regions, y_pred, N, P, seed=None
):
    """
    Label every region as significant or not with the reference scan statistic.

    The threshold comes from `get_signif_threshold` and the per-region
    statistics from `scan_regions` (both imported from utils.stats_utils).
    A region is significant when its statistic is greater than or equal to
    the threshold.

    Args:
        signif_level (float): Significance level (e.g., 0.05).
        n_alt_worlds (int): Number of alternative (random) worlds forwarded
                            to `get_signif_threshold`.
        regions (list): Region dictionaries, each with a "points" key
                        holding the indices of the points in that region.
        y_pred (array-like): Binary predictions (0 or 1) for each point.
        N (int): Total number of points.
        P (int): Total number of positive points in y_pred.
        seed (int, optional): Seed forwarded to `get_signif_threshold`.
                              Defaults to None.

    Returns:
        pd.DataFrame: One row per region with the columns:
            - 'signif': boolean, True when the region is significant.
            - 'statistic': the statistic value of the region.
            - 'signif_thresh': the threshold (identical on every row).
    """
    threshold = get_signif_threshold(
        signif_level, n_alt_worlds, regions, N, P, seed
    )

    # Only the third return value of scan_regions is used here.
    _, _, region_stats = scan_regions(regions, y_pred, N, P, verbose=False)

    rows = [
        {
            "signif": True if region_stats[idx] >= threshold else False,
            "statistic": region_stats[idx],
            "signif_thresh": threshold,
        }
        for idx in range(len(regions))
    ]

    return pd.DataFrame(rows)
def scan_promis_app_regions(regions, weights, types, N, P):
    """
    Evaluate the PROMIS approximation statistic on every region and pick
    the region where it is largest.

    Args:
        regions (list of dict): Region dictionaries, each with a "points"
            key containing point indices.
        weights (list of float): Precomputed weight of each region, aligned
            with `regions`.
        types (array-like): Binary array (0 or 1) with the label/type of each
            point; it is indexed with each region's point list.
        N (int): Total number of points.
        P (int): Total number of positive points in `types`.

    Returns:
        tuple: (best_region, max_likelihood, statistics)
            - best_region (dict): The region with the highest statistic
              (the first one on ties, as chosen by np.argmax).
            - max_likelihood (float): The highest statistic value.
            - statistics (list of float): The statistic of every region.
    """
    statistics = [
        compute_promis_app(
            weights[pos],
            len(reg["points"]),
            np.sum(types[reg["points"]]),
            N,
            P,
        )
        for pos, reg in enumerate(regions)
    ]

    best_idx = np.argmax(statistics)

    return regions[best_idx], statistics[best_idx], statistics

def scan_promis_app_alt_worlds(n_alt_worlds, regions, weights, N, P, seed=None):
    """
    Scan `n_alt_worlds` random worlds with the PROMIS approximation statistic.

    For every world, random types are drawn with `get_random_types`, all
    regions are scanned, and the best region and its statistic are recorded.
    The worlds are returned ordered from the largest to the smallest
    maximum statistic.

    Args:
        n_alt_worlds (int): Number of alternative (random) worlds to generate.
        regions (list of dict): List of region dictionaries.
        weights (list of float): List of weights for each region.
        N (int): Total number of points.
        P (int): Total number of positive points in the original setting.
        seed (int, optional): Seed of the first world; world ``i`` uses
            ``seed + i``. With None, every world is drawn with seed None.

    Returns:
        tuple: (alt_worlds, best_statistic_overall)
            - alt_worlds (list of tuples): Tuples of the form
              (alt_types, alt_best_region, alt_max_likeli), sorted by
              descending alt_max_likeli.
            - best_statistic_overall (float): The largest statistic observed
              over all alternative worlds.
    """
    worlds = []

    for offset in tqdm(range(n_alt_worlds), desc="Monte Carlo simulations"):
        world_seed = None if seed is None else seed + offset
        random_types = get_random_types(N, P, world_seed)
        # The scan uses the positive count actually drawn for this world.
        positives = np.sum(random_types)
        top_region, top_stat, _ = scan_promis_app_regions(
            regions, weights, random_types, N, positives
        )
        worlds.append((random_types, top_region, top_stat))

    # Largest statistic first; ties keep their generation order.
    worlds = sorted(worlds, key=lambda world: -world[2])

    return worlds, worlds[0][2]

def get_promis_app_signif_threshold(
    signif_level, n_alt_worlds, regions, weights, N, P, seed=None
):
    """
    Derive the Monte Carlo significance threshold of the PROMIS
    approximation statistic.

    `n_alt_worlds` alternative worlds are scanned and ranked by their maximum
    statistic in descending order. The threshold is the maximum statistic of
    the world at rank ``int(signif_level * n_alt_worlds)`` (0-based).

    Args:
        signif_level (float): Significance level (e.g., 0.05).
        n_alt_worlds (int): Number of alternative worlds to generate.
        regions (list of dict): List of region dictionaries.
        weights (list of float): Weights for each region.
        N (int): Total number of points.
        P (int): Total number of positive points.
        seed (int, optional): Random seed. Defaults to None.

    Returns:
        float: Significance threshold for the statistic.
    """
    ranked_worlds = scan_promis_app_alt_worlds(
        n_alt_worlds, regions, weights, N, P, seed
    )[0]

    # int() truncates the product towards zero.
    cutoff_rank = int(signif_level * n_alt_worlds)

    return ranked_worlds[cutoff_rank][2]

def spatial_promis_app_scan_statistic(points_per_region, y_pred, weights,label, n_alt_worlds=1000, signif_level=0.001, seed=None):
    """
    Run the PROMIS approximation scan on a set of regions and flag the
    regions whose statistic reaches a Monte Carlo-derived threshold.

    Args:
        points_per_region (list of lists): A list where each element is a list of
            point indices corresponding to a region.
        y_pred (array-like): Binary predictions (0 or 1) for each point. It is
            indexed with each region's point list and is not modified.
        weights (list of float): List of weights corresponding to each region.
        label (str): Prefix of the output column names, used to tell apart
            multiple scan statistic runs.
        n_alt_worlds (int, optional): Number of alternative worlds to generate. Defaults to 1000.
        signif_level (float, optional): Significance level. Defaults to 0.001.
        seed (int, optional): Random seed for reproducibility. Defaults to None.

    Returns:
        pd.DataFrame: One row per region with columns:
            - '{label}_statistic': Observed statistic for each region.
            - '{label}_signif_thresh': Significance threshold used.
            - '{label}_signif': Boolean, True when the observed statistic is
              greater than or equal to the threshold.
    """
    N = len(y_pred)
    P = np.sum(y_pred)
    regions = [{"points": region} for region in points_per_region]

    # Observed statistic of every region on the actual predictions.
    observed_stats = [
        compute_promis_app(weights[i], len(pts), np.sum(y_pred[pts]), N, P)
        for i, pts in enumerate(points_per_region)
    ]

    signif_thresh = get_promis_app_signif_threshold(
        signif_level, n_alt_worlds, regions, weights, N, P, seed
    )

    scanned_regions = [
        {
            f"{label}_statistic": observed_I,
            f"{label}_signif_thresh": signif_thresh,
            f"{label}_signif": bool(observed_I >= signif_thresh),
        }
        for observed_I in observed_stats
    ]

    return pd.DataFrame(scanned_regions)

def get_scores(true_signif, pred_signif):
    """
    Count the confusion-matrix cells for two aligned binary arrays.

    Args:
        true_signif (array-like): Ground truth significance flags (boolean).
        pred_signif (array-like): Predicted significance flags (boolean).

    Returns:
        tuple: (tp, fp, tn, fn) counts, in that order.
    """
    def count_both(left, right):
        # Number of positions where both flags are set.
        return np.sum(np.logical_and(left, right))

    not_true = np.logical_not(true_signif)
    not_pred = np.logical_not(pred_signif)

    return (
        count_both(true_signif, pred_signif),
        count_both(not_true, pred_signif),
        count_both(not_true, not_pred),
        count_both(true_signif, not_pred),
    )

def run_scan_methods(y_pred, points_per_region, signif_level=0.005, n_alt_worlds=200):
    """
    Audit a set of regions with the reference scan statistic and with the
    PROMIS approximation, then score the approximation against the reference.

    This function:
        1. Wraps every region's point list in a ``{"points": ...}`` dictionary.
        2. Computes the PROMIS approximation weight of every region,
           ``sqrt(n * (N - n)) / N`` for a region with ``n`` points.
        3. Labels regions with the reference scan statistic
           (`get_signif_thresh_scanned_regions`); these labels serve as
           ground truth.
        4. Labels regions with the PROMIS approximation statistic
           (`spatial_promis_app_scan_statistic`).
        5. Merges both per-region tables on their index.
        6. Builds a score table comparing the approximation to the reference.

    Both scans are seeded with the module-level ``seed`` variable, which must
    be defined before this function is called. A summary line with N, P and
    the positive rate is printed.

    Args:
        y_pred (array-like): Binary predictions (0 or 1) for each point.
        points_per_region (list of lists): A list where each element is a list of
            point indices corresponding to a region.
        signif_level (float, optional): Significance level for thresholds. Defaults to 0.005.
        n_alt_worlds (int, optional): Number of alternative worlds to generate. Defaults to 200.

    Returns:
        tuple:
            - all_scanned_regs_info (pd.DataFrame): Merged per-region scan results.
            - scores_df (pd.DataFrame): Two rows ("Scan Statistics" and
              "Promis App Statistics") with TP, FP, TN, FN and the number of
              significant regions. The confusion-matrix cells of the
              reference row are None.
    """
    regions = [{"points": pts} for pts in points_per_region]
    N = len(y_pred)
    P = np.sum(y_pred)
    print(f"N={N}, P={P}, PR={P/N:.3f}")

    # n + (N - n) == N, so the weight denominator is simply N.
    promis_app_weights = [
        np.sqrt(len(pts) * (N - len(pts))) / N for pts in points_per_region
    ]

    # Reference method: its significance flags are the ground truth.
    df_scanned_regs = get_signif_thresh_scanned_regions(
        signif_level, n_alt_worlds, regions, y_pred, N, P, seed=seed
    )

    promis_app_scanned_regs_df = spatial_promis_app_scan_statistic(
        points_per_region,
        y_pred,
        n_alt_worlds=n_alt_worlds,
        signif_level=signif_level,
        weights=promis_app_weights,
        seed=seed,
        label="promis_app",
    )

    all_scanned_regs_info = pd.merge(
        df_scanned_regs,
        promis_app_scanned_regs_df,
        left_index=True,
        right_index=True,
    )

    true_signif = df_scanned_regs["signif"].values
    promis_app_pred_signif = promis_app_scanned_regs_df["promis_app_signif"].values

    # Summing the boolean flags counts the significant regions.
    total_signif_regs = int(true_signif.sum())
    total_promis_app_signif_regs = int(promis_app_pred_signif.sum())

    tp, fp, tn, fn = get_scores(true_signif, promis_app_pred_signif)

    scores_df = pd.DataFrame(
        {
            "Method": [
                "Scan Statistics",
                "Promis App Statistics",
            ],
            "TP": [None, tp],
            "FP": [None, fp],
            "TN": [None, tn],
            "FN": [None, fn],
            "Total Signif Regions": [
                total_signif_regs,
                total_promis_app_signif_regs,
            ],
        }
    )

    return (
        all_scanned_regs_info,
        scores_df,
    )

In [5]:
# Collects one score DataFrame per experiment; concatenated in the Results section.
all_scores = []

# Scan for XGB Classifier

### Load Data

In [6]:
# Resolve the file paths for crime / overlapping partitioning / XGBoost and
# load the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, over_partioning_type_name, xgb_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])

y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

## Apply Detection Method On Overlapping Regions

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
# Same label as the other overlapping K=10, Radii=4 experiments, so the
# results table groups them under one partitioning type.
scores_st_par_val_df['Partitioning Type']="Overlapping K=10, Radii=4"
scores_st_par_val_df['Classifier']="XGBoost"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
# Same label as the other overlapping K=10, Radii=4 experiments, so the
# results table groups them under one partitioning type.
scores_eq_opp_val_df['Partitioning Type']="Overlapping K=10, Radii=4"
scores_eq_opp_val_df['Classifier']="XGBoost"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

## Apply Detection Method On Non-Overlapping Regions

### Load Data

In [9]:
# Resolve the file paths for crime / non-overlapping partitioning / XGBoost
# and load the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, non_over_partioning_type_name, xgb_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])


y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
scores_st_par_val_df['Partitioning Type']="Non-Overlapping K=8"
scores_st_par_val_df['Classifier']="XGBoost"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
scores_eq_opp_val_df['Partitioning Type']="Non-Overlapping K=8"
scores_eq_opp_val_df['Classifier']="XGBoost"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

## Apply Detection Method On Grid with max RowsXColumns: 5x5

### Load Data

In [12]:
# Resolve the file paths for crime / 5x5 grid partitioning / XGBoost and
# load the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, grid_partitioning_type_name, xgb_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])


y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
scores_st_par_val_df['Partitioning Type']="Max 5x5 Grid"
scores_st_par_val_df['Classifier']="XGBoost"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
scores_eq_opp_val_df['Partitioning Type']="Max 5x5 Grid"
scores_eq_opp_val_df['Classifier']="XGBoost"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

# Scan for DNN

### Load Data

In [15]:
# Resolve the file paths for crime / overlapping partitioning / DNN and
# load the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, over_partioning_type_name, dnn_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])


y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

## Apply Detection Method On Overlapping Regions

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
scores_st_par_val_df['Partitioning Type']="Overlapping K=10, Radii=4"
scores_st_par_val_df['Classifier']="DNN"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
scores_eq_opp_val_df['Partitioning Type']="Overlapping K=10, Radii=4"
scores_eq_opp_val_df['Classifier']="DNN"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

## Apply Detection Method On Non-Overlapping Regions

### Load Data

In [18]:
# Resolve the file paths for crime / non-overlapping partitioning / DNN and
# load the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, non_over_partioning_type_name, dnn_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])


y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
scores_st_par_val_df['Partitioning Type']="Non-Overlapping K=8"
scores_st_par_val_df['Classifier']="DNN"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
scores_eq_opp_val_df['Partitioning Type']="Non-Overlapping K=8"
scores_eq_opp_val_df['Classifier']="DNN"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

## Apply Detection Method On Grid with max RowsXColumns: 5x5

### Load Data

In [21]:
# Resolve the file paths for crime / 5x5 grid partitioning / DNN and load
# the validation split: regions, predictions and true labels.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, grid_partitioning_type_name, dnn_clf_name
)
_, val_path_info, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

val_regions_df = read_scanned_regs(val_path_info["regions"])
val_pred_df = pd.read_csv(val_path_info["predictions"])
val_labels_df = pd.read_csv(val_path_info["labels"])


y_pred_val = get_y(val_pred_df, "pred")
y_true_val = get_y(val_labels_df, "label")

val_pred_df['label'] = y_true_val
# One list of point indices per region, as expected by run_scan_methods.
val_pts_per_region = val_regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on all validation predictions; the score table is
# tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par_val,
    scores_st_par_val_df,
) = run_scan_methods(
    y_pred_val,
    val_pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par_val.head())
scores_st_par_val_df['Dataset']="Crime"
scores_st_par_val_df['Partitioning Type']="Max 5x5 Grid"
scores_st_par_val_df['Classifier']="DNN"
scores_st_par_val_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_val_df)
display(scores_st_par_val_df)

### Scan Testing Equal Opportunity

In [None]:
# Equal-opportunity audit: get_pos_info_regions receives the true labels and
# the regions; presumably it returns the indices of the truly positive points
# and the regions re-expressed over that subset (confirm in utils.data_utils).
# The scan then runs on the predictions of those points only.
val_pos_y_true_indices, val_pts_per_region_eq_opp = get_pos_info_regions(
    y_true_val, val_pts_per_region
)

pos_val_y_pred = y_pred_val[val_pos_y_true_indices]

(
    all_scanned_regs_info_eq_opp_val,
    scores_eq_opp_val_df,
) = run_scan_methods(
    pos_val_y_pred,
    val_pts_per_region_eq_opp,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_eq_opp_val.head())
scores_eq_opp_val_df['Dataset']="Crime"
scores_eq_opp_val_df['Partitioning Type']="Max 5x5 Grid"
scores_eq_opp_val_df['Classifier']="DNN"
scores_eq_opp_val_df['Scan Type']="Equal Opportunity"
all_scores.append(scores_eq_opp_val_df)
display(scores_eq_opp_val_df)

# Scan for LAR

### Load Data

In [24]:
# Resolve the file paths for LAR / overlapping partitioning (empty classifier
# name) and load the regions from the train path info.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    lar_dataset_name, over_partioning_type_name_lar, ""
)
train_path_info, _, _ = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, lar_dataset_name
)

regions_df = read_scanned_regs(train_path_info["regions"])
# For LAR the audited values are the "label" column of the preprocessed file.
pred_df = pd.read_csv(f"{base_path}preprocess/lar.csv")
y_pred = get_y(pred_df, "label")
pts_per_region = regions_df['points'].tolist()

## Apply Detection Method On Overlapping Regions

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the LAR values; the score table is tagged with
# the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="LAR"
scores_st_par_df['Partitioning Type']="Overlapping K=100, Radii=30"
scores_st_par_df['Classifier']="-"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

## Apply Detection Method On Non-Overlapping Regions

### Load Data

In [26]:
# Resolve the file paths for LAR / non-overlapping partitioning (empty
# classifier name) and load the regions from the train path info.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    lar_dataset_name, non_over_partioning_type_name_lar, ""
)
train_path_info, _, _ = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, lar_dataset_name
)

regions_df = read_scanned_regs(train_path_info["regions"])
# For LAR the audited values are the "label" column of the preprocessed file.
pred_df = pd.read_csv(f"{base_path}preprocess/lar.csv")
y_pred = get_y(pred_df, "label")
pts_per_region = regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the LAR values; the score table is tagged with
# the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="LAR"
scores_st_par_df['Partitioning Type']="Non-Overlapping K=100"
scores_st_par_df['Classifier']="-"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

## Apply Detection Method On Grid with max RowsXColumns: 5x5

### Load Data

In [28]:
# Resolve the file paths for LAR / 5x5 grid partitioning (empty classifier
# name) and load the regions from the train path info.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    lar_dataset_name, grid_partitioning_type_name, ""
)
train_path_info, _, _ = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, lar_dataset_name
)

regions_df = read_scanned_regs(train_path_info["regions"])
# For LAR the audited values are the "label" column of the preprocessed file.
pred_df = pd.read_csv(f"{base_path}preprocess/lar.csv")
y_pred = get_y(pred_df, "label")
pts_per_region = regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the LAR values; the score table is tagged with
# the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="LAR"
scores_st_par_df['Partitioning Type']="Max 5x5 Grid"
scores_st_par_df['Classifier']="-"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

# Scan for Semi Synthetic: Crime Coordinates, Unfair By Design Predictions

### Load Data

In [30]:
# Resolve the file paths for the semi-synthetic predictions built on the
# overlapping crime partitioning and load the test split: regions and predictions.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, over_partioning_type_name, f"{unfair_clf_name}_regions_{over_partioning_type_name}"
)
_, _, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

regions_df = read_scanned_regs(test_path_info["regions"])
pred_df = pd.read_csv(test_path_info["predictions"])
y_pred = get_y(pred_df, "pred")
pts_per_region = regions_df['points'].tolist()

## Apply Detection Method On Overlapping Regions

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the semi-synthetic predictions; the score table
# is tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="Semi-Synthetic"
scores_st_par_df['Partitioning Type']="Overlapping K=10, Radii=4"
scores_st_par_df['Classifier']="Unfair by Design"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

## Apply Detection Method On Non-Overlapping Regions

### Load Data

In [32]:
# Resolve the file paths for the semi-synthetic predictions built on the
# non-overlapping crime partitioning and load the test split: regions and predictions.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, non_over_partioning_type_name, f"{unfair_clf_name}_regions_{non_over_partioning_type_name}"
)
_, _, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

regions_df = read_scanned_regs(test_path_info["regions"])
pred_df = pd.read_csv(test_path_info["predictions"])
y_pred = get_y(pred_df, "pred")
pts_per_region = regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the semi-synthetic predictions; the score table
# is tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="Semi-Synthetic"
# The regions loaded for this experiment come from non_over_partioning_type_name
# ("non_overlap_k_8"), so the label is K=8.
scores_st_par_df['Partitioning Type']="Non-Overlapping K=8"
scores_st_par_df['Classifier']="Unfair by Design"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

## Apply Detection Method On Grid with max RowsXColumns: 5x5

### Load Data

In [34]:
# Resolve the file paths for the semi-synthetic predictions built on the
# 5x5 grid crime partitioning and load the test split: regions and predictions.
res_desc_label, partioning_name, prediction_name = combine_world_info(
    crime_dataset_name, grid_partitioning_type_name, f"{unfair_clf_name}_regions_{grid_partitioning_type_name}"
)
_, _, test_path_info = get_train_val_test_paths(
    base_path, partioning_name, prediction_name, crime_dataset_name
)

regions_df = read_scanned_regs(test_path_info["regions"])
pred_df = pd.read_csv(test_path_info["predictions"])
y_pred = get_y(pred_df, "pred")
pts_per_region = regions_df['points'].tolist()

### Scan Testing Statistical Parity

In [None]:
# Statistical-parity audit on the semi-synthetic predictions; the score table
# is tagged with the experiment metadata and collected in all_scores.
(
    all_scanned_regs_info_st_par,
    scores_st_par_df,
) = run_scan_methods(
    y_pred,
    pts_per_region,
    signif_level=signif_level,
    n_alt_worlds=n_alt_worlds,
)

display(all_scanned_regs_info_st_par.head())
scores_st_par_df['Dataset']="Semi-Synthetic"
scores_st_par_df['Partitioning Type']="Max 5x5 Grid"
scores_st_par_df['Classifier']="Unfair by Design"
scores_st_par_df['Scan Type']="Statistical Parity"
all_scores.append(scores_st_par_df)
display(scores_st_par_df)

# Results

In [None]:
# Stack the score tables of all experiments into one DataFrame.
all_scores_df = pd.concat(all_scores, ignore_index=True)
# all_scores_df.to_csv(f"../../results/audit_scores.csv", index=False)
display(all_scores_df)
# Keep only rows that carry confusion-matrix counts (run_scan_methods stores
# None in TP/FP/TN/FN for the "Scan Statistics" reference rows) and cast the
# counts to float.
approx_scores_df = all_scores_df.dropna(subset=["TP", "FP", "TN", "FN"]).astype({"TP": float, "FP": float, "TN": float, "FN": float})
display(approx_scores_df)

In [37]:
def get_final_scores(TP, FP, TN, FN):
    """
    Derive accuracy, precision, recall and F1 from confusion-matrix counts.

    Every metric whose denominator is zero is reported as 0 instead of
    raising or producing NaN.

    Args:
        TP (int or float): Number of true positives.
        FP (int or float): Number of false positives.
        TN (int or float): Number of true negatives.
        FN (int or float): Number of false negatives.

    Returns:
        tuple: (accuracy, precision, recall, f1_score).
    """
    total = TP + TN + FP + FN
    # Guarded like the other metrics: with no samples at all accuracy is 0.
    accuracy = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1_score

In [None]:
# Pool the confusion-matrix counts over all experiments and report the
# metrics computed from the pooled counts.
TP_total = approx_scores_df["TP"].sum()
FP_total = approx_scores_df["FP"].sum()
TN_total = approx_scores_df["TN"].sum()
FN_total = approx_scores_df["FN"].sum()

accuracy, precision, recall, f1_score = get_final_scores(TP_total, FP_total, TN_total, FN_total)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

In [None]:
# Compute the four metrics per experiment (one row each) from that row's counts.
approx_scores_df[["Accuracy", "Precision", "Recall", "F1"]] = approx_scores_df.apply(
    lambda row: pd.Series(get_final_scores(row["TP"], row["FP"], row["TN"], row["FN"])), axis=1
)
# NOTE: this selection drops TP/FP/TN/FN, so this cell cannot be re-run
# unless approx_scores_df is rebuilt first.
approx_scores_df=approx_scores_df[['Dataset', 'Partitioning Type', 'Classifier', 'Scan Type', 'Total Signif Regions', 'Accuracy', 'Precision', 'Recall', 'F1']]
# approx_scores_df.to_csv(f"../../results/audit_scores_approx.csv", index=False)
display(approx_scores_df)

In [None]:
# Show only the experiment metadata and the number of significant regions.
approx_scores_df[['Dataset', 'Partitioning Type', 'Classifier', 'Scan Type', 'Total Signif Regions']]