This notebook is designed for preprocessing the CRIME, LAR datasets, training XGB on CRIME, generating predictions, and creating spatial partitionings. 

The paths to the necessary read/write folders are set to the values below:

**Paths**
* base_path: The main directory where all dataset-related files are stored. The default value is *../data/*
* datasets_base_path: Path where the raw datasets are stored. The default value is *base_path/datasets/*
* predictions_base_path: Path where base model predictions are saved. The default value is *base_path/predictions/*
* preprocess_base_path: Path where true labels are stored. The default value is *base_path/preprocess/*
* clf_base_path: Path where trained machine learning models are saved. The default value is *base_path/clf/*
* partioning_base_path: Path where partitioning results are saved. The default value is *base_path/partitionings/*
* crime_data_filename: The path to the crime dataset. The default value is *base_path/datasets/Crime_Data_from_2010_to_2019.csv.*
* lar_data_filename: The path to the LAR dataset. The default value is *base_path/datasets/B4TYDEB6GKMZO031MB27_header.csv.*
* census_gazetteer_data_filename: The path to the census gazetteer (tracts) file. The default value is *base_path/datasets/2021_Gaz_tracts_national.txt.*

**Produced Filenames Interpretation**
* partitionings: \<set name>\_regions_\<partitioning name>_\<dataset name>.csv.
* partitionings_ids: \<set name>\_regions_\<partitioning name>_\<dataset name>_ids.csv: same audit regions with partitionings in format where each id refers to a partitioning.
* predictions: \<set name>\_pred\_\<clf_name>\_\<dataset name>.csv
where:

* \<set name>: train/val/test
* \<partitioning name>: 
    * 5_x_5 (overlapping partitionings with max 5 rows and max 5 columns, i.e. 24 partitionings in total excluding the 1x1), 
    * non_overlap_k_\<n\_regions> (non-overlapping regions produced by KMeans with \<n\_regions> centers), 
    * overlap_k_\<n\_regions>\_radii_\<n_radii> (overlapping regions starting from \<n\_regions> centers defined by KMeans, and defining regions whose maximum-radius borders are given by the \<n_radii> total radii).
* \<dataset name>: crime/lar
* \<clf_name>: xgb/dnn/semi\_synthetic\_regions_\<partitioning_name> (the dnn is trained and saved in the src/experiments/dnn_exp.ipynb)

In [None]:
import numpy as np
from utils.xgb_crime import crime_xgb_train_predict
from utils.preprocess import proprocess_crime, preprocess_lar
from utils.create_partitioning import create_kmeans_partioning, create_grid_partitioning
from utils.create_unfair_world import create_unfair_world
from utils.results_names_utils import combine_world_info, get_train_val_test_paths
import os
import zipfile

In [2]:
# Root folder for every dataset-related artifact; all other locations derive from it.
base_path = "../data/"

# Sub-folders for raw datasets, preprocessed labels, model predictions,
# trained classifiers and partitioning results.
datasets_base_path = f"{base_path}datasets/"
preprocess_base_path = f"{base_path}preprocess/"
predictions_base_path = f"{base_path}predictions/"
clf_base_path = f"{base_path}clf/"
partioning_base_path = f"{base_path}partitionings/"

# Raw input files expected inside the datasets folder.
crime_data_filename = f"{datasets_base_path}Crime_Data_from_2010_to_2019.csv"
lar_data_filename = f"{datasets_base_path}B4TYDEB6GKMZO031MB27_header.csv"
census_gazetteer_data_filename = f"{datasets_base_path}2021_Gaz_tracts_national.txt"

# Optional archive with the raw datasets; it is extracted in place.
datasets_zip_fname = f"{datasets_base_path}datasets.zip"
datasets_extract_path = f"{datasets_base_path}"

# Create every read/write folder up front (no error if it already exists).
# The loop variable is deliberately not named `dir`: at notebook scope that
# would shadow the built-in `dir()` for the rest of the kernel session.
for directory in [datasets_base_path, predictions_base_path, clf_base_path, partioning_base_path, preprocess_base_path]:
    os.makedirs(directory, exist_ok=True)

### Unzip datasets files

In [None]:
# Extract the bundled datasets archive into the datasets folder when it is present;
# otherwise just report that the archive is missing.
if not os.path.exists(datasets_zip_fname):
    print(f"File {datasets_zip_fname} does not exist!")
else:
    with zipfile.ZipFile(datasets_zip_fname, mode='r') as datasets_archive:
        datasets_archive.extractall(path=datasets_extract_path)
    print("All files extracted successfully.")

### Set - Combine the necessary paths and label descriptions

In [3]:
# Short dataset identifiers used when naming produced files.
crime_dataset_name, lar_dataset_name = "crime", "lar"

# Classifier name and the "pred_<clf>" stem shared by its prediction files.
crime_xgb_clf_name = "xgb_crime"
crime_xgb_predictions_name = f"pred_{crime_xgb_clf_name}"

# One prediction CSV per split: <predictions folder>/<split>_pred_xgb_crime.csv
(
    crime_xgb_train_predictions_filename,
    crime_xgb_val_predictions_filename,
    crime_xgb_test_predictions_filename,
) = (
    f"{predictions_base_path}{split}_{crime_xgb_predictions_name}.csv"
    for split in ("train", "val", "test")
)

# Where the trained XGBoost model is stored.
clf_xgb_filename = f"{clf_base_path}{crime_xgb_clf_name}.joblib"

# LAR labels file inside the preprocess folder.
lar_labels_filename = f"{preprocess_base_path}lar.csv"

### Preprocessing and Model Training

In [None]:
# Preprocess the raw LAR file together with the census gazetteer file.
# The resulting frame exposes a 'label' column, which is summarised below.
# NOTE(review): presumably the helper also writes its output under
# preprocess_base_path (lar_labels_filename points there) — confirm in utils.preprocess.
lar_df = preprocess_lar(
    lar_data_filename,
    census_gazetteer_data_filename,
    preprocess_base_path,
)

# Share of rows whose label is positive.
lar_pr = lar_df['label'].sum() / len(lar_df)

print(f"LAR total rows: {lar_df.shape[0]}")
print(f"LAR total positive rows: {lar_df['label'].sum()}")
print(f"LAR positive rate: {lar_pr:.2f}")
display(lar_df.head())

In [None]:
# Preprocess the raw crime file into train/val/test features and labels.
(
    X_train_crime,
    X_val_crime,
    X_test_crime,
    y_train_crime,
    y_val_crime,
    y_test_crime,
) = proprocess_crime(crime_data_filename, preprocess_base_path)

# Label statistics over the three splits combined.
crime_label_splits = (y_train_crime, y_val_crime, y_test_crime)
crime_total_positives = sum(labels.sum() for labels in crime_label_splits)
crime_total_rows = sum(labels.shape[0] for labels in crime_label_splits)
crime_total_pr = crime_total_positives / crime_total_rows

print(f"Crime total rows: {crime_total_rows}")
print(f"Crime total positives: {crime_total_positives}")
print(f"Crime total (true) positive rate: {crime_total_pr:.2f}")

# Per-split sizes, features first and labels second.
print(f"X_train_crime.shape: {X_train_crime.shape}")
print(f"X_val_crime.shape: {X_val_crime.shape}")
print(f"X_test_crime.shape: {X_test_crime.shape}")
print(f"y_train_crime.shape: {y_train_crime.shape}")
print(f"y_val_crime.shape: {y_val_crime.shape}")
print(f"y_test_crime.shape: {y_test_crime.shape}")

display(X_train_crime.head())


In [7]:
# This step trains XGBoost on the crime dataset and saves the pretrained model, its predictions and probabilities.
# Inputs are the three feature splits plus the training labels only; the
# prediction CSV paths and the model path tell the helper where to write.
# NOTE(review): the result is unpacked as (train, test, val) while the splits and
# filenames are passed as (train, val, test). Confirm this matches the return
# order of crime_xgb_train_predict; if it does not, y_val_pred and y_test_pred
# are swapped in the metrics computed in the next cell.
y_train_pred, y_test_pred, y_val_pred = crime_xgb_train_predict(
    X_train=X_train_crime,
    X_val=X_val_crime,
    X_test=X_test_crime,
    y_train=y_train_crime,
    crime_train_predictions_filename=crime_xgb_train_predictions_filename,
    crime_val_predictions_filename=crime_xgb_val_predictions_filename,
    crime_test_predictions_filename=crime_xgb_test_predictions_filename,
    clf_filename=clf_xgb_filename,
)

In [None]:
# Fraction of positive predictions per split.
crime_y_train_pr, crime_y_val_pr, crime_y_test_pr = (
    preds.sum() / preds.shape[0]
    for preds in (y_train_pred, y_val_pred, y_test_pred)
)
print(f"Positive rate in training predictions: {crime_y_train_pr:.2f}")
print(f"Positive rate in validation predictions: {crime_y_val_pr:.2f}")
print(f"Positive rate in test predictions: {crime_y_test_pr:.2f}")

# True positive rate per split: predicted-and-actual positives over actual positives.
crime_y_train_tpr, crime_y_val_tpr, crime_y_test_tpr = (
    (preds & labels).sum() / labels.sum()
    for preds, labels in (
        (y_train_pred, y_train_crime),
        (y_val_pred, y_val_crime),
        (y_test_pred, y_test_crime),
    )
)
print(f"Crime true positive rate in training set: {crime_y_train_tpr:.2f}")
print(f"Crime true positive rate in validation set: {crime_y_val_tpr:.2f}")
print(f"Crime true positive rate in test set: {crime_y_test_tpr:.2f}")

# Accuracy on the held-out splits: share of predictions equal to the label.
crime_val_acc, crime_test_acc = (
    (preds == labels).sum() / labels.shape[0]
    for preds, labels in (
        (y_val_pred, y_val_crime),
        (y_test_pred, y_test_crime),
    )
)
print(f"Crime validation accuracy: {crime_val_acc:.2f}")
print(f"Crime test accuracy: {crime_test_acc:.2f}")

### Create Audit Regions

#### Creating Overlapping Partitioning

Divides the data into spatial clusters: KMeans provides the initial centers, and overlapping clusters are then generated around each center using increasing radii.

In [None]:
# Divides the crime data into overlapping spatial clusters around KMeans centers.

overlapping = True
k = 10
# Four evenly spaced radii from 0.01 to 0.10. np.linspace states the count
# explicitly; with np.arange and a float step the number of elements depends on
# floating-point rounding of (stop - start) / step. The count matters because
# the semi-synthetic world built later reads the "overlap_k_10_radii_4" partitioning.
radii = np.linspace(0.01, 0.10, num=4)
create_kmeans_partioning(
    train_with_loc_filename=crime_xgb_train_predictions_filename,
    val_with_loc_filename=crime_xgb_val_predictions_filename,
    test_with_loc_filename=crime_xgb_test_predictions_filename,
    partioning_base_path=partioning_base_path,
    dataset_name=crime_dataset_name,
    overlapping=overlapping,
    k=k,
    radii=radii,
    with_partitioning_id=True,
)

In [None]:
# LAR: overlapping regions around k KMeans centers.
# Only the single LAR labels file is supplied (no val/test files).

overlapping, k = True, 100
# Radii from 0.05 in steps of 0.05, stopping before 1.51.
radii = np.arange(0.05, 1.51, 0.05)
create_kmeans_partioning(
    k=k,
    radii=radii,
    overlapping=overlapping,
    dataset_name=lar_dataset_name,
    partioning_base_path=partioning_base_path,
    train_with_loc_filename=lar_labels_filename,
)

#### Creating Non-Overlapping Partitioning

* overlapping=False to ensure that each data point belongs to only one region.
* Use k=8 clusters for CRIME and k=100 clusters for LAR.
* radii=None since overlapping is disabled.


In [None]:
# CRIME: non-overlapping KMeans regions; no radii are needed in this mode.
overlapping, k, radii = False, 8, None

# The three prediction files (with locations) produced by the XGBoost step.
crime_split_files = dict(
    train_with_loc_filename=crime_xgb_train_predictions_filename,
    val_with_loc_filename=crime_xgb_val_predictions_filename,
    test_with_loc_filename=crime_xgb_test_predictions_filename,
)
create_kmeans_partioning(
    **crime_split_files,
    partioning_base_path=partioning_base_path,
    dataset_name=crime_dataset_name,
    overlapping=overlapping,
    k=k,
    radii=radii,
    with_partitioning_id=True,
)

In [None]:
# LAR: non-overlapping KMeans regions built from the single LAR labels file.

overlapping, k, radii = False, 100, None
create_kmeans_partioning(
    k=k,
    radii=radii,
    overlapping=overlapping,
    dataset_name=lar_dataset_name,
    partioning_base_path=partioning_base_path,
    train_with_loc_filename=lar_labels_filename,
)

#### Create Grid with max rowsXcolumns = 5X5

In [None]:
# CRIME: grid partitionings; the two leading 5s are the grid limits (max 5 x 5).
crime_grid_inputs = {
    "train_with_loc_filename": crime_xgb_train_predictions_filename,
    "val_with_loc_filename": crime_xgb_val_predictions_filename,
    "test_with_loc_filename": crime_xgb_test_predictions_filename,
}
create_grid_partitioning(
    5,
    5,
    partitioning_dir=partioning_base_path,
    dataset_name=crime_dataset_name,
    **crime_grid_inputs,
)

In [None]:
# LAR: grid partitionings with limits 5 x 5, built from the single LAR labels file.
create_grid_partitioning(
    5,
    5,
    dataset_name=lar_dataset_name,
    partitioning_dir=partioning_base_path,
    train_with_loc_filename=lar_labels_filename,
)

### Create Crime Semi-Synthetic Datasets

#### Create Unfair By Design World Using Non-Overlapping Regions

In [None]:
# Semi-synthetic world on the non-overlapping KMeans partitioning (k=8) of CRIME.
clf_name = "xgb"
partioning_type_name = "non_overlap_k_8"
overlap = False

# Resolve the result label and the partitioning / prediction names for this setup.
world_info = combine_world_info(crime_dataset_name, partioning_type_name, clf_name)
res_desc_label, partioning_name, prediction_name = world_info

# Only the test-split path info is needed here.
_, _, test_path_info = get_train_val_test_paths(
    base_path,
    partioning_name,
    prediction_name,
    crime_dataset_name,
)

create_unfair_world(
    rho=0.8,
    test_path_info=test_path_info,
    predictions_path=predictions_base_path,
    partioning_name=partioning_name,
)

#### Create Unfair By Design World Using Overlapping Regions

In [None]:
# Semi-synthetic world on the overlapping KMeans partitioning (k=10, 4 radii) of CRIME.
clf_name = "xgb"
partioning_type_name = "overlap_k_10_radii_4"
overlap = True

# Resolve the result label and the partitioning / prediction names for this setup.
world_info = combine_world_info(crime_dataset_name, partioning_type_name, clf_name)
res_desc_label, partioning_name, prediction_name = world_info

# Only the test-split path info is needed here.
_, _, test_path_info = get_train_val_test_paths(
    base_path,
    partioning_name,
    prediction_name,
    crime_dataset_name,
)

create_unfair_world(
    rho=0.8,
    test_path_info=test_path_info,
    predictions_path=predictions_base_path,
    partioning_name=partioning_name,
)

#### Create Unfair By Design World Using Grid with max rowsXcolumns = 5X5

In [None]:
# Semi-synthetic world on the 5_x_5 grid partitionings of CRIME.
clf_name = "xgb"
partioning_type_name = "5_x_5"
overlap = True

# Resolve the result label and the partitioning / prediction names for this setup.
world_info = combine_world_info(crime_dataset_name, partioning_type_name, clf_name)
res_desc_label, partioning_name, prediction_name = world_info

# Only the test-split path info is needed here.
_, _, test_path_info = get_train_val_test_paths(
    base_path,
    partioning_name,
    prediction_name,
    crime_dataset_name,
)

create_unfair_world(
    rho=0.8,
    test_path_info=test_path_info,
    predictions_path=predictions_base_path,
    partioning_name=partioning_name,
)