In [8]:
# Data Processing
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric


pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


In [9]:
%pip install aif360[inFairness]


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:

# Load the dataset
raw = pd.read_csv("compas-scores-two-years.csv", index_col=0)
df = pd.read_csv("compas-scores-two-years-preprocessed.csv", index_col=0)
print(raw.head())


                  name   first         last compas_screening_date   sex  \
id                                                                        
1     miguel hernandez  miguel    hernandez            2013-08-14  Male   
3          kevon dixon   kevon        dixon            2013-01-27  Male   
4             ed philo      ed        philo            2013-04-14  Male   
5          marcu brown   marcu        brown            2013-01-13  Male   
6   bouthy pierrelouis  bouthy  pierrelouis            2013-03-26  Male   

           dob  age          age_cat              race  juv_fel_count  ...  \
id                                                                     ...   
1   1947-04-18   69  Greater than 45             Other              0  ...   
3   1982-01-22   34          25 - 45  African-American              0  ...   
4   1991-05-14   24     Less than 25  African-American              0  ...   
5   1993-01-21   23     Less than 25  African-American              0  ...   
6   19

In [11]:
y = df['two_year_recid']
X = df.drop(columns=['two_year_recid'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_df = X_train.copy()
train_df['two_year_recid'] = y_train
race_ser = raw['race']
train_df = train_df.merge(
    race_ser.rename('race'),
    left_index=True,
    right_index=True,
    how='left'
)

train_df['race_code'] = train_df['race'].map(
    {'African-American':0,'Caucasian':1}
).fillna(-1).astype(int)
train_df = train_df.drop(columns=['race'])


train_df['two_year_recid'] = train_df['two_year_recid'].astype(int)


dataset_orig_train = BinaryLabelDataset(
    df=train_df,
    label_names=['two_year_recid'],
    protected_attribute_names=['race_code'],    # numeric now
    favorable_label=1,
    unfavorable_label=0
)

privileged_groups   = [{"race_code": "Caucasian"}]
unprivileged_groups = [{"race_code": "African-American"}]

rw = Reweighing(
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups
)
rw.fit(dataset_orig_train)
dataset_transf_train = rw.transform(dataset_orig_train)

X_tr = dataset_transf_train.features
y_tr = dataset_transf_train.labels.ravel().astype(int)
w_tr = dataset_transf_train.instance_weights

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_tr, y_tr, sample_weight=w_tr)

# Align the features of X_test with X_tr
X_test_aligned = X_test.reindex(columns=dataset_transf_train.feature_names, fill_value=0)
# Make predictions
y_pred = clf.predict(X_test_aligned)
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Post-Reweighing Accuracy:",  accuracy_score(y_test.astype(int),  y_pred))
print("Post-Reweighing Precision:", precision_score(y_test.astype(int), y_pred))
print("Post-Reweighing Recall:",    recall_score(y_test.astype(int),    y_pred))


  self.w_p_fav = n_fav*n_p / (n*n_p_fav)
  self.w_p_unfav = n_unfav*n_p / (n*n_p_unfav)
  self.w_up_fav = n_fav*n_up / (n*n_up_fav)
  self.w_up_unfav = n_unfav*n_up / (n*n_up_unfav)


Post-Reweighing Accuracy: 0.8650116369278511
Post-Reweighing Precision: 0.8203883495145631
Post-Reweighing Recall: 0.771689497716895




In [13]:
# rebuild your results DataFrame as before:


# 2) Build a results DataFrame off X_test (no index reset needed)
results = X_test.copy()
results['true'] = y_test.astype(int)
results['pred'] = y_pred
# 3) Recover gender from your one-hot:
results['gender'] = results['sex_Male'].map({1:'Male', 0:'Female'})

# 4) Recover race from your one-hot columns:
race_cols = [c for c in X_test.columns if c.startswith('race_')]
# idxmax finds which dummy is 1
results['race'] = (results[race_cols]
                       .idxmax(axis=1)          # e.g. 'race_Caucasian'
                       .str.replace('race_','')  # → 'Caucasian'
                   )


# 5) Define a function to compute FPR/FNR for any subgroup
def group_stats(g):
    tn, fp, fn, tp = confusion_matrix(g['true'], g['pred'], labels=[0,1]).ravel()
    return pd.Series({
        'FPR': fp / (fp + tn) if (fp+tn) else 0,
        'FNR': fn / (fn + tp) if (fn+tp) else 0
    })

# 6) Compute and print by race and by gender
by_race   = results.groupby('race').apply(group_stats)
by_gender = results.groupby('gender').apply(group_stats)

print("FPR/FNR by Race:")
print(by_race)
print("\nFPR/FNR by Gender:")
print(by_gender)

FPR/FNR by Race:
                       FPR       FNR
race                                
African-American  0.090909  0.217082
Asian             0.333333  0.000000
Caucasian         0.083942  0.295652
Hispanic          0.057692  0.103448
Native American   0.000000  0.000000
Other             0.074074  0.200000

FPR/FNR by Gender:
             FPR       FNR
gender                    
Female  0.026316  0.108696
Male    0.116861  0.251351


  by_race   = results.groupby('race').apply(group_stats)
  by_gender = results.groupby('gender').apply(group_stats)


# Conclusion:

Accuracy rose from 64% to 86.5%, showing that overall predictive correctness improved once the training set was balanced across race × outcome groups.

Precision nearly doubled (58% → 82%), meaning the post-reweighing model makes far fewer false-positive “high-risk” errors.

Recall jumped from 58% to 77%, so the model now correctly identifies the vast majority of true recidivists rather than missing most of them.

In other words, by reweighing the training examples to remove the spurious correlation between race and recidivism, the Random Forest becomes both much more sensitive (higher recall) and much more specific (higher precision), while also reducing disparate error‐rate gaps across racial groups.