In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the COMPAS two-year recidivism dataset
df = pd.read_csv('compas-scores-two-years.csv')

# Show the distinct values of the columns that are inspected before encoding
for inspected_column in ('age_cat', 'sex', 'race', 'priors_count', 'c_charge_degree'):
    print(df[inspected_column].unique())

# Restrict the frame to demographics, prior-offence counts, charge degree,
# COMPAS scores and the two recidivism outcome columns (order is preserved).
demographic_columns = ['sex', 'age_cat', 'race']
prior_count_columns = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
charge_score_outcome_columns = [
    'c_charge_degree', 'decile_score', 'v_decile_score',
    'score_text', 'v_score_text', 'is_recid', 'is_violent_recid',
]
use_df = df[demographic_columns + prior_count_columns + charge_score_outcome_columns]

for summary in (use_df.shape, use_df.columns, use_df.head()):
    print(summary)

# One-hot encode sex, age category and charge degree, dropping the first level
# of each so that level becomes the implicit reference category.
categorical_columns = ['sex', 'age_cat', 'c_charge_degree']
encoded_df = pd.get_dummies(use_df, columns=categorical_columns, drop_first=True)

# Race is encoded in a second pass with every level kept, so each race has
# its own indicator column.
encoded_df = pd.get_dummies(encoded_df, columns=['race'], drop_first=False)

# Ordinal encoding of the textual risk bands: Low -> 0, Medium -> 1, High -> 2.
# Any value outside these three bands becomes NaN after .map().
score_mapping = dict(zip(('Low', 'Medium', 'High'), range(3)))
for score_column in ('score_text', 'v_score_text'):
    encoded_df[score_column] = encoded_df[score_column].map(score_mapping)

print("\nShape after encoding:", encoded_df.shape)
print("\nNew columns after encoding:")
print(encoded_df.columns)
print("\nFirst few rows of encoded data:")
print(encoded_df.head())


['Greater than 45' '25 - 45' 'Less than 25']
['Male' 'Female']
['Other' 'African-American' 'Caucasian' 'Hispanic' 'Native American'
 'Asian']
[ 0  4  1  2 14  3  7  6  5 13  8  9 21 20 15 10 12 28 19 11 22 23 25 24
 36 18 16 33 17 30 27 38 26 37 29 35 31]
['F' 'M']
(7214, 14)
Index(['sex', 'age_cat', 'race', 'juv_fel_count', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'c_charge_degree', 'decile_score',
       'v_decile_score', 'score_text', 'v_score_text', 'is_recid',
       'is_violent_recid'],
      dtype='object')
    sex          age_cat              race  juv_fel_count  juv_misd_count  \
0  Male  Greater than 45             Other              0               0   
1  Male          25 - 45  African-American              0               0   
2  Male     Less than 25  African-American              0               0   
3  Male     Less than 25  African-American              0               1   
4  Male          25 - 45             Other              0               0   

In [18]:
# Split the data into features and target variables.
# Every COMPAS score column and both outcome columns are removed from the
# feature matrix; what remains are the count columns and the one-hot indicators.
X = encoded_df.drop(columns=['decile_score', 'v_decile_score', 'score_text', 'v_score_text', 'is_recid', 'is_violent_recid'])
class_y = encoded_df['is_recid']  # Binary target (0 or 1)
v_class_y = encoded_df['is_violent_recid']

# One encoder per target, so each encoder's classes_ keeps describing the
# target it was fitted on (a shared encoder would only remember the last fit).
le = LabelEncoder()
y_encoded = le.fit_transform(class_y)
v_le = LabelEncoder()
v_y_encoded = v_le.fit_transform(v_class_y)

# Split features and BOTH targets in a single call. train_test_split applies
# one shuffle to every array it receives, so the rows of X_train/X_test are
# aligned with both label vectors by construction instead of relying on two
# separate calls repeating the same random_state.
# NOTE: despite the "_decile" suffix, y_train_decile / y_test_decile hold the
# encoded is_recid labels; the names are kept because later cells use them.
(X_train, X_test,
 y_train_decile, y_test_decile,
 v_y_train, v_y_test) = train_test_split(
    X, y_encoded, v_y_encoded, test_size=0.3, random_state=42
)

# Define parameter grid.
# Each hyper-parameter has a single candidate, so GridSearchCV evaluates
# exactly one configuration with 5-fold cross-validation.
param_grid = {
    'max_depth': [3],
    'learning_rate': [0.1],
    'n_estimators': [200],
    'min_child_weight': [1],
    'gamma': [0.1],
    'subsample': [1.0],
    'colsample_bytree': [1.0],
    'objective': ['binary:logistic']
}

# Train and evaluate for regular recidivism
print("\n=== Regular Recidivism Prediction ===")
xgb_recid = XGBClassifier(random_state=42, objective='binary:logistic')
grid_search_recid = GridSearchCV(
    estimator=xgb_recid,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

print("Training XGBoost for recidivism prediction...")
grid_search_recid.fit(X_train, y_train_decile)

print("\nBest parameters found:")
print(grid_search_recid.best_params_)
print("\nBest cross-validation score:", grid_search_recid.best_score_)

# Make predictions on the held-out split and evaluate
y_pred_recid = grid_search_recid.best_estimator_.predict(X_test)
print("\nClassification Report for Recidivism Prediction:")
print(classification_report(y_test_decile, y_pred_recid))

# Feature importance for recidivism, largest first
feature_importance_recid = pd.DataFrame({
    'feature': X.columns,
    'importance': grid_search_recid.best_estimator_.feature_importances_
})
feature_importance_recid = feature_importance_recid.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features for Recidivism:")
print(feature_importance_recid.head(10))

# Train and evaluate for violent recidivism (same features, same grid)
print("\n\n=== Violent Recidivism Prediction ===")
xgb_violent = XGBClassifier(random_state=42, objective='binary:logistic')
grid_search_violent = GridSearchCV(
    estimator=xgb_violent,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

print("Training XGBoost for violent recidivism prediction...")
grid_search_violent.fit(X_train, v_y_train)

print("\nBest parameters found:")
print(grid_search_violent.best_params_)
print("\nBest cross-validation score:", grid_search_violent.best_score_)

# Make predictions on the held-out split and evaluate
y_pred_violent = grid_search_violent.best_estimator_.predict(X_test)
print("\nClassification Report for Violent Recidivism Prediction:")
print(classification_report(v_y_test, y_pred_violent))

# Feature importance for violent recidivism, largest first
feature_importance_violent = pd.DataFrame({
    'feature': X.columns,
    'importance': grid_search_violent.best_estimator_.feature_importances_
})
feature_importance_violent = feature_importance_violent.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features for Violent Recidivism:")
print(feature_importance_violent.head(10))


=== Regular Recidivism Prediction ===
Training XGBoost for recidivism prediction...
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best parameters found:
{'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'objective': 'binary:logistic', 'subsample': 1.0}

Best cross-validation score: 0.6483384017009722

Classification Report for Recidivism Prediction:
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      1153
           1       0.67      0.66      0.67      1012

    accuracy                           0.69      2165
   macro avg       0.69      0.69      0.69      2165
weighted avg       0.69      0.69      0.69      2165


Top 10 Most Important Features for Recidivism:
                    feature  importance
6      age_cat_Less than 25    0.273978
3              priors_count    0.226027
5   age_cat_Greater than 45    0.149772
4                  sex_Mal

In [19]:
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
import numpy as np

# First, create functions to help with the analysis
def create_aif_dataset(X, y, protected_attribute_names, favorable_label=1, unfavorable_label=0):
    """
    Wrap a feature frame and a label vector in an AIF360 BinaryLabelDataset.

    Parameters:
        X: feature DataFrame; its index is reused for the label column.
        y: labels, one per row of X.
        protected_attribute_names: columns of X to register as protected attributes.
        favorable_label: label value passed to AIF360 as the favorable outcome.
        unfavorable_label: label value passed to AIF360 as the unfavorable outcome.

    Returns:
        The BinaryLabelDataset built from X with the labels appended as the
        last column.
    """
    # Re-index the labels onto X so the column-wise concat lines up row by row.
    label_column = pd.Series(y, index=X.index)
    features_with_label = pd.concat([X, label_column], axis=1)

    # The label was appended last, so its (auto-generated) name is the final column.
    label_name = features_with_label.columns[-1]

    return BinaryLabelDataset(
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label,
        df=features_with_label,
        label_names=[label_name],
        protected_attribute_names=protected_attribute_names,
    )

def compute_fairness_metrics(dataset, privileged_groups, unprivileged_groups):
    """
    Print label-only fairness metrics for a single dataset.

    Parameters:
        dataset: dataset handed to BinaryLabelDatasetMetric.
        privileged_groups: group definitions treated as privileged.
        unprivileged_groups: group definitions treated as unprivileged.

    Prints the disparate impact and the statistical parity difference;
    nothing is returned.
    """
    group_metric = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups
    )

    # Each value is computed right before it is printed, one line per metric.
    for caption, compute in (
        ("Disparate Impact:", group_metric.disparate_impact),
        ("Statistical Parity Difference:", group_metric.statistical_parity_difference),
    ):
        print(caption, compute())
    # Disabled in the original notebook:
    # print("Equalized Odds Difference:", group_metric.equalized_odds_difference())
    # print("Equal Opportunity Difference:", group_metric.equal_opportunity_difference())
    
def compute_classification_metrics(dataset_true, dataset_pred, privileged_groups, unprivileged_groups):
    """
    Print fairness metrics that compare predictions against true labels.

    Parameters:
        dataset_true: dataset carrying the ground-truth labels.
        dataset_pred: dataset carrying the predicted labels.
        privileged_groups: group definitions treated as privileged.
        unprivileged_groups: group definitions treated as unprivileged.

    Prints five between-group metrics, then TPR/FPR/TNR/FNR/PPV for the
    privileged group and again for the unprivileged group. Returns nothing.
    """
    fairness = ClassificationMetric(
        dataset_true,
        dataset_pred,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups
    )

    # Between-group summary metrics, printed in a fixed order.
    between_group_rows = (
        ("Equal Opportunity Difference:", fairness.equal_opportunity_difference),
        ("Average Odds Difference:", fairness.average_odds_difference),
        ("Disparate Impact:", fairness.disparate_impact),
        ("Statistical Parity Difference:", fairness.statistical_parity_difference),
        ("Equalized Odds Difference:", fairness.equalized_odds_difference),
    )
    for caption, compute in between_group_rows:
        print(caption, compute())

    # Per-group confusion-matrix rates; the same five rows are reported for
    # the privileged group first and the unprivileged group second.
    per_group_rows = (
        ("TPR: ", fairness.true_positive_rate),
        ("FPR: ", fairness.false_positive_rate),
        ("TNR: ", fairness.true_negative_rate),
        ("FNR: ", fairness.false_negative_rate),
        ("PPV: ", fairness.positive_predictive_value),
    )
    group_sections = (
        ("===== Privileged Group Metrics =====", True),
        ("===== Unprivileged Group Metrics =====", False),
    )
    for banner, is_privileged in group_sections:
        print(banner)
        for caption, compute in per_group_rows:
            print(caption, compute(privileged=is_privileged))

# Analyze fairness of the recidivism model for three protected attributes
print("\n=== Fairness Analysis ===")

# 1. Race: Caucasian and Hispanic form the privileged side; every other race
#    indicator is on the unprivileged side.
protected_attribute_names_race = [
    'race_African-American',
    'race_Asian',
    'race_Caucasian',
    'race_Hispanic',
    'race_Native American',
    'race_Other',
]
privileged_groups_race = [{'race_Caucasian': 1}, {'race_Hispanic': 1}]
unprivileged_groups_race = [
    {'race_African-American': 1},
    {'race_Asian': 1},
    {'race_Native American': 1},
    {'race_Other': 1},
]

# 2. Sex: sex_Male == 0 is the privileged group, sex_Male == 1 the unprivileged one.
protected_attribute_names_sex = ['sex_Male']
privileged_groups_sex = [{'sex_Male': 0}]
unprivileged_groups_sex = [{'sex_Male': 1}]

# 3. Age: "Greater than 45" versus "Less than 25". Rows with both indicators
#    at 0 (the 25 - 45 band) belong to neither group definition.
protected_attribute_names_age = ['age_cat_Greater than 45', 'age_cat_Less than 25']
privileged_groups_age = [{'age_cat_Greater than 45': 1}]
unprivileged_groups_age = [{'age_cat_Less than 25': 1}]

# Run the identical report for each attribute, in the order race, sex, age.
fairness_analyses = (
    ("Race", protected_attribute_names_race, privileged_groups_race, unprivileged_groups_race),
    ("Sex", protected_attribute_names_sex, privileged_groups_sex, unprivileged_groups_sex),
    ("Age", protected_attribute_names_age, privileged_groups_age, unprivileged_groups_age),
)
for attribute_label, protected_names, privileged_side, unprivileged_side in fairness_analyses:
    # Same test features twice: once with the true labels, once with the predictions
    dataset_true = create_aif_dataset(X_test, y_test_decile, protected_names)
    dataset_pred = create_aif_dataset(X_test, y_pred_recid, protected_names)

    print(f"\n{attribute_label} Fairness Metrics:")
    print("Original dataset metrics:")
    compute_fairness_metrics(dataset_true, privileged_side, unprivileged_side)
    print("\nPrediction metrics:")
    compute_classification_metrics(dataset_true, dataset_pred, privileged_side, unprivileged_side)


=== Fairness Analysis ===

Race Fairness Metrics:
Original dataset metrics:
Disparate Impact: 1.3873652566225543
Statistical Parity Difference: 0.14814727582639403

Prediction metrics:
Equal Opportunity Difference: 0.2767348588083069
Average Odds Difference: 0.23412289869487596
Disparate Impact: 1.9552412190788575
Statistical Parity Difference: 0.2804662734240199
Equalized Odds Difference: 0.2767348588083069
===== Privileged Group Metrics =====
TPR:  0.47592067988668557
FPR:  0.18070175438596492
TNR:  0.8192982456140351
FNR:  0.5240793201133145
PPV:  0.6199261992619927
===== Unprivileged Group Metrics =====
TPR:  0.7526555386949925
FPR:  0.37221269296740994
TNR:  0.62778730703259
FNR:  0.2473444613050076
PPV:  0.6956521739130435

Sex Fairness Metrics:
Original dataset metrics:
Disparate Impact: 1.3426392572944297
Statistical Parity Difference: 0.1251453206742879

Prediction metrics:
Equal Opportunity Difference: 0.298961937716263
Average Odds Difference: 0.23057692925486883
Disparate 

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

def train_and_evaluate_model(X_train, X_test, y_train, y_test, model_name="Base Model", param_grid=None):
    """
    Fit an XGBoost classifier through GridSearchCV and print its test report.

    Parameters:
        X_train, y_train: training features and labels passed to the search.
        X_test, y_test: held-out features and labels used for the printed
            classification report.
        model_name: label used in the printed section header.
        param_grid: optional hyper-parameter grid for GridSearchCV. When
            omitted, the notebook's single-candidate grid is used, so existing
            callers get the same configuration as before.

    Returns:
        (grid_search, y_pred): the fitted GridSearchCV object and its
        predictions for X_test.
    """
    if param_grid is None:
        # Default grid: one candidate per hyper-parameter, i.e. the search
        # cross-validates exactly one configuration.
        param_grid = {
            'max_depth': [3],
            'learning_rate': [0.1],
            'n_estimators': [200],
            'min_child_weight': [1],
            'gamma': [0.1],
            'subsample': [1.0],
            'colsample_bytree': [1.0],
            'objective': ['binary:logistic']
        }

    xgb_model = XGBClassifier(random_state=42, objective='binary:logistic')
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1,
        verbose=0
    )

    print(f"\n=== {model_name} Training and Evaluation ===")
    grid_search.fit(X_train, y_train)
    # Predictions come from the fitted search object itself.
    y_pred = grid_search.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return grid_search, y_pred

def evaluate_fairness(X_test, y_test, y_pred, model_name="Base Model"):
    """
    Print race and sex fairness reports for one model's test predictions.

    Parameters:
        X_test: test features; must contain the race and sex indicator columns.
        y_test: true labels for X_test.
        y_pred: predicted labels for X_test.
        model_name: label used in the printed header.

    Race compares Caucasian (privileged) with African-American (unprivileged);
    sex compares sex_Male == 0 (privileged) with sex_Male == 1 (unprivileged).
    Returns nothing.
    """
    print(f"\n=== Fairness Metrics for {model_name} ===")

    # (section heading, protected columns, privileged groups, unprivileged groups)
    comparisons = (
        (
            "\nRace Fairness Metrics:",
            ['race_African-American', 'race_Caucasian'],
            [{'race_Caucasian': 1}],
            [{'race_African-American': 1}],
        ),
        (
            "\nSex Fairness Metrics:",
            ['sex_Male'],
            [{'sex_Male': 0}],
            [{'sex_Male': 1}],
        ),
    )

    for heading, protected_columns, privileged, unprivileged in comparisons:
        # Both datasets are built before the heading is printed.
        truth_dataset = create_aif_dataset(X_test, y_test, protected_columns)
        predicted_dataset = create_aif_dataset(X_test, y_pred, protected_columns)

        print(heading)
        compute_classification_metrics(truth_dataset, predicted_dataset,
                                       privileged,
                                       unprivileged)

def balanced_resample_by_group(X, y, protected_attribute):
    """
    Balance the class labels separately inside each protected group.

    The rows of X are partitioned by the distinct values of
    X[protected_attribute]. Each partition is passed through a
    SMOTE -> RandomUnderSampler pipeline and the results are stacked in order
    of first appearance of each group value.

    Parameters:
        X: feature DataFrame containing the protected_attribute column.
        y: labels aligned POSITIONALLY with the rows of X (array, list or
            Series; a Series' own index is ignored).
        protected_attribute: name of the column that defines the groups.

    Returns:
        (resampled_X, resampled_y): a DataFrame with X's columns and a Series
        of labels, both with a fresh 0..n-1 index. Labels keep their original
        dtype instead of being coerced to float.

    A group whose labels contain fewer than two classes cannot be resampled
    and is passed through unchanged.
    """
    # Positional labels: avoids index-alignment errors when y is a Series
    # whose index differs from X's.
    labels = np.asarray(y)

    # Collect per-group pieces and concatenate once at the end; growing a
    # frame with concat inside the loop is quadratic and, when seeded with an
    # empty float Series, silently turns integer labels into floats.
    X_parts = []
    y_parts = []

    for group in X[protected_attribute].unique():
        mask = (X[protected_attribute] == group).to_numpy()
        X_group = X[mask]
        y_group = labels[mask]

        if np.unique(y_group).size < 2:
            # Nothing to balance with a single class; keep the rows as they are.
            X_parts.append(X_group)
            y_parts.append(pd.Series(y_group))
            continue

        pipeline = Pipeline(steps=[
            ('oversample', SMOTE(sampling_strategy='auto', random_state=42)),
            ('undersample', RandomUnderSampler(sampling_strategy='auto', random_state=42)),
        ])
        X_res, y_res = pipeline.fit_resample(X_group, y_group)

        X_parts.append(pd.DataFrame(X_res, columns=X.columns))
        y_parts.append(pd.Series(y_res))

    if not X_parts:
        # Empty input: return empty containers with the expected structure.
        return X.iloc[0:0].copy(), pd.Series(dtype=labels.dtype)

    resampled_X = pd.concat(X_parts, ignore_index=True)
    resampled_y = pd.concat(y_parts, ignore_index=True)

    return resampled_X, resampled_y


# Baseline: train on the unmodified training split, then report fairness of
# the resulting test-set predictions.
base_model_name = "Base Model"
print("\n=== Base Model ===")
base_model, base_predictions = train_and_evaluate_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_decile,
    y_test=y_test_decile,
    model_name=base_model_name,
)
evaluate_fairness(X_test, y_test_decile, base_predictions, model_name=base_model_name)


=== Base Model ===

=== Base Model Training and Evaluation ===

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      1153
           1       0.67      0.66      0.67      1012

    accuracy                           0.69      2165
   macro avg       0.69      0.69      0.69      2165
weighted avg       0.69      0.69      0.69      2165


=== Fairness Metrics for Base Model ===

Race Fairness Metrics:
Equal Opportunity Difference: 0.2725040916530278
Average Odds Difference: 0.2475651339360092
Disparate Impact: 1.9816801426115747
Statistical Parity Difference: 0.2998104219327241
Equalized Odds Difference: 0.2725040916530278
===== Privileged Group Metrics =====
TPR:  0.5
FPR:  0.17857142857142858
TNR:  0.8214285714285714
FNR:  0.5
PPV:  0.6460176991150443
===== Unprivileged Group Metrics =====
TPR:  0.7725040916530278
FPR:  0.40119760479041916
TNR:  0.5988023952095808
FNR:  0.22749590834697217
PPV:  0.701337295690

In [21]:
# Race-based resampling: balance the labels separately for the rows with
# race_African-American set and for all remaining rows, retrain on the
# resampled data, and evaluate on the untouched test split.
race_model_name = "Race-Resampled Model"
print("\n=== Race-Based Resampling Model ===")
X_train_resampled_race, y_train_resampled_race = balanced_resample_by_group(
    X=X_train, y=y_train_decile, protected_attribute='race_African-American'
)
race_model, race_predictions = train_and_evaluate_model(
    X_train=X_train_resampled_race,
    X_test=X_test,
    y_train=y_train_resampled_race,
    y_test=y_test_decile,
    model_name=race_model_name,
)
evaluate_fairness(X_test, y_test_decile, race_predictions, model_name=race_model_name)


=== Race-Based Resampling Model ===

=== Race-Resampled Model Training and Evaluation ===

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.69      0.69      1153
           1       0.65      0.66      0.66      1012

    accuracy                           0.67      2165
   macro avg       0.67      0.67      0.67      2165
weighted avg       0.67      0.67      0.67      2165


=== Fairness Metrics for Race-Resampled Model ===

Race Fairness Metrics:
Equal Opportunity Difference: 0.026892809900679326
Average Odds Difference: -0.0022677025489474534
Disparate Impact: 1.1078078713499788
Statistical Parity Difference: 0.049533346295936254
Equalized Odds Difference: 0.03142821499857423
===== Privileged Group Metrics =====
TPR:  0.6506849315068494
FPR:  0.33482142857142855
TNR:  0.6651785714285714
FNR:  0.3493150684931507
PPV:  0.5588235294117647
===== Unprivileged Group Metrics =====
TPR:  0.6775777414075287
FPR:  0.3033932135728

In [22]:
# Age-based resampling: balance the labels separately for the
# "Greater than 45" rows and for all remaining rows, retrain on the resampled
# data, and evaluate on the untouched test split.
# Note: evaluate_fairness reports race and sex metrics only, not age metrics.
age_model_name = "Age-Resampled Model"
print("\n=== Age-Based Resampling Model ===")
X_train_resampled_age, y_train_resampled_age = balanced_resample_by_group(
    X=X_train, y=y_train_decile, protected_attribute='age_cat_Greater than 45'
)
age_model, age_predictions = train_and_evaluate_model(
    X_train=X_train_resampled_age,
    X_test=X_test,
    y_train=y_train_resampled_age,
    y_test=y_test_decile,
    model_name=age_model_name,
)
evaluate_fairness(X_test, y_test_decile, age_predictions, model_name=age_model_name)


=== Age-Based Resampling Model ===

=== Age-Resampled Model Training and Evaluation ===

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.71      1153
           1       0.66      0.64      0.65      1012

    accuracy                           0.68      2165
   macro avg       0.68      0.68      0.68      2165
weighted avg       0.68      0.68      0.68      2165


=== Fairness Metrics for Age-Resampled Model ===

Race Fairness Metrics:
Equal Opportunity Difference: 0.1955529897092123
Average Odds Difference: 0.17455597161537034
Disparate Impact: 1.6418293936279549
Statistical Parity Difference: 0.22464028776978417
Equalized Odds Difference: 0.1955529897092123
===== Privileged Group Metrics =====
TPR:  0.5376712328767124
FPR:  0.22767857142857142
TNR:  0.7723214285714286
FNR:  0.4623287671232877
PPV:  0.6061776061776062
===== Unprivileged Group Metrics =====
TPR:  0.7332242225859247
FPR:  0.3812375249500998
TNR:  

In [23]:
# Sex-based resampling: balance the labels separately for sex_Male == 1 and
# sex_Male == 0 rows, retrain on the resampled data, and evaluate on the
# untouched test split.
sex_model_name = "Sex-Resampled Model"
print("\n=== Sex-Based Resampling Model ===")
X_train_resampled_sex, y_train_resampled_sex = balanced_resample_by_group(
    X=X_train, y=y_train_decile, protected_attribute='sex_Male'
)
sex_model, sex_predictions = train_and_evaluate_model(
    X_train=X_train_resampled_sex,
    X_test=X_test,
    y_train=y_train_resampled_sex,
    y_test=y_test_decile,
    model_name=sex_model_name,
)
evaluate_fairness(X_test, y_test_decile, sex_predictions, model_name=sex_model_name)


=== Sex-Based Resampling Model ===

=== Sex-Resampled Model Training and Evaluation ===

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.69      0.69      1153
           1       0.65      0.67      0.66      1012

    accuracy                           0.68      2165
   macro avg       0.68      0.68      0.68      2165
weighted avg       0.68      0.68      0.68      2165


=== Fairness Metrics for Sex-Resampled Model ===

Race Fairness Metrics:
Equal Opportunity Difference: 0.24807748357733783
Average Odds Difference: 0.22976033431789616
Disparate Impact: 1.8003736545647202
Statistical Parity Difference: 0.27904919307797005
Equalized Odds Difference: 0.24807748357733783
===== Privileged Group Metrics =====
TPR:  0.5342465753424658
FPR:  0.22767857142857142
TNR:  0.7723214285714286
FNR:  0.4657534246575342
PPV:  0.6046511627906976
===== Unprivileged Group Metrics =====
TPR:  0.7823240589198036
FPR:  0.43912175648702595
TNR

In [24]:
# Train and evaluate model with sex-based resampling (for females)
print("\n=== Sex-Based Resampling Model (Female) ===")
# Build the temporary female indicator (sex_Male == 0) on a COPY of the
# training features. Assigning the column directly on X_train would leave it
# there for every other cell, so re-running an earlier cell would train on a
# feature that X_test does not have.
# Note: sex_Female is the complement of sex_Male, so the rows are split into
# the same two groups as grouping on 'sex_Male'.
X_train_with_female = X_train.assign(
    sex_Female=(X_train['sex_Male'] == 0).astype(int)
)
X_train_resampled_sex, y_train_resampled_sex = balanced_resample_by_group(
    X_train_with_female, y_train_decile, 'sex_Female'
)
# Remove the temporary column before training so the features match X_test
X_train_resampled_sex = X_train_resampled_sex.drop('sex_Female', axis=1)

sex_model, sex_predictions = train_and_evaluate_model(
    X_train_resampled_sex, X_test, y_train_resampled_sex, y_test_decile,
    "Sex-Resampled Model (Female)"
)
evaluate_fairness(X_test, y_test_decile, sex_predictions, "Sex-Resampled Model (Female)")


=== Sex-Based Resampling Model (Female) ===

=== Sex-Resampled Model (Female) Training and Evaluation ===

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.69      0.69      1153
           1       0.65      0.67      0.66      1012

    accuracy                           0.68      2165
   macro avg       0.68      0.68      0.68      2165
weighted avg       0.68      0.68      0.68      2165


=== Fairness Metrics for Sex-Resampled Model (Female) ===

Race Fairness Metrics:
Equal Opportunity Difference: 0.24807748357733783
Average Odds Difference: 0.22976033431789616
Disparate Impact: 1.8003736545647202
Statistical Parity Difference: 0.27904919307797005
Equalized Odds Difference: 0.24807748357733783
===== Privileged Group Metrics =====
TPR:  0.5342465753424658
FPR:  0.22767857142857142
TNR:  0.7723214285714286
FNR:  0.4657534246575342
PPV:  0.6046511627906976
===== Unprivileged Group Metrics =====
TPR:  0.7823240589198036
FP

In [25]:
# # Train and evaluate model with sex-based resampling
# print("\n=== Sex-Based Resampling Model ===")
# X_train_resampled_sex, y_train_resampled_sex = balanced_resample_by_group(
#     X_train, y_train_decile, 'sex_Female'
# )
# sex_model, sex_predictions = train_and_evaluate_model(
#     X_train_resampled_sex, X_test, y_train_resampled_sex, y_test_decile,
#     "Sex-Resampled Model"
# )
# evaluate_fairness(X_test, y_test_decile, sex_predictions, "Sex-Resampled Model")