 # Definitions

 ## Libraries

In [1]:
import pickle

import dice_ml
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

# Notebook-wide pandas display settings: show every column, cap rendered rows at 100.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

 ## Data

In [2]:
# Load the pickled dictionary that holds the train/test splits.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts written by a trusted run of this project.
with open('output/data.pkl', 'rb') as data_file:
    data = pickle.load(data_file)

# Unpack the splits: features, the *_true targets and the *_compas targets.
X_train, y_train_true, y_train_compas = (
    data['X_train'],
    data['y_train_true'],
    data['y_train_compas'],
)
X_test, y_test_true, y_test_compas = (
    data['X_test'],
    data['y_test_true'],
    data['y_test_compas'],
)

# Load the pickled model (same trust caveat as above).
with open('output/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
# Columns treated as protected attributes.
# NOTE(review): no visible cell in this notebook reads this list — confirm it is still needed.
protected_features = [
    "is_black",
    "is_male",
    "age",
]

In [4]:
# Report the shape of every split: features first, then the *_compas targets,
# then the *_true targets.
for label, split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train_compas shape:", y_train_compas),
    ("y_test_compas shape:", y_test_compas),
    ("y_train_true shape:", y_train_true),
    ("y_test_true shape:", y_test_true),
):
    print(label, split.shape)

X_train shape: (4937, 9)
X_test shape: (1235, 9)
y_train_compas shape: (4937,)
y_test_compas shape: (1235,)
y_train_true shape: (4937,)
y_test_true shape: (1235,)


# Create Counterfactuals

In [5]:
# Wrap the loaded model in a single-step pipeline and score the test set.
# Column 1 of predict_proba is kept as the probability that is thresholded later.
pipeline = Pipeline([("model", model)])
test_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

# Every column of X_test is listed; this list is what gets passed to DiCE as
# its continuous features further down.
continuous_features = [column for column in X_test.columns]

# Decision threshold: probabilities strictly above it are labelled 1, otherwise 0.
THRESHOLD = 0.5

# Per-feature weights keyed by column name.
# NOTE(review): no visible cell reads feature_weights — confirm whether it is
# still needed or should be wired into the counterfactual generation.
feature_weights = dict(
    is_male=0.1,
    age=0.1,
    juv_fel_count=0.1,
    juv_misd_count=0.1,
    juv_other_count=0.1,
    priors_count=0,
    days_b_screening_arrest=0.1,
    is_felony=0.1,
    is_black=1,
)

In [None]:
# Generate DiCE counterfactuals for every row of X_test.
#
# Requires THRESHOLD, continuous_features, pipeline, X_test and y_pred_proba from
# the cells above; df_original is built here.
#
# For each test instance DiCE (method="random") is asked for up to
# N_COUNTERFACTUALS rows whose predicted class is the opposite of the original
# one. Each counterfactual is re-scored with the pipeline and annotated with the
# original row's index, is_black value, probability and thresholded prediction.
#
# NOTE(review): DiCE decides "opposite class" with its own decision rule, while
# cf_y_pred below uses THRESHOLD; the two presumably agree only while
# THRESHOLD == 0.5 — confirm before changing the threshold.

RANDOM_SEED = 42        # base seed so a Restart & Run All reproduces the same counterfactuals
N_COUNTERFACTUALS = 5   # counterfactuals requested per test instance

feature_columns = list(X_test.columns)

df_original = X_test.copy()
df_original["proba"] = y_pred_proba
df_original["y_pred"] = np.where(df_original["proba"] > THRESHOLD, 1, 0)

dice_data = dice_ml.Data(
    dataframe=df_original.drop(columns=["proba"]),
    continuous_features=continuous_features,
    outcome_name="y_pred",
)
dice_model = dice_ml.Model(model=pipeline, backend="sklearn")
cf_generator = dice_ml.Dice(dice_data, dice_model, method="random")

results = []
skipped_indices = []  # test rows for which DiCE returned no counterfactual

for position, (index, original_row_series) in enumerate(df_original.iterrows()):
    # Query DiCE with the feature values only (no "proba" / "y_pred").
    cf_input_df = pd.DataFrame([original_row_series[feature_columns].to_dict()])

    counterfactuals = cf_generator.generate_counterfactuals(
        cf_input_df,
        total_CFs=N_COUNTERFACTUALS,
        desired_class="opposite",
        # Deterministic, but different for every row so that the sampled
        # candidates are not identical across test instances.
        random_seed=RANDOM_SEED + position,
    )

    found_cfs = counterfactuals.cf_examples_list[0].final_cfs_df
    if found_cfs is None or found_cfs.empty:
        # Random search can come back empty-handed; record the row and move on
        # instead of failing on the column assignments below.
        skipped_indices.append(index)
        continue

    # Work on a copy so DiCE's own result object is not mutated.
    cf_df = found_cfs.copy()

    # Re-score with the feature columns in the same order as X_test.
    cf_df["cf_y_pred_proba"] = pipeline.predict_proba(cf_df[feature_columns])[:, 1]
    cf_df["cf_y_pred"] = np.where(cf_df["cf_y_pred_proba"] > THRESHOLD, 1, 0)

    # iterrows() upcasts the whole row to float because of the "proba" column,
    # so the 0/1 values are cast back to int explicitly.
    cf_df["original_index"]        = index
    cf_df["original_is_black"]     = int(original_row_series["is_black"])
    cf_df["original_y_pred_proba"] = original_row_series["proba"]
    cf_df["original_y_pred"]       = int(original_row_series["y_pred"])

    results.append(cf_df)

if skipped_indices:
    print(f"No counterfactuals found for {len(skipped_indices)} test rows: {skipped_indices}")

# Combine the per-instance frames into one table.
cf_results_df = pd.concat(results, ignore_index=True)

In [7]:
# Persist the counterfactual table so it can be reloaded without regenerating it.
with open('output/cf_results_df.pkl', 'wb') as results_file:
    pickle.dump(cf_results_df, results_file)

In [8]:
# Reload the pickled counterfactual results (cf_results_df) from disk.
# NOTE(review): only unpickle files written by a trusted run — pickle.load can execute code.
with open('output/cf_results_df.pkl', 'rb') as results_file:
    cf_results_df = pickle.load(results_file)

In [9]:
# Display the reloaded counterfactual table (one row per generated counterfactual).
cf_results_df

Unnamed: 0,is_male,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,is_felony,is_black,y_pred,cf_y_pred_proba,cf_y_pred,original_index,original_is_black,original_y_pred,original_y_pred_proba
0,1.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0,0.497329,0,2681,1,1.0,0.650344
1,1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.379920,0,2681,1,1.0,0.650344
2,1.0,26.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0,0.383554,0,2681,1,1.0,0.650344
3,1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.347113,0,2681,1,1.0,0.650344
4,1.0,62.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0,0.497334,0,2681,1,1.0,0.650344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6170,1.0,56.0,0.0,11.0,0.0,2.0,0.0,1.0,1.0,0,0.400040,0,4157,1,1.0,0.576588
6171,1.0,68.0,0.0,0.0,0.0,2.0,-19.1,1.0,1.0,0,0.359985,0,4157,1,1.0,0.576588
6172,1.0,64.0,0.0,0.0,3.0,2.0,0.0,1.0,1.0,0,0.414306,0,4157,1,1.0,0.576588
6173,1.0,60.0,0.0,0.0,0.0,2.0,-24.2,1.0,1.0,0,0.362884,0,4157,1,1.0,0.576588


## Change only is_black

In [10]:
# Counterfactual that changes only "is_black".
# Requires X_test, pipeline, y_pred_proba and THRESHOLD from the cells above.

# Flip the indicator (1 - value) and leave every other feature untouched.
flipped_features = X_test.assign(is_black=1 - X_test["is_black"])

# Score the flipped rows; column 1 of predict_proba is the probability being thresholded.
flipped_proba = pipeline.predict_proba(flipped_features)[:, 1]

# Flipped features plus counterfactual scores, followed by the original
# index, is_black value, probability and thresholded prediction for comparison.
cf_is_black_df = flipped_features.assign(
    cf_y_pred_proba=flipped_proba,
    cf_y_pred=np.where(flipped_proba > THRESHOLD, 1, 0),
    original_index=X_test.index,
    original_is_black=X_test["is_black"],
    original_y_pred_proba=y_pred_proba,
    original_y_pred=np.where(y_pred_proba > THRESHOLD, 1, 0),
)

In [11]:
# Persist the is_black-only counterfactual table for later reuse.
with open('output/cf_is_black_only_results_df.pkl', 'wb') as is_black_file:
    pickle.dump(cf_is_black_df, is_black_file)

In [12]:
# Reload the pickled is_black-only counterfactual table (cf_is_black_df) from disk.
# NOTE(review): only unpickle files written by a trusted run — pickle.load can execute code.
with open('output/cf_is_black_only_results_df.pkl', 'rb') as is_black_file:
    cf_is_black_df = pickle.load(is_black_file)

In [13]:
# Display the reloaded is_black-only counterfactual table (one row per X_test instance).
cf_is_black_df

Unnamed: 0,is_male,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,is_felony,is_black,cf_y_pred_proba,cf_y_pred,original_index,original_is_black,original_y_pred_proba,original_y_pred
2681,1,26,0,0,0,13,0.0,0,0,0.620667,1,2681,1,0.650344,1
2164,1,31,0,1,0,1,0.0,0,1,0.424510,0,2164,0,0.401980,0
3875,1,27,0,0,0,2,0.0,1,0,0.492163,0,3875,1,0.520566,1
416,1,21,0,0,0,0,0.0,0,1,0.481147,0,416,0,0.458236,0
2030,0,24,0,0,0,1,-1.0,1,1,0.452004,0,2030,0,0.441711,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6773,1,67,0,0,0,0,0.0,0,1,0.286673,0,6773,0,0.263264,0
4439,1,27,0,1,0,9,-1.0,1,0,0.622438,1,4439,1,0.642877,1
1292,1,23,0,0,0,6,-1.0,1,1,0.618545,1,1292,0,0.593352,1
4200,1,31,0,0,0,0,-1.0,1,0,0.371023,0,4200,1,0.398349,0
