In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import dice_ml
from optbinning import OptimalBinning
from matplotlib import pyplot as plt
from typing import List
import numpy as np
import pandas as pd

In [20]:
# Single seed for the notebook; defined first so the classifier can share it
# with the train/test split.
RANDOM_STATE = 37
# Model that the counterfactuals explain. The seed only matters for solvers
# that shuffle the data; it is passed so that swapping the solver keeps
# results reproducible.
CLASSIFIER = LogisticRegression(max_iter=500, random_state=RANDOM_STATE)

## Load the UCI Adult Dataset (Census Income)

In [21]:
# Column names are supplied explicitly through `names=` when reading the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
data = pd.read_csv(
    url,
    names=columns,
    na_values="?",          # "?" entries are read as missing values
    skipinitialspace=True,  # drop the whitespace that follows each delimiter
)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
# Independent copy of the frame as loaded, taken before the preprocessing cell
# rewrites `data['income']` to 0/1. The binning cells further down read from it.
unstandardized_data = data.copy()

## Preprocessing the data

In [23]:
# Encode the target: 1 for '>50K', 0 otherwise.
# The dtype guard keeps this cell idempotent: once `income` is numeric, calling
# `.str` on it again would raise, so the encoding is applied only the first time.
if not pd.api.types.is_numeric_dtype(data['income']):
    data['income'] = (data['income'].str.strip() == '>50K').astype(int)


categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
continuous_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Continuous columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at prediction time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing and the classifier are chained so both are fitted together.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', CLASSIFIER)])

X = data.drop('income', axis=1)
y = data['income']
# 80/20 split, seeded so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

## Fit the model

In [24]:
# Fits the whole pipeline (preprocessor, then classifier) on the training split only.
clf.fit(X_train, y_train)

## Set Up DiCE

In [25]:
# DiCE receives the training split (features joined with the outcome column)
# together with the continuous/categorical feature lists, and the fitted
# pipeline wrapped with the sklearn backend.
dice_data = dice_ml.Data(
    dataframe=pd.concat([X_train, y_train], axis="columns"),
    outcome_name='income',
    categorical_features=categorical_features,
    continuous_features=continuous_features,
)
dice_model = dice_ml.Model(backend='sklearn', model=clf)
exp = dice_ml.Dice(dice_data, dice_model)

## Generate Counterfactuals

In [26]:
# Show the first held-out row as a one-row frame.
print("Query instance:")
X_test.head(1)

Query instance:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
23751,32,Private,171814,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States


In [27]:
# One-row frame (not a Series) holding the instance to explain.
query_instance = X_test.iloc[0:1]
# `exp` was built without a `method` argument, i.e. with DiCE's default
# explainer, which samples counterfactuals at random. Pinning `random_seed`
# makes the three counterfactuals reproducible across runs.
dice_exp = exp.generate_counterfactuals(
    query_instance,
    total_CFs=3,
    desired_class="opposite",
    # Only these features may differ from the query instance.
    features_to_vary=[
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'hours-per-week',
    ],
    random_seed=RANDOM_STATE,
)
# Counterfactuals for the first (and only) query instance, as a DataFrame.
cf = dice_exp.cf_examples_list[0].final_cfs_df
cf

100%|██████████| 1/1 [00:00<00:00,  7.30it/s]


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,32,State-gov,171814,HS-grad,9,Married-AF-spouse,Machine-op-inspct,Wife,White,Female,0,0,99,United-States,1
1,32,,171814,HS-grad,9,Married-civ-spouse,Protective-serv,Not-in-family,White,Female,0,0,94,United-States,1
2,32,Private,171814,5th-6th,9,Married-civ-spouse,Sales,Wife,White,Female,0,0,40,United-States,1


## Applying Binning

In [48]:
from typing import List
import numpy as np
import pandas as pd

class GlobalBinner:
    """Labels numeric values 'Low', 'Mid' or 'High' against dataset-wide cut points.

    The cut points are the 33rd and 67th percentiles of each selected column of
    ``data``; they are computed once, at construction time.
    """

    # Label given to missing values, which cannot be compared against the edges.
    MISSING_LABEL = "Missing"

    def __init__(self, data: pd.DataFrame, columns: List[str] = None):
        """
        Args:
            data: Reference frame the percentiles are computed from.
            columns: Columns to bin. When omitted or empty, every numeric
                column of ``data`` is used.
        """
        self.data = data
        self.columns = columns or self._get_numeric_columns()
        self.bin_edges = self._compute_bin_edges()

    def _get_numeric_columns(self) -> List[str]:
        """Identifies numeric columns in the data."""
        return self.data.select_dtypes(include=[np.number]).columns.tolist()

    def _compute_bin_edges(self) -> pd.DataFrame:
        """Calculates the 33rd and 67th percentiles for binning.

        Returns:
            Frame indexed by quantile (0.33, 0.67) with one column per binned column.
        """
        return self.data[self.columns].quantile([0.33, 0.67])

    def _label(self, value, low_edge, high_edge) -> str:
        """Returns the bin label for one value; both edges are inclusive upper bounds."""
        if pd.isna(value):
            # NaN fails every comparison and would otherwise fall through to "High".
            return self.MISSING_LABEL
        if value <= low_edge:
            return "Low"
        if value <= high_edge:
            return "Mid"
        return "High"

    def _transform(self, instance, with_values: bool):
        """Shared implementation of both public transforms.

        Accepts a Series (one instance) or a DataFrame (one instance per row)
        and returns a binned copy of the same kind; ``instance`` is not modified.
        """
        is_frame = isinstance(instance, pd.DataFrame)
        # A Series is cast to object first so string labels can be stored in it
        # whatever its original dtype; a frame column is simply replaced.
        binned = instance.copy() if is_frame else instance.astype(object)
        for col in self.columns:
            low_edge, high_edge = self.bin_edges[col]
            values = instance[col].values if is_frame else [instance[col]]
            labels = [self._label(value, low_edge, high_edge) for value in values]
            if with_values:
                labels = [f"{value} ({label})" for value, label in zip(values, labels)]
            binned[col] = labels if is_frame else labels[0]
        return binned

    def transform_instance(self, instance: "pd.Series | pd.DataFrame") -> "pd.Series | pd.DataFrame":
        """Bins a query instance into 'Low', 'Mid', or 'High' ranges.

        Args:
            instance: A Series, or a DataFrame whose rows are each binned
                independently (a one-row frame behaves as a single instance).

        Returns:
            A copy of ``instance`` in which every binned column holds its label;
            other columns are unchanged. Missing values are labelled 'Missing'.

        Raises:
            KeyError: If ``instance`` lacks one of the binned columns.
        """
        return self._transform(instance, with_values=False)

    def transform_instance_with_values(self, instance: "pd.Series | pd.DataFrame") -> "pd.Series | pd.DataFrame":
        """Bins a query instance and includes the original values.

        Same contract as ``transform_instance``, except that each binned cell is
        rendered as ``"<value> (<label>)"``, e.g. ``"32 (Mid)"``.
        """
        return self._transform(instance, with_values=True)
    
# The continuous feature list is passed explicitly, as the LocalBinner cell does.
# Auto-detecting numeric columns would also pick up `income` once a later cell
# has rewritten `unstandardized_data['income']` to integers, and the query
# instance has no such column.
global_binner = GlobalBinner(unstandardized_data, continuous_features)
binned_query_instance = global_binner.transform_instance_with_values(query_instance)
binned_query_instance

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
23751,32 (Mid),Private,171814 (Mid),HS-grad,9 (Low),Never-married,Machine-op-inspct,Not-in-family,White,Female,0 (Low),0 (Low),40 (Low),United-States


In [None]:
class LocalBinner:
    """Annotates counterfactual values with how they changed relative to a query instance.

    Each change is measured against the min-max range the column has in ``data``.
    A change larger than ``threshold`` times that range is flagged as "Much".
    """

    def __init__(self, data: pd.DataFrame, columns: List[str] = None, threshold: float = 0.3):
        """
        Args:
            data: Reference frame the per-column min and max are taken from.
            columns: Columns to annotate. When omitted or empty, every numeric
                column of ``data`` is used.
            threshold: Fraction of the column's range above which a change is
                reported as "Much Higher" / "Much Lower".
        """
        self.data = data
        self.columns = columns or self._get_numeric_columns()
        self.threshold = threshold
        self.data_stats = self._compute_data_stats()

    def _get_numeric_columns(self) -> List[str]:
        """Identifies numeric columns in the data."""
        return self.data.select_dtypes(include=[np.number]).columns.tolist()

    def _compute_data_stats(self) -> pd.DataFrame:
        """Returns a frame with rows 'min' and 'max' and one column per annotated column."""
        return self.data[self.columns].describe().loc[['min', 'max']]

    def transform_relative(self, counterfactual: pd.DataFrame, query_instance: pd.Series) -> pd.DataFrame:
        """Renders each counterfactual value together with its change versus the query.

        Args:
            counterfactual: One counterfactual per row.
            query_instance: The original instance, as a Series. A one-row
                DataFrame is also accepted and reduced to its single row.

        Returns:
            A copy of ``counterfactual`` in which every annotated column holds
            ``"<value> (unchanged)"``, ``"<value> (Higher)"``, ``"<value> (Lower)"``,
            ``"<value> (Much Higher)"`` or ``"<value> (Much Lower)"``. Other
            columns are unchanged.

        Raises:
            ValueError: If ``query_instance`` is a DataFrame without exactly one row.
        """
        if isinstance(query_instance, pd.DataFrame):
            if len(query_instance) != 1:
                raise ValueError("query_instance must contain exactly one row")
            # Subtracting a one-row frame's column would align on the index and
            # produce NaN for every counterfactual row, so reduce it to a Series.
            query_instance = query_instance.iloc[0]

        binned_cf = counterfactual.copy()
        for col in self.columns:
            query_value = query_instance[col]
            cf_values = counterfactual[col]
            data_min, data_max = self.data_stats[col]
            data_range = data_max - data_min

            # Direction and "changed at all" come from the raw difference, so they
            # stay well defined even when the column has no spread.
            deltas = cf_values - query_value
            change_directions = np.where(deltas >= 0, 'Higher', 'Lower')
            changed = deltas != 0

            if data_range > 0:
                significant_change = (deltas.abs() / data_range) > self.threshold
            else:
                # No spread to scale by (0/0 would give NaN): any change at all
                # counts as a large one.
                significant_change = changed

            binned_cf[col] = [
                f"{value} (Much {direction})" if change and significant
                else f"{value} ({direction})" if change
                else f"{value} (unchanged)"
                for value, direction, change, significant in zip(cf_values, change_directions, changed, significant_change)
            ]
        return binned_cf
    
# Annotate the counterfactuals relative to the single query row, which is
# handed over as a Series rather than as a one-row frame.
binner = LocalBinner(data=unstandardized_data, columns=continuous_features)
binned_cf = binner.transform_relative(cf, query_instance.iloc[0])
binned_cf

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,32 (unchanged),Private,171814 (unchanged),HS-grad,9 (unchanged),Married-civ-spouse,Exec-managerial,Not-in-family,White,Female,0 (unchanged),0 (unchanged),85 (Much Higher),United-States,1
1,32 (unchanged),Private,171814 (unchanged),HS-grad,9 (unchanged),Married-AF-spouse,Exec-managerial,Not-in-family,White,Female,0 (unchanged),0 (unchanged),92 (Much Higher),United-States,1
2,32 (unchanged),Private,171814 (unchanged),HS-grad,9 (unchanged),Married-civ-spouse,Exec-managerial,Not-in-family,White,Female,0 (unchanged),0 (unchanged),86 (Much Higher),United-States,1


In [None]:
# Encode the target as 0/1 for the binning fit. The dtype guard lets this cell
# be re-run: once `income` is numeric, calling `.str` on it again would raise.
if not pd.api.types.is_numeric_dtype(unstandardized_data['income']):
    unstandardized_data['income'] = (unstandardized_data['income'].str.strip() == '>50K').astype(int)
# Bin `age` against the income target, allowing at most three bins.
optb = OptimalBinning(name="age", dtype="numerical", solver="cp", max_n_bins=3)
optb.fit(unstandardized_data["age"], unstandardized_data["income"])
binning_table = optb.binning_table.build()

In [None]:
# Display the table built from `optb.binning_table` in the previous cell.
binning_table

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 21.50)",3130,0.096127,3125,5,0.001597,5.289505,0.665304,0.042027
1,"[21.50, 29.50)",6581,0.202113,6075,506,0.076888,1.337154,0.242319,0.028218
2,"[29.50, inf)",22850,0.70176,15520,7330,0.320788,-0.398092,0.122214,0.015177
3,Special,0,0.0,0,0,0.0,0.0,0.0,0.0
4,Missing,0,0.0,0,0,0.0,0.0,0.0,0.0
Totals,,32561,1.0,24720,7841,0.24081,,1.029837,0.085421
