# Tutorial: Counterfactual explanations for scorecard with binary target

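This tutorial shows how to generate counterfactual explanations for a scorecard model with a binary target using the `Counterfactual` class from `optbinning.scorecard`. A counterfactual explanation indicates which feature values of a given query would need to change for the model to produce a desired outcome.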

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from optbinning import BinningProcess
from optbinning import Scorecard
from optbinning.scorecard import Counterfactual

We use the Adult dataset (`data/adult.data`), restricted to 8 of its features. The outcome is income, which is binarized to 0 (low-income, <=50K) or 1 (high-income, >50K).

In [2]:
# Column names of the raw file, which is read without a header row.
columns = ["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "income"]

target = "income"

# Subset of features used to build the scorecard.
variable_names = ["age", "workclass", "education", "marital-status",
                  "occupation", "race", "sex", "hours-per-week"]

# Keep the raw frame under its own name so the working frame below is always
# derived from it, whichever cells were run before.
df_raw = pd.read_csv("data/adult.data", sep=",", header=None)
df_raw.columns = columns

# Work on an explicit copy of the selected columns: assigning the binarized
# target then never writes into (or warns about) a slice of the raw frame.
df = df_raw[variable_names + [target]].copy()

# Raw labels carry a leading space (' >50K'); map them to 1 and the rest to 0.
df[target] = (df[target] == " >50K").astype(int)

#### Scorecard model

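We build a scorecard from a binning process over the selected variables and a logistic regression with balanced class weights. The scores are scaled with the min-max method to the range [300, 850].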

In [3]:
# Logistic regression with balanced class weights used as scorecard estimator.
estimator = LogisticRegression(class_weight="balanced", solver="lbfgs")

# Binning process applied to the selected variables.
binning_process = BinningProcess(variable_names)

# Scorecard points are scaled with the min-max method to the range [300, 850].
scorecard = Scorecard(
    binning_process=binning_process,
    estimator=estimator,
    target=target,
    scaling_method="min_max",
    scaling_method_params={"min": 300, "max": 850},
)

scorecard.fit(df)

Scorecard(binning_process=BinningProcess(binning_fit_params=None,
                                         binning_transform_params=None,
                                         categorical_variables=None,
                                         max_bin_size=None, max_n_bins=None,
                                         max_n_prebins=20, max_pvalue=None,
                                         max_pvalue_policy='consecutive',
                                         min_bin_size=None, min_n_bins=None,
                                         min_prebin_size=0.05, n_jobs=None,
                                         selection_criteria=None,
                                         special_codes=None, split_digits=None,
                                         v...
                                       dual=False, fit_intercept=True,
                                       intercept_scaling=1, l1_ratio=None,
                                       max_iter=100, multi_class='auto',
  

#### Generating counterfactual explanations - binary outcome

##### Single counterfactual

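As the query, we select the sample with the lowest predicted probability of the positive class (high-income) and keep only its feature values, without the target.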

In [4]:
# Row position of the sample with the lowest predicted probability of class 1.
idx_lowest = np.argmin(scorecard.predict_proba(df)[:, 1])

# Select the features by name rather than dropping the last column by
# position, so the query stays correct if the column order of df changes.
query = df[variable_names].iloc[idx_lowest].to_dict()

In [5]:
query

{'age': 17,
 'workclass': ' Private',
 'education': ' 11th',
 'marital-status': ' Never-married',
 'occupation': ' Other-service',
 'race': ' Black',
 'sex': ' Female',
 'hours-per-week': 25}

We instantiate the `Counterfactual` class with the fitted scorecard and fit it with the training data.

In [6]:
cf = Counterfactual(scorecard=scorecard)

In [7]:
cf.fit(df)

Counterfactual(n_jobs=1,
               scorecard=Scorecard(binning_process=BinningProcess(binning_fit_params=None,
                                                                  binning_transform_params=None,
                                                                  categorical_variables=None,
                                                                  max_bin_size=None,
                                                                  max_n_bins=None,
                                                                  max_n_prebins=20,
                                                                  max_pvalue=None,
                                                                  max_pvalue_policy='consecutive',
                                                                  min_bin_size=None,
                                                                  min_n_bins=None,
                                                                  min_prebin_size=0.05,
   

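We generate a single counterfactual (`n_cf=1`) for the desired binary outcome `y=1`, allowing at most three feature changes (`max_changes=3`).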

In [8]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1, max_changes=3)

Counterfactual(n_jobs=1,
               scorecard=Scorecard(binning_process=BinningProcess(binning_fit_params=None,
                                                                  binning_transform_params=None,
                                                                  categorical_variables=None,
                                                                  max_bin_size=None,
                                                                  max_n_bins=None,
                                                                  max_n_prebins=20,
                                                                  max_pvalue=None,
                                                                  max_pvalue_policy='consecutive',
                                                                  min_bin_size=None,
                                                                  min_n_bins=None,
                                                                  min_prebin_size=0.05,
   

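The `status` attribute and the `information` method report the solver status and the details of the optimization problem that was solved.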

In [9]:
cf.status

'OPTIMAL'

In [10]:
cf.information(print_level=2)

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Begin options
    scorecard                            yes   * U
    special_missing                    False   * d
    n_jobs                                 1   * d
    verbose                            False   * d
  End options

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                   88
    Number of constraints                 42
    Objective value                  16.1490
    Best objective bound             16.1490

  Objectives
    proximity                         3.7558
    closeness                        12.3932

  Timing
    Total time                          0.22 sec
    Fit                                 0.13 sec   ( 58.29%)
    Solver                              0.08 sec   ( 36.68%)
    Post-processing                     0.01 sec   ( 13.71%)



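The `display` method shows the generated counterfactual. With `show_only_changes=True`, the features that keep the value of the query are shown as `-`; with `show_outcome=True`, the outcome of the counterfactual is appended as an extra column.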

In [11]:
cf.display()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[33.50, 35.50)",Private,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",Other-service,Black,Female,25


In [12]:
cf.display(show_only_changes=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[33.50, 35.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,-


In [13]:
cf.display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[33.50, 35.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,-,0.505995


##### Actionable features

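The parameter `actionable_features` restricts the features that are allowed to change. Here `marital-status`, `race` and `sex` are left out of the list, so they keep the values of the query.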

In [14]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1, max_changes=4,
            actionable_features=["age", "workclass", "education", "occupation",
                                 "hours-per-week"]
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[43.50, 49.50)",-,"[ Masters, Prof-school, Doctorate]",-,"[ Tech-support, Protective-serv, Prof-specia...",-,-,"[39.50, 41.50)",0.503416


##### Weighted vs hierarchical method

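The parameter `method` selects how the objectives (proximity and closeness) are combined. With `method="weighted"`, the objective value is the weighted sum of the objectives; the weights are passed through `objectives` (in the first example below, without explicit weights, the objective value equals proximity + closeness).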

In [15]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1,
            method="weighted", max_changes=4).display()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[37.50, 40.50)",Private,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",Black,Female,25


In [16]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                   88
    Number of constraints                 42
    Objective value                  14.0037
    Best objective bound             14.0024

  Objectives
    proximity                         4.1646
    closeness                         9.8391

  Timing
    Total time                          0.30 sec
    Fit                                 0.13 sec   ( 43.44%)
    Solver                              0.16 sec   ( 54.23%)
    Post-processing                     0.01 sec   (  4.28%)



In [17]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1,
            method="weighted", objectives={"proximity": 0.1, "closeness": 0.9},
            max_changes=4
           ).display()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[43.50, 49.50)",Private,"[ Masters, Prof-school, Doctorate]",Never-married,"[ Tech-support, Protective-serv, Prof-specia...",Black,Female,"[39.50, 41.50)"


In [18]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                   88
    Number of constraints                 42
    Objective value                   9.0395
    Best objective bound              9.0395

  Objectives
    proximity                         5.5872
    closeness                         9.4231

  Timing
    Total time                          0.39 sec
    Fit                                 0.13 sec   ( 33.32%)
    Solver                              0.25 sec   ( 64.93%)
    Post-processing                     0.01 sec   (  2.70%)



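With `method="hierarchical"`, the values passed through `objectives` are interpreted as priorities instead of weights, and the objectives are optimized one after another. The reported objective value then corresponds to a single objective rather than to their sum.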

In [19]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1,
            method="hierarchical", max_changes=4
           ).display()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[35.50, 37.50)",Private,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",Other-service,Black,Female,"[25.50, 34.50)"


In [20]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                   90
    Number of constraints                 44
    Objective value                  12.0849
    Best objective bound             12.0849

  Objectives
    proximity                         3.9645
    closeness                        12.0849

  Timing
    Total time                          0.22 sec
    Fit                                 0.13 sec   ( 60.64%)
    Solver                              0.08 sec   ( 35.59%)
    Post-processing                     0.01 sec   ( 10.60%)



In [21]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1,
            method="hierarchical", objectives={"proximity": 1, "closeness": 2},
            max_changes=4
           ).display()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[37.50, 40.50)",Private,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",Black,Female,25


In [22]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                   90
    Number of constraints                 44
    Objective value                   4.1646
    Best objective bound              4.1646

  Objectives
    closeness                         9.8391
    proximity                         4.1646

  Timing
    Total time                          0.42 sec
    Fit                                 0.13 sec   ( 30.90%)
    Solver                              0.29 sec   ( 67.54%)
    Post-processing                     0.01 sec   (  2.31%)



##### Multiple counterfactuals

To generate several counterfactuals for the same query, set `n_cf` to the desired number (here `n_cf=3`).

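The hard constraint `diversity_features` is added so that the counterfactuals differ in the features they change. Since the resulting problem is considerably larger, a time limit of 5 seconds is set; the solver stops with status `FEASIBLE`, i.e., with a gap between the objective value and the best objective bound.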

In [23]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=3, max_changes=4,
            hard_constraints=["diversity_features"], time_limit=5
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[43.50, 49.50)",-,"[ Masters, Prof-school, Doctorate]",-,[ Sales],-,-,"[41.50, 49.50)",0.514491
0,"[40.50, 43.50)",-,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[34.50, 39.50)",0.537689
0,"[37.50, 40.50)",-,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.508597


In [24]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : FEASIBLE                        

  Solver statistics
    Type                                 mip
    Number of variables                  408
    Number of constraints                957
    Objective value                  44.0018
    Best objective bound             40.1499

  Objectives
    proximity                        14.1411
    closeness                        29.8606

  Timing
    Total time                          5.29 sec
    Fit                                 0.13 sec   (  2.47%)
    Solver                              5.15 sec   ( 97.21%)
    Post-processing                     0.02 sec   (  0.33%)



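Increasing the time limit to 15 seconds allows the solver to reach status `OPTIMAL`, with a slightly lower objective value (43.7051 vs. 44.0018).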

In [25]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=3, max_changes=4,
            hard_constraints=["diversity_features"], time_limit=15
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[43.50, 49.50)",-,"[ Masters, Prof-school, Doctorate]",-,"[ Tech-support, Protective-serv, Prof-specia...",-,-,"[39.50, 41.50)",0.503416
0,"[40.50, 43.50)",-,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[34.50, 39.50)",0.537689
0,"[37.50, 40.50)",-,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.508597


In [26]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                  408
    Number of constraints                957
    Objective value                  43.7051
    Best objective bound             43.7051

  Objectives
    proximity                        14.0688
    closeness                        29.6363

  Timing
    Total time                         10.66 sec
    Fit                                 0.13 sec   (  1.23%)
    Solver                             10.51 sec   ( 98.61%)
    Post-processing                     0.02 sec   (  0.17%)



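Compared to the run with the 5-second time limit, only the first counterfactual changes (in `occupation` and `hours-per-week`). Next, we add the hard constraint `diversity_values`, so that the counterfactuals also differ in the values taken by the changed features.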

In [27]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=3, max_changes=4,
            hard_constraints=["diversity_features", "diversity_values"], time_limit=15
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[40.50, 43.50)",-,"[ Assoc-acdm, Assoc-voc]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[41.50, 49.50)",0.52943
0,"[49.50, 54.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-spouse-absent, Widowed, Divorced]","[ Tech-support, Protective-serv, Prof-specia...",-,-,-,0.510208
0,"[43.50, 49.50)",-,[ Bachelors],-,[ Exec-managerial],-,-,"[49.50, 55.50)",0.518267


In [28]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                  408
    Number of constraints                981
    Objective value                  47.7067
    Best objective bound             47.7067

  Objectives
    proximity                        16.0514
    closeness                        31.6553

  Timing
    Total time                         10.91 sec
    Fit                                 0.13 sec   (  1.20%)
    Solver                             10.76 sec   ( 98.63%)
    Post-processing                     0.02 sec   (  0.17%)



#### Generating counterfactual explanations - probability outcome

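We now request a target probability instead of a binary outcome. The query is passed as a single-row DataFrame; its predicted probability of the positive class is close to zero.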

In [29]:
# Single-row frame holding the query, with columns in the order of its keys.
df_query = pd.DataFrame([query], columns=list(query))

In [30]:
scorecard.predict_proba(df_query)

array([[9.99831890e-01, 1.68109773e-04]])

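We target a probability `y=0.7` and add the soft constraint `diff_outcome` with weight 1, which penalizes the deviation of the counterfactual outcome from the target. With this small weight, the outcomes remain far from 0.7.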

In [31]:
# Target probability 0.7 with a low-weight soft constraint on the outcome gap.
cf.generate(
    query=df_query,
    y=0.7,
    outcome_type="probability",
    n_cf=2,
    max_changes=4,
    hard_constraints=["diversity_features"],
    soft_constraints={"diff_outcome": 1},
).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[23.50, 25.50)",-,[ HS-grad],-,"[ ?, Armed-Forces, Farming-fishing]",-,-,"[25.50, 34.50)",0.007926
0,"[23.50, 25.50)",-,[ HS-grad],-,"[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.007168


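Increasing the weight of `diff_outcome` to 100 yields counterfactuals whose outcomes are close to the target 0.7.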

In [32]:
cf.generate(query=df_query, y=0.7, outcome_type="probability", n_cf=2, max_changes=4,
            hard_constraints=["diversity_features"],
            soft_constraints={"diff_outcome": 100}
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[40.50, 43.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[34.50, 39.50)",0.69751
0,"[43.50, 49.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.702461


In [33]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                  290
    Number of constraints                493
    Objective value                  32.4569
    Best objective bound             32.4569

  Objectives
    proximity                         9.1380
    closeness                        22.6536
    diff_outcome                      0.0067

  Timing
    Total time                          2.12 sec
    Fit                                 0.13 sec   (  6.17%)
    Solver                              1.98 sec   ( 93.25%)
    Post-processing                     0.01 sec   (  0.63%)



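Alternatively, the hard constraint `min_outcome` can be used instead of the soft constraint; the outcomes of the resulting counterfactuals are at least the target 0.7.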

In [34]:
cf.generate(query=df_query, y=0.7, outcome_type="probability", n_cf=2, max_changes=4,
            hard_constraints=["diversity_features", "min_outcome"]
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[49.50, 54.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[34.50, 39.50)",0.733405
0,"[49.50, 54.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.715094


In [35]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                  290
    Number of constraints                493
    Objective value                  31.7672
    Best objective bound             31.7672

  Objectives
    proximity                         9.2342
    closeness                        22.5330

  Timing
    Total time                          1.39 sec
    Fit                                 0.13 sec   (  9.42%)
    Solver                              1.25 sec   ( 89.67%)
    Post-processing                     0.01 sec   (  1.01%)



In [36]:
cf.generate(query=df_query, y=0.7, outcome_type="probability", n_cf=2,
            max_changes=4, method="hierarchical",
            objectives={"proximity": 2, "closeness": 1},
            hard_constraints=["min_outcome"],
            soft_constraints={"diversity_features": 2, "diversity_values": 1}
           ).display(show_only_changes=True, show_outcome=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,"[49.50, 54.50)",-,"[ Masters, Prof-school, Doctorate]","[ Married-AF-spouse, Married-civ-spouse]","[ ?, Armed-Forces, Farming-fishing]",-,-,-,0.715094
0,"[40.50, 43.50)",-,[ Bachelors],"[ Married-AF-spouse, Married-civ-spouse]",-,-,-,"[49.50, 55.50)",0.718292


In [37]:
cf.information()

optbinning (Version 0.11.0)
Copyright (c) 2019-2021 Guillermo Navas-Palencia, Apache License 2.0

  Status  : OPTIMAL                         

  Solver statistics
    Type                                 mip
    Number of variables                  296
    Number of constraints                498
    Objective value                  -6.0000
    Best objective bound             -6.0000

  Objectives
    proximity                         9.9447
    diversity_features                2.0000
    closeness                        23.7715
    diversity_values                  6.0000

  Timing
    Total time                          3.07 sec
    Fit                                 0.13 sec   (  4.26%)
    Solver                              2.92 sec   ( 95.33%)
    Post-processing                     0.01 sec   (  0.42%)



#### Special and missing bins

In [38]:
cf = Counterfactual(scorecard=scorecard, special_missing=True)

In [39]:
cf.fit(df)

Counterfactual(n_jobs=1,
               scorecard=Scorecard(binning_process=BinningProcess(binning_fit_params=None,
                                                                  binning_transform_params=None,
                                                                  categorical_variables=None,
                                                                  max_bin_size=None,
                                                                  max_n_bins=None,
                                                                  max_n_prebins=20,
                                                                  max_pvalue=None,
                                                                  max_pvalue_policy='consecutive',
                                                                  min_bin_size=None,
                                                                  min_n_bins=None,
                                                                  min_prebin_size=0.05,
   

In [40]:
cf.generate(query=query, y=1, outcome_type="binary", n_cf=1, max_changes=4
           ).display(show_only_changes=True)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week
0,"[49.50, 54.50)",-,"[ Assoc-acdm, Assoc-voc]","[ Married-AF-spouse, Married-civ-spouse]",-,-,-,Missing
