# Working on Group-CFEs

### Using datasets from "Retiring Adult: New Datasets for Fair Machine Learning" (https://papers.nips.cc/paper/2021/file/32e54441e6382a7fbacbbbaf3c450059-Paper.pdf)


## Data Prep

In [1]:
import numpy as np 
import pandas as pd
import alibi
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingClassifier
from scipy.spatial import distance
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier, KDTree
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
import dice_ml
from dice_ml.utils import helpers # helper functions

In [3]:
from alibi.datasets import fetch_adult

### Loading the Adult Dataset

In [4]:
# Fetch the Adult dataset through alibi and unpack the pieces used below.
adult = fetch_adult()
data, target = adult.data, adult.target
feature_names, target_names = adult.feature_names, adult.target_names
category_map_tmp = adult.category_map  # re-keyed by position in a later cell

In [5]:
def set_seed(s=0):
    """Seed NumPy's and TensorFlow's global random number generators with ``s``."""
    seed_value = s
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

In [6]:
import tensorflow as tf

In [7]:
# Fix the RNG seeds, then shuffle the rows. Features and labels are stacked
# column-wise first so that each row keeps its own label through the shuffle.
set_seed()
data_perm = np.random.permutation(np.c_[data, target])
X = data_perm[:,:-1]  # every column except the last one: the features
y = data_perm[:,-1]  # the last column: the target appended by np.c_

In [8]:
idx = 30000  # number of leading rows used for training
# NOTE(review): the test slice starts at idx+1, so the row at position idx ends up
# in neither split. This looks like an off-by-one; the feature split further down
# uses the same bounds, so both would have to change together.
y_train, y_test = y[:idx], y[idx+1:]

In [9]:
# Reorder the columns: original columns 1-7 first, then 11, then 0, then 8-10.
# NOTE: X is rebound in place, so re-running this cell permutes the columns again.
X = np.c_[X[:, 1:8], X[:, 11], X[:, 0], X[:, 8:11]]


In [10]:
# Apply the same permutation to the names as was applied to the columns of X.
# NOTE: feature_names is rebound in place, so re-running this cell permutes it again.
feature_names = feature_names[1:8] + feature_names[11:12] + feature_names[0:1] + feature_names[8:11]
print(feature_names)

['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week']


In [11]:
# Re-key the category map by position (0, 1, 2, ...), keeping its iteration order.
category_map = dict(enumerate(category_map_tmp.values()))

In [12]:
#ED check ordering of feature values for categorical. For education. 

In [13]:
# Count the distinct values observed in each of the leading categorical columns of X.
n_categories = len(category_map)
cat_vars_ord = {column: len(np.unique(X[:, column])) for column in range(n_categories)}
print(cat_vars_ord)

{0: 9, 1: 7, 2: 4, 3: 9, 4: 6, 5: 5, 6: 2, 7: 11}


In [14]:
# Min-max scale the last four columns of X into the interval given by rng.
# NOTE(review): X_num_scaled is not consumed by any cell visible here; its only
# uses are commented out.
X_num = X[:, -4:].astype(np.float32, copy=False)
xmin, xmax = X_num.min(axis=0), X_num.max(axis=0)  # per-column extrema
rng = (-1., 1.)  # target (low, high) range
X_num_scaled = (X_num - xmin) / (xmax - xmin) * (rng[1] - rng[0]) + rng[0]
#X_num_scaled_train = X_num_scaled[:idx, :]
#X_num_scaled_test = X_num_scaled[idx+1:, :]


In [15]:
#X = np.c_[X[:, :-4], X_num_scaled].astype(np.float32, copy=False)
# Split the features at the same positions as the labels; as with the labels,
# the row at position idx itself falls in neither split.
X_train, X_test = X[:idx, :], X[idx+1:, :]
print(X_train.shape, X_test.shape)


(30000, 12) (2560, 12)


In [16]:
# Standardise every column, then fit a gradient-boosting classifier (fixed random_state).
model = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=0))
model.fit(X_train, y_train)
yhat = model.predict(X_test)  # test-set predictions, reused by the cells below

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Conventional (y_true, y_pred) argument order; accuracy is symmetric, so the value is the same.
accuracy_score(y_test, yhat)

0.87421875

In [19]:
int(np.sum(model.predict(X_train) != y_train)) #these are cases that may be troublesome for NNs and NUNs for explanation

3972

## Counterfactuals

### A simple baseline; NUNs - Nearest Unlike Neighbors

In [20]:
# Show the training features as a labelled table (column order matches the reordered X).
pd.DataFrame(X_train, columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week'])

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week
0,4,4,2,1,4,4,0,9,27,0,0,44
1,4,1,1,5,1,4,0,9,27,0,0,40
2,4,0,0,6,0,4,1,9,25,0,0,40
3,4,3,0,2,0,0,1,9,46,0,1902,40
4,4,3,2,2,1,4,1,9,45,0,2824,76
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,6,4,2,5,1,4,0,9,41,0,0,15
29996,0,3,3,0,4,2,0,9,53,0,0,30
29997,4,4,1,7,1,4,1,9,20,0,0,40
29998,4,4,0,2,5,2,0,9,32,3908,0,40


#### NUN instances where people make 50k + in the training data

In [21]:
# Test rows grouped by predicted class.
negative_outcome = X_test[np.flatnonzero(yhat == False)]  # predicted to make less than 50k
positive_outcome = X_test[np.flatnonzero(yhat == True)]  # predicted to make more than 50k

# Training rows grouped by true class.
positive_train_set = X_train[np.flatnonzero(y_train == True)]  # the people who make 50k
negative_train_set = X_train[np.flatnonzero(y_train == False)]  # the people who dont make 50k
positive = negative_train_set  # second name for the same array, as bound by this cell before

In [22]:
# regularization debugging: only keep cases where model.predict(X_train) == y_train

In [23]:
# Positive-class training rows that the model also classifies correctly, in ascending row order.
positive_train_indices = list(np.flatnonzero((y_train == True) & (model.predict(X_train) == y_train)))
positive_train_set = X_train[positive_train_indices]

In [24]:
# Negative-class training rows that the model also classifies correctly, in ascending row order.
negative_train_set_indices = list(np.flatnonzero((y_train == False) & (model.predict(X_train) == y_train)))
negative_train_set = X_train[negative_train_set_indices]

In [25]:
#removing duplicates 

In [26]:
# Drop duplicate rows by round-tripping each array through a set of tuples.
# NOTE: the row order of the result follows set iteration order, not the original
# row order; the neighbour indices computed later refer to positions in these arrays.
positive_train_set = np.array([list(instance) for instance in set(map(tuple, positive_train_set.tolist()))])
negative_train_set = np.array([list(instance) for instance in set(map(tuple, negative_train_set.tolist()))])

In [27]:
np.where(y_train == True)[0].shape, np.where(y_train == False)[0].shape #NB some class imbalance

((7229,), (22771,))

In [28]:
index = 0  # position of the example query in X_test
# Index the positive training rows for Hamming-distance lookups, 30 neighbours per query.
neighbors_positive = NearestNeighbors(n_neighbors=30, metric='hamming').fit(positive_train_set) #nb could do with a better distance function
distances, indices = neighbors_positive.kneighbors(X_test[index].reshape(1,-1))

# Show the query next to its closest positive training row.
list(X_test[index]), list(positive_train_set[indices[0][0]]) # a NUN

([4, 0, 2, 8, 4, 2, 0, 9, 52, 0, 0, 60],
 [4, 5, 2, 8, 4, 2, 0, 9, 39, 15020, 0, 60])

#### Finding NNs

NB might use a custom distance function

In [29]:
# Same kind of index as neighbors_positive, built over the negative training rows.
neighbors_negative = NearestNeighbors(n_neighbors=30, metric='hamming').fit(negative_train_set) # other instances that dont get 50k   

In [30]:
def NUN_finder(query, outcome):
    """Return the nearest unlike neighbour (NUN) of ``query`` as a list.

    Parameters
    ----------
    query : numpy.ndarray
        A single instance; it is reshaped to one row before the lookup.
    outcome : str
        Class of ``query``. ``'negative'`` searches the positive training
        set, ``'positive'`` searches the negative training set.

    Returns
    -------
    list
        The feature values of the closest training row of the opposite class.

    Raises
    ------
    ValueError
        If ``outcome`` is neither ``'negative'`` nor ``'positive'``.
    """
    if outcome == 'negative':
        unlike_index, unlike_set = neighbors_positive, positive_train_set
    elif outcome == 'positive':
        unlike_index, unlike_set = neighbors_negative, negative_train_set
    else:
        # An unknown outcome used to fall through to an UnboundLocalError at the return.
        raise ValueError("outcome must be 'negative' or 'positive', got {!r}".format(outcome))

    _, indices = unlike_index.kneighbors(query.reshape(1, -1))
    return list(unlike_set[indices[0][0]])

In [31]:
def explanation_generator(query, outcome):
    """Collect the neighbours needed to explain ``query``.

    Parameters
    ----------
    query : numpy.ndarray
        A single instance; it is reshaped to one row before each lookup.
    outcome : str
        Class of ``query``: ``'negative'`` or ``'positive'``.

    Returns
    -------
    tuple
        ``(query, NUN, NNs, NUNs, same_class_indices)`` where ``NUN`` is the
        closest training row of the opposite class, ``NNs`` are the five
        closest training rows of the same class, ``NUNs`` holds the result of
        ``NUN_finder`` for each of those rows, and ``same_class_indices`` is
        the full index array returned by the same-class neighbour search.

    Raises
    ------
    ValueError
        If ``outcome`` is neither ``'negative'`` nor ``'positive'``.
    """
    if outcome == 'negative':
        same_index, same_set = neighbors_negative, negative_train_set
        unlike_index, unlike_set = neighbors_positive, positive_train_set
    elif outcome == 'positive':
        same_index, same_set = neighbors_positive, positive_train_set
        unlike_index, unlike_set = neighbors_negative, negative_train_set
    else:
        # An unknown outcome used to return None silently.
        raise ValueError("outcome must be 'negative' or 'positive', got {!r}".format(outcome))

    _, same_class_indices = same_index.kneighbors(query.reshape(1, -1))
    NNs = same_set[same_class_indices[0][0:5]]

    _, unlike_indices = unlike_index.kneighbors(query.reshape(1, -1))
    NUN = unlike_set[unlike_indices[0][0]]

    # Each same-class neighbour gets its own nearest unlike neighbour.
    NUNs = [NUN_finder(instance, outcome=outcome) for instance in NNs]

    return query, NUN, NNs, NUNs, same_class_indices
    


In [32]:
def boarderline_cases(threshold):
    """Return the test-set positions of correctly predicted, low-confidence rows.

    Parameters
    ----------
    threshold : float
        A row counts as borderline when the model's highest class
        probability for it is less than or equal to this value.

    Returns
    -------
    numpy.ndarray
        Ascending positions into ``X_test`` of rows that are borderline and
        for which the prediction in ``yhat`` equals ``y_test``.
    """
    # One batched call instead of one predict_proba call per test row.
    max_proba = model.predict_proba(X_test).max(axis=1)

    low_confidence = max_proba <= threshold
    correctly_predicted = yhat == y_test

    # Misclassified borderline rows are excluded.
    return np.flatnonzero(low_confidence & correctly_predicted)

## DiCE Counterfactuals

In [33]:
# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions

In [34]:
# Getting dataset ready using pandas: wrap the feature matrices in labelled,
# features-only DataFrames (no 'income' column).
column_names = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race',
                'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week']

x_train = pd.DataFrame(X_train, columns=column_names)
x_test = pd.DataFrame(X_test, columns=column_names)

In [35]:
x_train

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week
0,4,4,2,1,4,4,0,9,27,0,0,44
1,4,1,1,5,1,4,0,9,27,0,0,40
2,4,0,0,6,0,4,1,9,25,0,0,40
3,4,3,0,2,0,0,1,9,46,0,1902,40
4,4,3,2,2,1,4,1,9,45,0,2824,76
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,6,4,2,5,1,4,0,9,41,0,0,15
29996,0,3,3,0,4,2,0,9,53,0,0,30
29997,4,4,1,7,1,4,1,9,20,0,0,40
29998,4,4,0,2,5,2,0,9,32,3908,0,40


Given the train dataset, we construct a data object for DiCE. Since continuous and discrete features have different ways of perturbation, we need to specify the names of the continuous features. DiCE also requires the name of the output variable that the ML model will predict.

In [36]:
type(x_train.iloc[0][10])  # NOTE(review): value is discarded (not the cell's last expression); leftover check

# Cast the 'Capital Loss' column to int32 in both frames.
x_train = x_train.astype({'Capital Loss':'int32'})
x_test = x_test.astype({'Capital Loss':'int32'})
# ED change capital loss to int32 and not a string (for both train set and test set)

In [37]:
# Step 1: dice_ml.Data
# x_train holds features only, while outcome_name='income' names a column that DiCE
# expects to find in the dataframe, so attach the training labels under that name
# on a copy (x_train itself stays features-only).
dice_train_df = x_train.assign(income=y_train)
d = dice_ml.Data(dataframe=dice_train_df, continuous_features=['Age','Capital Gain', 'Capital Loss', 'Hours per week'], outcome_name='income')
# Step 2: wrap the fitted sklearn pipeline.
m = dice_ml.Model(model=model, backend="sklearn")
# Step 3: explainer using DiCE's random-sampling method.
exp = dice_ml.Dice(d, m, method='random')

In [38]:
print(np.__version__)

1.19.2


In [39]:
help(dice_ml)

Help on package dice_ml:

NAME
    dice_ml

PACKAGE CONTENTS
    counterfactual_explanations
    data
    data_interfaces (package)
    dice
    diverse_counterfactuals
    explainer_interfaces (package)
    model
    model_interfaces (package)
    utils (package)

FILE
    c:\users\eoind\anaconda3\envs\group_cf\lib\site-packages\dice_ml\__init__.py




### Materials --- Close to Decision Boundary

In [40]:
model.predict(X_test[6].reshape(1,-1))

array([1], dtype=int64)

In [41]:
#defining arguments
#features_to_vary = ['employment type', 'qualification', 'marital status', 'pob',
#                                                   'age', 'weekly hours', 'gender', 'race']

# All twelve feature columns may be changed by the counterfactual search.
features_to_vary = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
random_seed = 0  # passed to every generate_counterfactuals call in cfe_generator

In [42]:
# Labelled DataFrame copies of the two de-duplicated training subsets.
train_columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race',
                 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week']
positive_train_df, negative_train_df = (
    pd.DataFrame(np.array(rows), columns=train_columns)
    for rows in (positive_train_set, negative_train_set)
)

In [43]:
def cfe_generator(instance):
    """Generate DiCE counterfactuals for a test row and four same-class neighbours.

    Parameters
    ----------
    instance : int
        Row position in ``X_test`` / ``x_test``.

    Returns
    -------
    tuple
        Five DiCE explanation objects ``(e1, e2, e3, e4, e5)``. ``e1`` explains
        the test row itself; ``e2``-``e5`` explain the first four training rows
        returned by the same-class neighbour search for the predicted class.

    Raises
    ------
    ValueError
        If the model's prediction for the row is neither 0/False nor 1/True.
    """
    query_df = x_test.iloc[instance:instance + 1]
    predicted = model.predict(X_test[instance].reshape(1, -1))[0]

    if predicted == False:
        outcome, same_class_df = 'negative', negative_train_df
    elif predicted == True:
        outcome, same_class_df = 'positive', positive_train_df
    else:
        # An unexpected label used to surface as an UnboundLocalError at the return.
        raise ValueError("unexpected prediction {!r} for test instance {}".format(predicted, instance))

    # Element 4 of explanation_generator's result is the index array of the same-class
    # neighbour search; its first row holds positions into the same-class training set.
    neighbour_positions = explanation_generator(np.array(query_df).reshape(1, -1), outcome=outcome)[4][0]

    def _explain(frame):
        # One counterfactual of the opposite class for a single-row frame.
        return exp.generate_counterfactuals(frame, total_CFs=1, desired_class="opposite",
                                            features_to_vary=features_to_vary, random_seed=random_seed)

    # Only four neighbours are explained; the positive branch used to build a
    # fifth neighbour explanation that was never returned.
    frames = [query_df] + [same_class_df.iloc[pos:pos + 1] for pos in neighbour_positions[:4]]
    return tuple(_explain(frame) for frame in frames)
    
    #e1.visualize_as_dataframe(show_only_changes=True)

### Different Models for Robustness

(i) ANN
(ii) Linear - Model or Logistic Regression
(iii) Ensemble Models
(iv)


In [44]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [45]:
#mlp_model = make_pipeline(StandardScaler(), MLPClassifier(random_state=0))
#mlp_model.fit(X_train, y_train)
#mlp_yhat = mlp_model.predict(X_test)

In [46]:
#logistic_model = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
#logistic_model.fit(X_train, y_train)
#logistic_yhat = logistic_model.predict(X_test)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(y_test, yhat)

0.87421875

In [49]:
#accuracy_score(y_test, mlp_yhat)

In [50]:
#accuracy_score(y_test, logistic_yhat)

### Coverage Check

In [51]:
from scipy import stats

In [52]:
def group_cfe_generator(test_index):
    """Aggregate the five counterfactuals of a test row into one group counterfactual.

    For each of the first 12 feature columns, the value shared by at least
    three of the five counterfactuals is used; when there is no such majority
    value, the mean of the five values rounded to one decimal is used.

    Parameters
    ----------
    test_index : int
        Row position in the test set, forwarded to ``cfe_generator``.

    Returns
    -------
    numpy.ndarray
        Float array with one aggregated value per feature (length 12).
    """
    n_members = 5    # explanations returned by cfe_generator
    n_features = 12  # leading feature columns; later columns are not aggregated
    majority = 3     # occurrences needed for a value to win outright

    example = cfe_generator(test_index)
    cfes = [np.array(example[i].cf_examples_list[0].final_cfs_df.values[0]) for i in range(n_members)]

    group_cfe = []
    for feature in range(n_features):
        feature_values = np.array([cf[feature] for cf in cfes])

        # With five values, a value seen three or more times is necessarily the unique mode.
        value, count = Counter(feature_values.tolist()).most_common(1)[0]

        if count >= majority:
            group_cfe.append(value)
        else:
            # Builtin float: the np.float alias no longer exists in current NumPy.
            group_cfe.append(round(np.mean(feature_values.astype(float)), 1))

    return np.array(group_cfe).astype(float)


In [53]:
boarderline_cases_ = boarderline_cases(threshold=0.85)

In [54]:
group_cfe_generator(4)

array([4.0000e+00, 4.0000e+00, 1.0000e+00, 6.0000e+00, 1.0000e+00,
       4.0000e+00, 0.0000e+00, 9.0000e+00, 2.0000e+01, 0.0000e+00,
       2.4114e+03, 2.5000e+01])

In [55]:
# One aggregated group counterfactual for each of the first nine borderline cases.
group_cfe_instances = [group_cfe_generator(case) for case in boarderline_cases_[0:9]] #ed edit full

In [56]:
np.array(group_cfe_instances).shape

(9, 12)

In [57]:
#gcfe_df = pd.DataFrame(group_cfe_instances, columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week'])

In [58]:
#test_df = pd.DataFrame(np.array([X_test[instance] for instance in boarderline_cases_]), columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week'])

In [59]:
#feature_changes = np.array([len(np.where(np.array(gcfe_df - test_df)[index] != 0)[0]) for index in range(len(boarderline_cases_))])

In [60]:
cat = 'Country'
# NOTE(review): this rebinds idx, which earlier held the train/test split position
# (30000); re-running the split cells after this one would use the feature position.
idx = feature_names.index(cat)
np.array(category_map[idx])[:]  # category labels of the selected feature

array(['?', 'British-Commonwealth', 'China', 'Euro_1', 'Euro_2',
       'Latin-America', 'Other', 'SE-Asia', 'South-America',
       'United-States', 'Yugoslavia'], dtype='<U20')

In [61]:
#boarderline_cases_[80]

In [62]:
#gcfe_df.head(134)

In [63]:
#cfe_generator(boarderline_cases_[40])[4].visualize_as_dataframe(show_only_changes=True)

In [64]:
#np.round(gcfe_df - test_df, 0).iloc[80:130]

### Debugging training instance class

In [65]:
# cfe_generator returns all five explanations at once, so call it a single time
# and display each one (it used to be re-run for every element).
([explanation.visualize_as_dataframe(show_only_changes=True) for explanation in cfe_generator(boarderline_cases_[9])])

Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,5,0,7,0,2,1,0,38,0,1485,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,-,-,-,-,-,-,92726,-,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,4,0,7,0,2,1,9,33,0,0,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,-,-,-,-,-,-,45790,3356,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,3,0,4,0,2,1,0,41,0,0,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,-,-,-,-,-,-,45790,-,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,1,0,8,0,2,1,0,33,0,0,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,5,-,-,-,-,-,-,-,-,-,93,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,0,0,7,0,4,1,8,38,0,0,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,5,4,-,-,-,-,-,-,-,1


[None, None, None, None, None]

In [66]:
cfe_generator(boarderline_cases_[74])[0].visualize_as_dataframe()

Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,7,0,0,5,0,4,1,5,60,0,0,20,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,7,0,0,5,0,4,1,5,60,99751,3291,20,1


In [67]:
number_of_changes = []

for case in range(len(boarderline_cases_)):

    # Generate the five explanations once per case; calling cfe_generator inside
    # the inner loop repeated the whole generation for every element.
    case_explanations = cfe_generator(boarderline_cases_[case])

    for instance in range(5):

            q_cf_pair = case_explanations[instance]

            q = q_cf_pair.cf_examples_list[0].test_instance_df.values[0]
            cf = q_cf_pair.cf_examples_list[0].final_cfs_df.values[0]

            number_of_changes.append(len(list(np.where(q-cf !=0)[0])) -1) #subtract one for the income feature 

In [68]:
(np.array_split(number_of_changes, len(boarderline_cases_)))

[array([2, 1, 1, 2, 2]),
 array([2, 2, 2, 2, 1]),
 array([2, 1, 2, 2, 1]),
 array([2, 2, 2, 2, 2]),
 array([1, 2, 2, 2, 2]),
 array([1, 1, 2, 1, 2]),
 array([2, 2, 2, 2, 2]),
 array([2, 2, 2, 1, 2]),
 array([2, 2, 2, 2, 1]),
 array([1, 2, 1, 2, 2]),
 array([1, 2, 1, 2, 2]),
 array([2, 1, 2, 2, 1]),
 array([2, 2, 2, 1, 1]),
 array([2, 2, 2, 2, 2]),
 array([2, 2, 2, 2, 2]),
 array([2, 1, 2, 2, 2]),
 array([2, 2, 2, 2, 1]),
 array([2, 2, 2, 2, 2]),
 array([2, 2, 2, 2, 2]),
 array([2, 1, 1, 1, 2]),
 array([2, 1, 2, 2, 2]),
 array([2, 1, 2, 2, 1]),
 array([2, 2, 2, 2, 2]),
 array([1, 2, 2, 2, 1]),
 array([2, 2, 2, 1, 1]),
 array([2, 2, 2, 2, 1]),
 array([2, 1, 1, 1, 2]),
 array([1, 2, 2, 1, 2]),
 array([2, 2, 1, 1, 2]),
 array([2, 1, 2, 1, 1]),
 array([2, 2, 2, 2, 2]),
 array([2, 2, 1, 1, 2]),
 array([1, 1, 1, 2, 2]),
 array([1, 1, 2, 1, 1]),
 array([1, 1, 1, 2, 1]),
 array([2, 1, 1, 1, 2]),
 array([1, 2, 2, 2, 2]),
 array([1, 2, 2, 1, 2]),
 array([1, 2, 2, 2, 2]),
 array([2, 2, 2, 2, 2]),


In [69]:
# Split the flat change counts into one chunk per borderline case a single time
# (the split used to be recomputed for every case), then keep the cases whose
# five counts sum to 10.
changes_per_case = np.array_split(number_of_changes, len(boarderline_cases_))
two_change_cfes = np.where([chunk.sum() == 10 for chunk in changes_per_case])[0]

In [70]:
len(np.where([((np.array_split(number_of_changes, len(boarderline_cases_))[instance])).sum() == 10 for instance in range(len(boarderline_cases_))])[0])

155

In [71]:
Counter([y_test[boarderline_cases_[instance]] for instance in two_change_cfes])

Counter({1: 98, 0: 57})

In [72]:
two_change_cfes

array([  3,   6,  13,  14,  17,  18,  22,  30,  39,  40,  42,  43,  44,
        45,  53,  55,  72,  77,  80,  82,  90,  94, 100, 102, 106, 114,
       122, 123, 138, 139, 141, 142, 144, 145, 147, 151, 153, 154, 159,
       162, 166, 167, 170, 179, 181, 186, 187, 190, 191, 192, 196, 210,
       211, 223, 228, 230, 235, 249, 254, 259, 266, 274, 278, 283, 286,
       297, 300, 301, 303, 311, 312, 318, 325, 331, 340, 341, 345, 350,
       355, 356, 362, 366, 368, 372, 374, 379, 380, 388, 400, 404, 410,
       412, 417, 421, 430, 431, 434, 435, 437, 442, 446, 449, 451, 452,
       462, 463, 470, 475, 483, 493, 494, 504, 514, 516, 528, 541, 550,
       551, 552, 554, 555, 556, 560, 563, 564, 565, 574, 576, 578, 581,
       583, 586, 593, 597, 601, 604, 607, 612, 613, 614, 617, 622, 631,
       636, 637, 638, 640, 641, 646, 648, 655, 658, 661, 677, 685],
      dtype=int64)

In [73]:
case = 228

# Generate the five explanations once, then collect each query row followed by
# its counterfactual row.
case_explanations = cfe_generator(boarderline_cases_[case])

material = []
for instance in range(5):

    q_cf_pair = case_explanations[instance]
    q = q_cf_pair.cf_examples_list[0].test_instance_df.values[0]
    cf = q_cf_pair.cf_examples_list[0].final_cfs_df.values[0]

    material.append(q)
    material.append(cf)

In [74]:
material_df = pd.DataFrame(material, columns=['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Income'])
material_df

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
0,5,6,0,5,0,4,1,9,41,0,0,60,1
1,5,0,1,5,0,4,1,9,41,0,0,60,0
2,4,6,0,5,0,4,1,9,41,0,0,60,1
3,4,0,1,5,0,4,1,9,41,0,0,60,0
4,5,6,0,5,0,4,1,9,48,0,0,60,1
5,5,5,0,5,0,4,1,9,48,0,0,11,0
6,5,6,0,5,0,4,1,9,51,0,0,60,1
7,5,5,0,5,0,4,1,9,51,0,0,11,0
8,5,1,0,6,0,4,1,9,41,0,0,60,1
9,5,3,0,6,0,2,1,9,41,0,0,60,0


In [75]:
# Replace the numeric category codes in each categorical column with their labels.
for column_no in range(n_categories):
    cat_vals = category_map[column_no]
    cat_codes = [float(code) for code in range(len(cat_vals))]
    col_name = material_df.columns[column_no]
    #print(col_name,cat_vals,cat_codes)
    # Assign the result back: an inplace replace on a column selection is a
    # chained assignment and is not guaranteed to update material_df.
    material_df[col_name] = material_df[col_name].replace(cat_codes, cat_vals)

In [76]:
material_df

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
0,Self-emp-inc,Prof-School,Married,Professional,Husband,White,Male,United-States,41,0,0,60,1
1,Self-emp-inc,Associates,Never-Married,Professional,Husband,White,Male,United-States,41,0,0,60,0
2,Private,Prof-School,Married,Professional,Husband,White,Male,United-States,41,0,0,60,1
3,Private,Associates,Never-Married,Professional,Husband,White,Male,United-States,41,0,0,60,0
4,Self-emp-inc,Prof-School,Married,Professional,Husband,White,Male,United-States,48,0,0,60,1
5,Self-emp-inc,Masters,Married,Professional,Husband,White,Male,United-States,48,0,0,11,0
6,Self-emp-inc,Prof-School,Married,Professional,Husband,White,Male,United-States,51,0,0,60,1
7,Self-emp-inc,Masters,Married,Professional,Husband,White,Male,United-States,51,0,0,11,0
8,Self-emp-inc,Bachelors,Married,Sales,Husband,White,Male,United-States,41,0,0,60,1
9,Self-emp-inc,Dropout,Married,Sales,Husband,Black,Male,United-States,41,0,0,60,0


In [77]:
### balanced item set 

In [78]:
np.where(np.array([y_test[boarderline_cases_[instance]] for instance in two_change_cfes])  == 1)

(array([  0,   1,   3,   4,   5,   7,  11,  12,  13,  14,  15,  16,  17,
         18,  20,  22,  23,  24,  28,  29,  30,  31,  32,  33,  36,  38,
         39,  43,  44,  45,  46,  49,  50,  52,  54,  55,  56,  57,  58,
         60,  62,  66,  68,  69,  70,  71,  72,  73,  74,  78,  80,  82,
         83,  84,  85,  86,  88,  89,  90,  92,  93,  94,  97,  98,  99,
        100, 103, 104, 105, 106, 108, 109, 113, 114, 115, 116, 117, 118,
        121, 124, 126, 129, 130, 131, 132, 133, 134, 135, 136, 137, 140,
        144, 145, 146, 147, 150, 153, 154], dtype=int64),)

In [79]:
# Positions (in boarderline_cases_) of the two-change cases whose true label is 1.
a_b_two = two_change_cfes[y_test[boarderline_cases_[two_change_cfes]] == 1]
a_b_two
# above to below indices in boarderlinecases_ with 2 changes*

array([  3,   6,  14,  17,  18,  30,  43,  44,  45,  53,  55,  72,  77,
        80,  90, 100, 102, 106, 138, 139, 141, 142, 144, 145, 153, 159,
       162, 179, 181, 186, 187, 192, 196, 211, 228, 230, 235, 249, 254,
       266, 278, 300, 303, 311, 312, 318, 325, 331, 340, 355, 362, 368,
       372, 374, 379, 380, 400, 404, 410, 417, 421, 430, 435, 437, 442,
       446, 452, 462, 463, 470, 483, 493, 516, 528, 541, 550, 551, 552,
       556, 564, 574, 581, 583, 586, 593, 597, 601, 604, 607, 612, 617,
       637, 638, 640, 641, 655, 677, 685], dtype=int64)

In [80]:
# Positions (in boarderline_cases_) of the two-change cases whose true label is 0.
b_a_two = two_change_cfes[y_test[boarderline_cases_[two_change_cfes]] == 0]
b_a_two
#below to above indices in boarderlinecases_ with 2 changes*

array([ 13,  22,  39,  40,  42,  82,  94, 114, 122, 123, 147, 151, 154,
       166, 167, 170, 190, 191, 210, 223, 259, 274, 283, 286, 297, 301,
       341, 345, 350, 356, 366, 388, 412, 431, 434, 449, 451, 475, 494,
       504, 514, 554, 555, 560, 563, 565, 576, 578, 613, 614, 622, 631,
       636, 646, 648, 658, 661], dtype=int64)

In [84]:
def material_df_generator(case):
    """Build a readable table of query/counterfactual pairs for one borderline case.

    Parameters
    ----------
    case : int
        Position in ``boarderline_cases_``; the value stored there is passed to
        ``cfe_generator``.

    Returns
    -------
    pandas.DataFrame
        Ten rows: for each of the first five explanations returned by
        ``cfe_generator``, the query row followed by its counterfactual row.
        The first ``n_categories`` columns are decoded from numeric codes to the
        labels listed in ``category_map``.
    """
    # Generate the explanations once. The original called cfe_generator inside
    # the loop, regenerating every explanation on each of the five iterations.
    # NOTE(review): assumes cfe_generator is deterministic for a given input
    # (it is driven by a fixed random_seed elsewhere in the notebook) - confirm.
    q_cf_pairs = cfe_generator(boarderline_cases_[case])

    material = []
    for instance in range(5):
        examples = q_cf_pairs[instance].cf_examples_list[0]
        material.append(examples.test_instance_df.values[0])  # query row
        material.append(examples.final_cfs_df.values[0])      # its counterfactual

    material_df = pd.DataFrame(material, columns=['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Income'])

    for column_no in range(n_categories):
        cat_vals = category_map[column_no]
        # Codes are the list positions of the labels, compared as floats.
        cat_codes = [float(code) for code in range(len(cat_vals))]
        col_name = material_df.columns[column_no]
        # Assign the result back: an in-place replace on the selected column is
        # chained assignment and does not update the frame under Copy-on-Write.
        material_df[col_name] = material_df[col_name].replace(cat_codes, cat_vals)

    return material_df

In [85]:
# Show the five query/counterfactual explanations for borderline case 661.
# cfe_generator is called once and its first five results are displayed; the
# original re-ran the whole generator for every displayed element.
([q_cf_pair.visualize_as_dataframe(show_only_changes=True) for q_cf_pair in cfe_generator(boarderline_cases_[661])[:5]])

Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,0,0,2,0,4,1,9,43,0,0,60,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,7,-,-,-,-,-,-,-,-,-,2995,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,0,0,2,0,4,1,9,40,0,0,60,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,-,-,1,-,-,-,30105,-,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,0,0,2,0,4,1,9,43,0,0,99,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,-,1,-,-,-,-,22571,-,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,0,0,2,0,4,1,9,60,0,0,60,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,2,-,-,-,-,-,-,-,-,1344,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,4,4,0,2,0,4,1,9,43,0,0,60,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,4,-,-,-,-,-,39920,-,-,1


[None, None, None, None, None]

In [86]:
# Decoded query/counterfactual table for position 191 of boarderline_cases_
# (even rows are queries, odd rows their counterfactuals).
material_df_generator(191)

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
0,Private,High School grad,Married,Sales,Wife,White,Female,Euro_2,58,0,0,40,0
1,Private,Doctorate,Married,Sales,Wife,White,Female,Euro_2,78,0,0,40,1
2,Private,High School grad,Married,Sales,Wife,White,Female,United-States,26,0,0,40,0
3,Private,High School grad,Married,Sales,Wife,White,Female,United-States,26,0,3781,85,1
4,Private,High School grad,Married,Sales,Wife,White,Female,United-States,58,0,0,38,0
5,Private,High School grad,Married,Sales,Unmarried,White,Female,United-States,58,73255,0,38,1
6,Private,High School grad,Married,Sales,Wife,White,Female,United-States,28,0,0,40,0
7,Private,High School grad,Married,Sales,Wife,White,Female,United-States,51,90853,0,40,1
8,Private,High School grad,Married,Sales,Wife,White,Female,United-States,44,0,0,40,0
9,Private,Associates,Married,Sales,Wife,White,Female,United-States,44,78709,0,40,1


In [87]:
case = 475

# Generate the explanations for this case once; calling cfe_generator inside the
# loop regenerated all of them on each of the five iterations.
# NOTE(review): assumes cfe_generator is deterministic for a given input - confirm.
q_cf_pairs = cfe_generator(boarderline_cases_[case])

# Interleave rows: each query is followed by its counterfactual.
material = []
for instance in range(5):

    q_cf_pair = q_cf_pairs[instance]
    q = q_cf_pair.cf_examples_list[0].test_instance_df.values[0]
    cf = q_cf_pair.cf_examples_list[0].final_cfs_df.values[0]

    material.append(q)
    material.append(cf)

In [88]:
# Keep the raw query/counterfactual table, then reduce it to per-pair deltas:
# diff(1) subtracts the previous row, so every odd row holds
# (counterfactual - query) for one pair.
material_df_og = pd.DataFrame(
    material,
    columns=['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Income'],
)
material_df = material_df_og.diff(1).iloc[1::2]

In [89]:
# Per-pair deltas (counterfactual minus query); for the categorical columns the
# numbers are differences between category codes, not meaningful magnitudes.
material_df

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,58249.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2903.0,0.0,1.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2903.0,0.0,1.0
7,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3674.0,0.0,1.0
9,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,39920.0,0.0,0.0,1.0


In [90]:
# Keep only the first 12 columns (the features); this drops the trailing 'Income' delta.
material_df = material_df.iloc[: , :12]

In [91]:
# Number of pairs in which each feature changed (any non-zero delta counts as a change).
(material_df.astype(bool).sum(axis=0))

Workclass         0
Education         1
Marital Status    0
Occupation        3
Relationship      0
Race              0
Sex               0
Country           1
Age               0
Capital Gain      2
Capital Loss      3
Hours per week    0
dtype: int64

In [92]:
# Position of the most frequently changed feature. On ties argmax reports the
# first maximum, which can differ from the order the reversed argsort below gives.
(material_df.astype(bool).sum(axis=0)).argmax()
    

3

In [93]:
# Rank the features by how many pairs changed them, most-changed first.
# Tied features come out in whatever order the reversed argsort yields.
column_change = (
    material_df.astype(bool).sum(axis=0).to_numpy().argsort()[::-1]
)  # column positions, descending change count
number_of_changes = np.sort(
    material_df.astype(bool).sum(axis=0).to_numpy()
)[::-1]  # the matching change counts, descending

In [94]:
# Names of the two features at the head of the change ranking.
material_df.columns[column_change[0]], material_df.columns[column_change[1]]

('Capital Loss', 'Occupation')

In [95]:
# Change counts in descending order (same ordering as column_change).
number_of_changes

array([3, 3, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [96]:
# Raw (undiffed) table: even rows are queries, odd rows their counterfactuals.
material_df_og

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
0,4,4,0,2,0,4,1,9,37,3464,0,50,0
1,4,4,0,2,0,4,1,7,37,61713,0,50,1
2,4,4,0,2,0,4,1,9,37,0,0,50,0
3,4,4,0,3,0,4,1,9,37,0,2903,50,1
4,4,4,0,2,0,4,1,9,37,0,0,52,0
5,4,4,0,3,0,4,1,9,37,0,2903,52,1
6,4,4,0,2,0,4,1,9,30,0,0,50,0
7,4,2,0,2,0,4,1,9,30,0,3674,50,1
8,4,4,0,2,0,4,1,9,43,0,0,50,0
9,4,4,0,4,0,4,1,9,43,39920,0,50,1


In [97]:
# Column 1 of positive_train_set; with the feature order used in this notebook
# position 1 is 'Education' (values are category codes).
positive_train_set[:,1]

array([5, 1, 1, ..., 2, 1, 4])

In [98]:
def GCFE_features(case):
    """Derive a two-feature group counterfactual (GCFE) for one borderline case.

    Five query/counterfactual pairs are taken from ``cfe_generator``. The two
    features that change in the most pairs are selected, candidate value pairs
    for them are sampled from ``positive_train_set`` or ``negative_train_set``
    (chosen by the case's label in ``y_test``), and a candidate is kept when
    writing it into all five queries makes ``model`` predict the counterfactual
    label for every one of them.

    Parameters
    ----------
    case : int
        Position in ``boarderline_cases_``.

    Returns
    -------
    tuple
        ``(names, column_change, number_of_changes, feature_directions,
        gcfe_values, gcfe_values_coverage)``:

        * ``names`` - names of the two most frequently changed features.
        * ``column_change`` - column positions ordered by descending change count.
        * ``number_of_changes`` - the change counts in descending order.
        * ``feature_directions`` - 'increase', 'decrease' or 'inconclusive' for
          each of the two selected features.
        * ``gcfe_values`` - array of shape (sample size, 2) with the sampled
          candidate value pairs.
        * ``gcfe_values_coverage`` - list of the candidates that flip all five
          queries.

    Raises
    ------
    ValueError
        If no training value satisfies the direction filter for a feature.
    """
    test_index = boarderline_cases_[case]

    # The counterfactual label is the opposite of the case's label, and the
    # candidate values are drawn from the matching training subset.
    if y_test[test_index] == 0:
        cf_label = 1
        sampling_set = positive_train_set
    else:
        cf_label = 0
        sampling_set = negative_train_set

    # How many candidate GCFE value pairs to sample.
    sample_size_gcfe_pairs = 50

    # Column positions treated as numeric (Age, Capital Gain, Capital Loss,
    # Hours per week); only for these is a change direction estimated.
    numeric_columns = (8, 9, 10, 11)

    # Generate the explanations once. The original called cfe_generator inside
    # the loop, regenerating every explanation on each iteration.
    # NOTE(review): assumes cfe_generator is deterministic for a given input - confirm.
    q_cf_pairs = cfe_generator(test_index)

    material = []
    for instance in range(5):
        examples = q_cf_pairs[instance].cf_examples_list[0]
        material.append(examples.test_instance_df.values[0])  # query row
        material.append(examples.final_cfs_df.values[0])      # its counterfactual

    material_df = pd.DataFrame(material, columns=['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Income'])

    # Even rows are the queries; keep the 12 feature columns only.
    query_df = material_df.iloc[0::2, :12]

    # Odd rows of the row-wise difference hold (counterfactual - query) per pair.
    material_df = material_df.diff(1).iloc[1::2, :12]

    change_counts = np.array(material_df.astype(bool).sum(axis=0))
    column_change = change_counts.argsort()[::-1]    # indices with most changes
    number_of_changes = np.sort(change_counts)[::-1]  # number of feature changes

    feature_directions = []
    gcfe_values = []
    for feature in column_change[:2]:
        vals = sampling_set[:, feature:feature+1].flatten()
        feature_direction = 'inconclusive'

        if feature in numeric_columns:
            deltas = material_df.iloc[:, feature].values
            n_increase = int((deltas > 0).sum())
            n_decrease = int((deltas < 0).sum())
            query_values = query_df.iloc[:, feature].values

            if n_increase > n_decrease:
                feature_direction = 'increase'
                # NOTE(review): filters on the smallest query value ("GW max?"
                # in the original) - a sampled value is not guaranteed to
                # exceed every query.
                vals = vals[vals > query_values.min()]
            elif n_increase < n_decrease:
                feature_direction = 'decrease'
                vals = vals[vals < query_values.max()]

        if vals.size == 0:
            raise ValueError(
                "No training values available to sample for feature "
                f"{material_df.columns[feature]!r} with direction {feature_direction!r}"
            )

        # A fresh RandomState(0) per feature keeps the sampling reproducible.
        gcfe_values.append(np.random.RandomState(0).choice(vals, sample_size_gcfe_pairs))
        feature_directions.append(feature_direction)

    ### Now lets check coverage of the GCFE

    # One row per candidate: (value for first feature, value for second feature).
    gcfe_values = np.array(gcfe_values).T

    # Work on an explicit writable copy of the queries. Writing through
    # query_df.values only works when pandas happens to hand back a view and
    # fails on the read-only arrays returned under Copy-on-Write.
    query_matrix = query_df.to_numpy(copy=True)

    gcfe_values_coverage = []
    for candidate in gcfe_values:
        # Both columns are overwritten each time, so nothing leaks between candidates.
        query_matrix[:, column_change[0]] = candidate[0]
        query_matrix[:, column_change[1]] = candidate[1]

        predictions = model.predict(query_matrix)
        if all(prediction == cf_label for prediction in predictions):
            gcfe_values_coverage.append(candidate)
            #break use break if we only want the first one

    return (material_df.columns[column_change[0]], material_df.columns[column_change[1]]), column_change, number_of_changes, feature_directions, gcfe_values, gcfe_values_coverage

In [99]:
#positive_train_set[:,11:11+1].flatten()[positive_train_set[:,11:11+1].flatten() !=45]

In [100]:
# when we have sets of the form [[,], [,], [,], [,]]

In [101]:
# GCFE search for position 196 of boarderline_cases_; the result is the 6-tuple
# (feature names, ranking, counts, directions, candidates, covering candidates).
GCFE_features(196)

(('Hours per week', 'Sex'),
 array([11,  6,  2,  9, 10,  8,  7,  5,  4,  3,  1,  0], dtype=int64),
 array([5, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 ['decrease', 'inconclusive'],
 array([[38,  1],
        [33,  1],
        [36,  1],
        [12,  0],
        [35,  0],
        [16,  0],
        [32,  1],
        [25,  1],
        [15,  1],
        [35,  1],
        [25,  0],
        [33,  0],
        [17,  1],
        [20,  0],
        [25,  1],
        [38,  0],
        [35,  0],
        [15,  1],
        [20,  1],
        [20,  1],
        [15,  0],
        [35,  0],
        [20,  1],
        [30,  1],
        [28,  1],
        [22,  1],
        [12,  0],
        [36,  0],
        [ 8,  1],
        [30,  0],
        [38,  1],
        [30,  1],
        [21,  1],
        [35,  1],
        [30,  1],
        [35,  1],
        [35,  0],
        [38,  0],
        [20,  1],
        [ 8,  0],
        [20,  0],
        [25,  0],
        [38,  0],
        [38,  0],
        [20,  1],
 

In [102]:
# Some aspects regarding changes

#(i) feature ties
#(ii) Education & Categorical Features
#(iii) Choosing a candidate from the random set

In [103]:
from sklearn_extra.cluster import KMedoids

In [104]:
# Run the GCFE search for position 551 and pick one representative candidate:
# item_set[5] is the list of covering candidates returned by GCFE_features, and
# a single-cluster KMedoids fit reports one of them as the cluster centre.
item_set = GCFE_features(551)
kmedoids = KMedoids(n_clusters=1, random_state=0).fit(item_set[5])
item_set, kmedoids.cluster_centers_

((('Capital Loss', 'Age'),
  array([10,  8,  3, 11,  6,  2,  0,  9,  7,  5,  4,  1], dtype=int64),
  array([2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=int64),
  ['increase', 'inconclusive'],
  array([[1590,   24],
         [1876,   50],
         [ 213,   33],
         [1740,   27],
         [1504,   59],
         [2051,   28],
         [1602,   28],
         [1672,   40],
         [1740,   18],
         [1719,   24],
         [2129,   74],
         [2206,   21],
         [2258,   40],
         [2002,   63],
         [1672,   49],
         [1669,   46],
         [1719,   42],
         [2002,   38],
         [1590,   58],
         [1726,   29],
         [1258,   19],
         [1741,   24],
         [1485,   26],
         [1648,   57],
         [1669,   31],
         [1726,   36],
         [2179,   36],
         [2001,   45],
         [2080,   27],
         [1876,   29],
         [1974,   80],
         [1669,   48],
         [2205,   43],
         [1408,   23],
         [1602,   72],
    

In [105]:
# Lookup from categorical column position to its list of labels; a label's
# position in the list is the numeric code used for it in the data.
category_map

{0: ['?',
  'Federal-gov',
  'Local-gov',
  'Never-worked',
  'Private',
  'Self-emp-inc',
  'Self-emp-not-inc',
  'State-gov',
  'Without-pay'],
 1: ['Associates',
  'Bachelors',
  'Doctorate',
  'Dropout',
  'High School grad',
  'Masters',
  'Prof-School'],
 2: ['Married', 'Never-Married', 'Separated', 'Widowed'],
 3: ['?',
  'Admin',
  'Blue-Collar',
  'Military',
  'Other',
  'Professional',
  'Sales',
  'Service',
  'White-Collar'],
 4: ['Husband',
  'Not-in-family',
  'Other-relative',
  'Own-child',
  'Unmarried',
  'Wife'],
 5: ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'],
 6: ['Female', 'Male'],
 7: ['?',
  'British-Commonwealth',
  'China',
  'Euro_1',
  'Euro_2',
  'Latin-America',
  'Other',
  'SE-Asia',
  'South-America',
  'United-States',
  'Yugoslavia']}

In [106]:
# Test-set indices of the borderline cases; the `case` arguments used in this
# notebook are positions in this array.
boarderline_cases_

array([   0,    5,    6,   10,   16,   23,   24,   28,   33,   35,   39,
         45,   46,   47,   51,   52,   59,   62,   66,   70,   73,   74,
         78,   82,   84,   97,  101,  103,  105,  109,  113,  115,  117,
        118,  123,  126,  127,  128,  129,  135,  143,  149,  152,  154,
        155,  158,  161,  165,  168,  175,  179,  183,  189,  191,  196,
        197,  198,  216,  217,  218,  229,  230,  232,  234,  240,  245,
        248,  249,  252,  255,  258,  260,  261,  263,  265,  266,  267,
        271,  274,  277,  281,  288,  295,  299,  304,  308,  312,  319,
        333,  348,  352,  354,  355,  356,  358,  360,  361,  369,  370,
        372,  380,  388,  392,  393,  401,  404,  407,  409,  410,  416,
        417,  420,  427,  429,  431,  445,  446,  447,  449,  460,  469,
        474,  478,  481,  484,  486,  487,  489,  496,  497,  500,  502,
        504,  506,  510,  511,  513,  516,  517,  519,  520,  521,  528,
        529,  530,  531,  532,  534,  535,  536,  5

#### Selecting feature changes for the purpose of balancing materials

In [107]:
#['Age', 'Hours per week', 'Occupation', 'Sex', 'Workclass'] 

In [108]:
def cfe_generator_balance(instance):
    """Generate five counterfactual explanations restricted to a fixed feature subset.

    One explanation is generated for the test instance itself and four for
    training rows selected through ``explanation_generator``. Only 'Age',
    'Hours per week', 'Occupation', 'Sex' and 'Workclass' may be varied.

    Parameters
    ----------
    instance : int
        Row position in ``X_test`` / ``x_test``.

    Returns
    -------
    tuple
        ``(e1, e2, e3, e4, e5)`` - the objects returned by
        ``exp.generate_counterfactuals``; ``e1`` belongs to the test instance,
        the others to the selected training rows.
    """
    features_to_vary_balance = ['Age', 'Hours per week', 'Occupation', 'Sex', 'Workclass']

    # The model's prediction decides which training subset the companion
    # instances come from. A falsy prediction selects the negative branch and
    # anything else the positive branch, so e1..e5 are always bound.
    if model.predict(X_test[instance].reshape(1,-1))[0]:
        outcome, neighbour_df = 'positive', positive_train_df
    else:
        outcome, neighbour_df = 'negative', negative_train_df

    # TODO (from the original author): check whether the indices returned by
    # explanation_generator refer to the full training set or to the per-class
    # subset they are used to index here.
    # Element [4][0] of the result is used as the row indices of the companions
    # (presumably nearest neighbours - confirm against explanation_generator).
    NNs = explanation_generator((np.array(x_test[instance:instance+1])).reshape(1,-1), outcome = outcome)[4][0]

    def _counterfactual(query_instance):
        # Same generation settings for the query and for every companion row.
        return exp.generate_counterfactuals(query_instance, total_CFs=1, desired_class="opposite",
                                            features_to_vary=features_to_vary_balance, random_seed=random_seed)

    e1 = _counterfactual(x_test[instance:instance+1])

    # Only four companions are returned; the original also generated a sixth
    # explanation in the positive branch that was never used.
    e2, e3, e4, e5 = (
        _counterfactual(neighbour_df[NNs[position]:NNs[position]+1])
        for position in range(4)
    )

    return e1,e2,e3,e4,e5

In [109]:
# Show the five restricted-feature explanations for position 39 of
# boarderline_cases_. The generator is called once; the original re-ran it for
# every displayed element.
([q_cf_pair.visualize_as_dataframe(show_only_changes=True) for q_cf_pair in cfe_generator_balance(boarderline_cases_[39])])

Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,1,4,0,1,0,4,1,9,36,0,0,35,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,4,-,-,-,-,-,-,-,60,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,1,4,0,1,0,4,1,9,36,0,0,70,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,8,-,-,-,-,56,-,-,-,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,1,4,0,1,0,4,1,9,50,0,0,48,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,6,-,-,-,-,-,-,-,78,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,1,4,0,1,0,4,1,9,47,0,0,40,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,6,-,-,-,-,-,-,-,89,1


Query instance (original outcome : 0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,1,4,0,1,0,4,1,9,38,0,0,45,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,income
0,-,-,-,8,-,-,-,-,78,-,-,-,1


[None, None, None, None, None]

In [110]:
def GCFE_features_balance(case):
    """Derive a two-feature group counterfactual (GCFE) from restricted-feature explanations.

    Works like the unrestricted GCFE search but takes its five
    query/counterfactual pairs from ``cfe_generator_balance``. The two features
    that change in the most pairs are selected, candidate value pairs for them
    are sampled from ``positive_train_set`` or ``negative_train_set`` (chosen by
    the case's label in ``y_test``), and a candidate is kept when writing it
    into all five queries makes ``model`` predict the counterfactual label for
    every one of them.

    Parameters
    ----------
    case : int
        Position in ``boarderline_cases_``.

    Returns
    -------
    tuple
        ``(names, column_change, number_of_changes, feature_directions,
        gcfe_values, gcfe_values_coverage)``:

        * ``names`` - names of the two most frequently changed features.
        * ``column_change`` - column positions ordered by descending change count.
        * ``number_of_changes`` - the change counts in descending order.
        * ``feature_directions`` - 'increase', 'decrease' or 'inconclusive' for
          each of the two selected features.
        * ``gcfe_values`` - array of shape (sample size, 2) with the sampled
          candidate value pairs.
        * ``gcfe_values_coverage`` - list of the candidates that flip all five
          queries.

    Raises
    ------
    ValueError
        If no training value satisfies the direction filter for a feature.
    """
    test_index = boarderline_cases_[case]

    # The counterfactual label is the opposite of the case's label, and the
    # candidate values are drawn from the matching training subset.
    if y_test[test_index] == 0:
        cf_label = 1
        sampling_set = positive_train_set
    else:
        cf_label = 0
        sampling_set = negative_train_set

    # How many candidate GCFE value pairs to sample.
    sample_size_gcfe_pairs = 50

    # Column positions treated as numeric (Age, Capital Gain, Capital Loss,
    # Hours per week); only for these is a change direction estimated.
    numeric_columns = (8, 9, 10, 11)

    # Generate the explanations once. The original called the generator inside
    # the loop, regenerating every explanation on each iteration.
    # NOTE(review): assumes the generator is deterministic for a given input - confirm.
    q_cf_pairs = cfe_generator_balance(test_index)

    material = []
    for instance in range(5):
        examples = q_cf_pairs[instance].cf_examples_list[0]
        material.append(examples.test_instance_df.values[0])  # query row
        material.append(examples.final_cfs_df.values[0])      # its counterfactual

    material_df = pd.DataFrame(material, columns=['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country', 'Age', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Income'])

    # Even rows are the queries; keep the 12 feature columns only.
    query_df = material_df.iloc[0::2, :12]

    # Odd rows of the row-wise difference hold (counterfactual - query) per pair.
    material_df = material_df.diff(1).iloc[1::2, :12]

    change_counts = np.array(material_df.astype(bool).sum(axis=0))
    column_change = change_counts.argsort()[::-1]    # indices with most changes
    number_of_changes = np.sort(change_counts)[::-1]  # number of feature changes

    feature_directions = []
    gcfe_values = []
    for feature in column_change[:2]:
        vals = sampling_set[:, feature:feature+1].flatten()
        feature_direction = 'inconclusive'

        if feature in numeric_columns:
            deltas = material_df.iloc[:, feature].values
            n_increase = int((deltas > 0).sum())
            n_decrease = int((deltas < 0).sum())
            query_values = query_df.iloc[:, feature].values

            if n_increase > n_decrease:
                feature_direction = 'increase'
                # NOTE(review): filters on the smallest query value ("GW max?"
                # in the original) - a sampled value is not guaranteed to
                # exceed every query.
                vals = vals[vals > query_values.min()]
            elif n_increase < n_decrease:
                feature_direction = 'decrease'
                vals = vals[vals < query_values.max()]

        if vals.size == 0:
            raise ValueError(
                "No training values available to sample for feature "
                f"{material_df.columns[feature]!r} with direction {feature_direction!r}"
            )

        # A fresh RandomState(0) per feature keeps the sampling reproducible.
        gcfe_values.append(np.random.RandomState(0).choice(vals, sample_size_gcfe_pairs))
        feature_directions.append(feature_direction)

    ### Now lets check coverage of the GCFE

    # One row per candidate: (value for first feature, value for second feature).
    gcfe_values = np.array(gcfe_values).T

    # Work on an explicit writable copy of the queries. Writing through
    # query_df.values only works when pandas happens to hand back a view and
    # fails on the read-only arrays returned under Copy-on-Write.
    query_matrix = query_df.to_numpy(copy=True)

    gcfe_values_coverage = []
    for candidate in gcfe_values:
        # Both columns are overwritten each time, so nothing leaks between candidates.
        query_matrix[:, column_change[0]] = candidate[0]
        query_matrix[:, column_change[1]] = candidate[1]

        predictions = model.predict(query_matrix)
        if all(prediction == cf_label for prediction in predictions):
            gcfe_values_coverage.append(candidate)
            #break use break if we only want the first one

    return (material_df.columns[column_change[0]], material_df.columns[column_change[1]]), column_change, number_of_changes, feature_directions, gcfe_values, gcfe_values_coverage

In [111]:
# GCFE search for position 534 of boarderline_cases_, built from the
# restricted-feature explanations of cfe_generator_balance.
GCFE_features_balance(534)

(('Hours per week', 'Age'),
 array([11,  8,  3, 10,  9,  7,  6,  5,  4,  2,  1,  0], dtype=int64),
 array([4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 ['increase', 'increase'],
 array([[45, 53],
        [99, 46],
        [50, 49],
        [50, 49],
        [60, 46],
        [50, 68],
        [60, 53],
        [70, 38],
        [45, 39],
        [67, 38],
        [50, 37],
        [60, 46],
        [60, 41],
        [50, 44],
        [50, 50],
        [45, 41],
        [45, 43],
        [60, 57],
        [55, 46],
        [50, 59],
        [45, 46],
        [60, 53],
        [60, 30],
        [45, 42],
        [45, 43],
        [55, 54],
        [50, 33],
        [60, 33],
        [60, 41],
        [45, 55],
        [50, 62],
        [50, 39],
        [50, 57],
        [55, 37],
        [55, 39],
        [45, 63],
        [75, 46],
        [50, 40],
        [50, 51],
        [55, 55],
        [45, 42],
        [44, 60],
        [50, 38],
        [50, 35],
        [45, 39],
     

In [113]:
# Same representative-candidate selection as for the unrestricted search:
# item_set[5] is the list of covering candidates returned by
# GCFE_features_balance, reduced to a single medoid.
item_set = GCFE_features_balance(534)
kmedoids = KMedoids(n_clusters=1, random_state=0).fit(item_set[5])
item_set, kmedoids.cluster_centers_

((('Hours per week', 'Age'),
  array([11,  8,  3, 10,  9,  7,  6,  5,  4,  2,  1,  0], dtype=int64),
  array([4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
  ['increase', 'increase'],
  array([[45, 53],
         [99, 46],
         [50, 49],
         [50, 49],
         [60, 46],
         [50, 68],
         [60, 53],
         [70, 38],
         [45, 39],
         [67, 38],
         [50, 37],
         [60, 46],
         [60, 41],
         [50, 44],
         [50, 50],
         [45, 41],
         [45, 43],
         [60, 57],
         [55, 46],
         [50, 59],
         [45, 46],
         [60, 53],
         [60, 30],
         [45, 42],
         [45, 43],
         [55, 54],
         [50, 33],
         [60, 33],
         [60, 41],
         [45, 55],
         [50, 62],
         [50, 39],
         [50, 57],
         [55, 37],
         [55, 39],
         [45, 63],
         [75, 46],
         [50, 40],
         [50, 51],
         [55, 55],
         [45, 42],
         [44, 60],
         [50,

In [114]:
# Decoded query/counterfactual table for position 534. material_df_generator
# uses cfe_generator (all features may vary), not cfe_generator_balance, so
# these counterfactuals are not the ones behind GCFE_features_balance(534).
material_df_generator(534)

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Age,Capital Gain,Capital Loss,Hours per week,Income
0,Private,Bachelors,Married,Blue-Collar,Husband,White,Male,British-Commonwealth,33,0,0,40,0
1,Private,Bachelors,Married,Blue-Collar,Wife,White,Male,British-Commonwealth,33,0,2956,40,1
2,Private,Bachelors,Married,Blue-Collar,Husband,White,Male,United-States,33,0,0,40,0
3,Private,Associates,Married,Blue-Collar,Husband,White,Male,United-States,33,52548,0,40,1
4,Private,Bachelors,Married,Blue-Collar,Husband,White,Male,British-Commonwealth,29,0,0,40,0
5,Private,Bachelors,Married,Blue-Collar,Wife,White,Male,British-Commonwealth,29,0,2956,40,1
6,Private,Bachelors,Married,Blue-Collar,Husband,Other,Male,South-America,33,0,0,40,0
7,Private,Bachelors,Married,Blue-Collar,Husband,Other,Male,South-America,33,52620,0,40,1
8,Private,Bachelors,Married,Blue-Collar,Husband,White,Male,United-States,37,0,0,40,0
9,Private,Associates,Married,Blue-Collar,Husband,White,Male,United-States,37,44069,0,40,1
