In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split

import pandas as pd
from jmetal.algorithm.multiobjective.nsgaii import NSGAII
from jmetal.algorithm.multiobjective.nsgaiii import NSGAIII, UniformReferenceDirectionFactory
from jmetal.operator import PolynomialMutation, SBXCrossover
from jmetal.util.termination_criterion import StoppingByEvaluations
from jmetal.util.solution import get_non_dominated_solutions
from problem import CounterfactualConsensus
import pickle
# import tensorflow as tf

from jmetal.lab.visualization import Plot
from jmetal.util.observer import ProgressBarObserver
# from multiprocessing import Pool
from disagreement import Disagreement
from results_manager import ResultsManager

### Load the Multi-layer Perceptron models from Drive


In [2]:
with open('credit_risk/repeat_mlp_classifiers', 'rb') as f:
  repeat_mlp_classifiers = pickle.load(f)
  f.close()

# explainers
with open('credit_risk/repeat_discern_mlp_counterfactuals', 'rb') as f:
    repeat_discern_mlp_counterfactuals = pickle.load(f)
    f.close()
    
with open('credit_risk/repeat_dice_mlp_counterfactuals', 'rb') as f:
    repeat_dice_mlp_counterfactuals = pickle.load(f)
    f.close()

with open('credit_risk/repeat_nice_mlp_counterfactuals', 'rb') as f:
    repeat_nice_mlp_counterfactuals = pickle.load(f)
    f.close()
    
with open('credit_risk/repeat_wachter_mlp_counterfactuals', 'rb') as f:
    repeat_wachter_mlp_counterfactuals = pickle.load(f)
    f.close()
    
with open('credit_risk/repeat_mlp_predictions', 'rb') as f:
    repeat_mlp_predictions = pickle.load(f)
    f.close()



Trying to unpickle estimator LabelBinarizer from version 1.2.2 when using version 1.3.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator MLPClassifier from version 1.2.2 when using version 1.3.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [3]:
# Set seed for reproducability
seed = 42
np.random.seed(seed)
random.seed(seed)
# tf.random.set_seed(seed)

In [4]:
# Number of unique random numbers you want to generate
num_unique_numbers = 20

# Range of random numbers (you can adjust the range as needed)
lower_bound = 1
upper_bound = 100

# We only want unique splits
unique_numbers_set = set()

# Loop until we get 20 unique data splits
while len(unique_numbers_set) < num_unique_numbers:
    # Generate a random number within the specified range
    random_number = np.random.randint(lower_bound, upper_bound + 1)
    unique_numbers_set.add(random_number)

# Convert the set of unique numbers to a Python list
repeats = list(unique_numbers_set)

# Show seeds to create the new splits
print(repeats)

[2, 3, 15, 21, 22, 24, 30, 38, 52, 53, 60, 61, 64, 72, 75, 83, 87, 88, 93, 100]


In [5]:
df = pd.read_csv("data/loan_data.csv")
df

Unnamed: 0,loan_amnt,total_pymnt,total_rec_int,term,int_rate,installment,home_ownership,annual_inc,verification_status,purpose,loan_status
0,0.282051,0.070368,0.080069,0.0,0.381868,0.266072,1.0,0.046525,0.0,0.3,0
1,0.158974,0.043524,0.134878,0.0,0.690345,0.170473,1.0,0.095348,0.0,0.3,0
2,0.128846,0.032908,0.029482,0.0,0.219388,0.114056,0.0,0.059449,0.0,0.2,0
3,0.102564,0.110758,0.001140,0.0,0.104003,0.086058,1.0,0.059449,1.0,0.1,1
4,0.743590,0.721722,0.341581,0.0,0.381868,0.698949,1.0,0.282022,0.0,0.3,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,0.358974,0.072260,0.090625,0.0,0.104003,0.303200,1.0,0.092476,1.0,0.2,0
1996,0.358974,0.363095,0.187983,1.0,0.286499,0.212053,1.0,0.131246,0.0,0.2,1
1997,0.282051,0.287175,0.130538,1.0,0.286499,0.165141,1.0,0.095616,0.0,0.3,1
1998,0.102564,0.113341,0.017602,0.0,0.381868,0.097734,1.0,0.195864,0.0,0.0,1


In [6]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

In [7]:
# Initialise our dataset splits
repeat_train_data, repeat_test_data = [], []
repeat_y_train, repeat_y_test = [], []

# Create the repeated splits
for repeat_seed in repeats:

  # Split the dataset into training and test sets
  train_data, test_data, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=repeat_seed, stratify=y)

  # Save the splits
  repeat_train_data.append(train_data)
  repeat_test_data.append(test_data)
  repeat_y_train.append(y_train)
  repeat_y_test.append(y_test)

In [8]:
# Pregnancies, skin thickness, diabetes pedigree function and age cannot be changed
immutable_features = ['home_ownership']
immutable_feature_idxs = [6]
idxs_to_vary = [0,1,2,4,5,7,8,9]
features_to_vary = ['loan_amnt', 'total_pymnt', 'total_rec_int', 'term','int_rate', 'installment', 'annual_inc', 'verification_status', 'purpose']

# Feature names
feature_names = df.columns[:-1]
feature_idxs = np.arange(len(feature_names))

# Get the categorical features from the data frame
categorical_feature_idxs = [3,6,8,9]
categorical_features = ['term', 'home_ownership', 'verification_status', 'purpose']

# All features before late_coursework are continuous
continuous_feature_idxs = [0,1,2,4,5,7]
continuous_features = ['loan_amnt', 'total_pymnt', 'total_rec_int', 'int_rate', 'installment', 'annual_inc']

# Target variable
class_name = "loan_status"
desired_idx = 0 # 0 indicates that the user likely won't default

In [9]:
# Initialise the lists to hold the positive and negative instances for each repeat
# repeat_rf_positive_instances, repeat_rf_negative_instances = [], []
repeat_mlp_positive_instances, repeat_mlp_negative_instances = [], []

# Initialise the lists to hold the positive and negative predictions for each repeat
# repeat_rf_positive_predictions, repeat_rf_negative_predictions = [], []
repeat_mlp_positive_predictions, repeat_mlp_negative_predictions = [], []

# Get the positive and negative instances and predictions for each repetition
for repetition in range(len(repeats)):

  # Initialise the indices that will be used to filter the dataset
  # rf_positive_idxs, rf_negative_idxs = [], []
  mlp_positive_idxs, mlp_negative_idxs = [], []

  # Find the positive and negative cases
  for i, instance in enumerate(repeat_test_data[repetition].iterrows()):

    # Filter our misclassified samples from the Random Forest data
    # if(repeat_rf_predictions[repetition][i] == repeat_y_train[repetition].to_numpy()[i]):
    #   if(repeat_rf_predictions[repetition][i] == desired_idx):
    #     rf_positive_idxs.append(i)
    #   else:
    #     rf_negative_idxs.append(i)

    # Filter our misclassified samples from the Multi-layer Perceptron data
    if(repeat_mlp_predictions[repetition][i] == repeat_y_train[repetition].to_numpy()[i]):
      if(repeat_mlp_predictions[repetition][i] == desired_idx):
        mlp_positive_idxs.append(i)
      else:
        mlp_negative_idxs.append(i)

  # Filter out the positive instances, keep the negative
  # rf_negative_instances = repeat_test_data[repetition].iloc[rf_negative_idxs]
  mlp_negative_instances = repeat_test_data[repetition].iloc[mlp_negative_idxs]

  # Filter out the negative instances, keep the positive
  # rf_positive_instances = repeat_test_data[repetition].iloc[rf_negative_idxs]
  mlp_positive_instances = repeat_test_data[repetition].iloc[mlp_positive_idxs]

  # Since all negative predictions should be 0, we can use np.zeros()
  # rf_negative_predictions = np.zeros(len(rf_negative_idxs))
  mlp_negative_predictions = np.zeros(len(mlp_negative_idxs))

  # Since all desired predictions should be 1, we can use np.ones()
  # rf_positive_predictions = np.ones(len(rf_positive_idxs))
  mlp_positive_predictions = np.ones(len(mlp_positive_idxs))

  # # Store the Random Forest positive and negative instances and predictions
  # repeat_rf_positive_instances.append(rf_positive_instances)
  # repeat_rf_negative_instances.append(rf_negative_instances)
  # repeat_rf_positive_predictions.append(rf_positive_predictions)
  # repeat_rf_negative_predictions.append(rf_negative_predictions)

  # Store the Multi-layer Perceptron positive and negative instances and predictions
  repeat_mlp_positive_instances.append(mlp_positive_instances)
  repeat_mlp_negative_instances.append(mlp_negative_instances)
  repeat_mlp_positive_predictions.append(mlp_positive_predictions)
  repeat_mlp_negative_predictions.append(mlp_negative_predictions)

  # Print the filtered set sizes
  # print(f'Number of Random Forest Positive Cases: {len(rf_positive_instances)}, Negative Cases: {len(rf_negative_instances)}')
  print(f'Number of Multi-layer Perceptron Positive Cases: {len(mlp_positive_instances)}, Negative Cases: {len(mlp_negative_instances)}\n')

Number of Multi-layer Perceptron Positive Cases: 163, Negative Cases: 150

Number of Multi-layer Perceptron Positive Cases: 147, Negative Cases: 174

Number of Multi-layer Perceptron Positive Cases: 155, Negative Cases: 158

Number of Multi-layer Perceptron Positive Cases: 159, Negative Cases: 156

Number of Multi-layer Perceptron Positive Cases: 144, Negative Cases: 148

Number of Multi-layer Perceptron Positive Cases: 152, Negative Cases: 147

Number of Multi-layer Perceptron Positive Cases: 150, Negative Cases: 164

Number of Multi-layer Perceptron Positive Cases: 151, Negative Cases: 159

Number of Multi-layer Perceptron Positive Cases: 162, Negative Cases: 148

Number of Multi-layer Perceptron Positive Cases: 144, Negative Cases: 153

Number of Multi-layer Perceptron Positive Cases: 145, Negative Cases: 156

Number of Multi-layer Perceptron Positive Cases: 149, Negative Cases: 149

Number of Multi-layer Perceptron Positive Cases: 161, Negative Cases: 159

Number of Multi-layer Per

### Selection of Counterfactual Based On Disagreement

In [10]:
repetition = 0
predict_fn = lambda instance: repeat_mlp_classifiers[repetition].predict([instance])[0]
predict_proba_fn = lambda instance: repeat_mlp_classifiers[repetition].predict_proba([instance])[0]
class_labels = repeat_y_train[repetition]
data = repeat_train_data[repetition]#.to_numpy() # Switched off to_numpy() because I used a pandas df to get  
mlp_agreeable_counterfactuals = []
mlp_agreeable_counterfactuals_labels = []
# for i in range(len(repeat_mlp_negative_instances[repetition])):
i=0
base_counterfactuals = [repeat_discern_mlp_counterfactuals[repetition][i], np.array(repeat_dice_mlp_counterfactuals[repetition][i]).astype('float'), repeat_nice_mlp_counterfactuals[repetition][i], repeat_wachter_mlp_counterfactuals[repetition][i]]
labels = ['Discern', 'Dice', 'Nice', 'Wachter']
idx=0
for base_counterfactual, base_counterfactual_label in zip(base_counterfactuals, labels):
    if all(value == 0 for value in base_counterfactual):
        base_counterfactuals.pop(idx)
        labels.pop(idx)
        print(f'Removing {base_counterfactual_label} as does not have counterfactual.')
        idx += 1

data_to_explain = repeat_mlp_negative_instances[repetition].to_numpy()[i]

### Search for Counterfactuals using NSGA-II

In [12]:
evaluations = 10000
pop_size = 200
rseed=42

# Define the problem
problem = CounterfactualConsensus(
    data_instance=data_to_explain,
    base_counterfactuals=base_counterfactuals,
    categorical_features_idxs=categorical_feature_idxs,
    immutable_features_idxs=immutable_feature_idxs,
    continuous_features_idxs=continuous_feature_idxs,
    data=data,
    predict_fn=predict_fn,        
    predict_proba_fn=predict_proba_fn,
    disagreement_method="euclidean_distance",
    seed=rseed,                     
    parallel=False,                     
    wachter=False,   
    )    

# Initialise results manager. This is used to save results. base_folder is the dir to save the results
# rm = ResultsManager(target_problem=problem, solver_name="NSGA-II", rseed=rseed, base_folder="results") 

# Initialise NSGA-II solver
# solver = NSGAII(
#     problem=problem,
#     population_size=pop_size,
#     offspring_population_size=pop_size,
#     mutation=PolynomialMutation(probability=1.0 / problem.number_of_variables(), distribution_index=20),
#     crossover=SBXCrossover(probability=0.8, distribution_index=20),
#     termination_criterion=StoppingByEvaluations(max_evaluations=evaluations))

solver = NSGAIII(
        problem=problem,
        population_size=pop_size,
        reference_directions=UniformReferenceDirectionFactory(
            problem.number_of_objectives(),
            n_points=int(pop_size / 4)),
        mutation=PolynomialMutation(
            probability=1.0 / problem.number_of_variables(),
            distribution_index=20),
        crossover=SBXCrossover(
            probability=0.9,
            distribution_index=30),        
        termination_criterion=StoppingByEvaluations(max_evaluations=evaluations))

# Initialise results manager. This is used to save results. base_folder is the dir to save the results
rm = ResultsManager(target_problem=problem, solver_name=solver.get_name(), rseed=rseed, base_folder="results") 

# Initialise progress bar to show the optimisation progress
progress_bar = ProgressBarObserver(max=evaluations)
solver.observable.register(progress_bar)

# Run the optimisation 
solver.run()
solutions = solver.get_result()
front = get_non_dominated_solutions(solutions)

# Save results
rm.save_results(front)
print("Complete")

# Optionally plot results. This will onlt plot 2 objectives. Will fix later
# plot_front = Plot(title='Pareto front approximation', axis_labels=[problem.obj_labels[0], problem.obj_labels[1]])
# plot_front.plot(front, label=f'{solver.get_name()} {problem.name()}')




X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names

[2024-01-30 13:42:55,035] [jmetal.core.algorithm] [DEBUG] Creating initial set of solutions...
[2024-01-30 13:42:55,037] [jmetal.core.algorithm] [DEBUG] Evaluating solutions...

X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, but MLPClassifier was fitted with feature names


X does not have valid feature names, bu