In [1]:
import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import MinMaxScaler
import xgboost

import gower

from utils import encode_features, get_train_test_data, train_model, evaluate_model, generate_individual, epsilon_rounding, get_relevant_candidates

optuna.logging.set_verbosity(optuna.logging.WARNING)

## Data

In [19]:
def load_data(data_filepath="../data/Loan_data_extracted.csv"):
    """
    Input: path to .csv data file

    TODO: specify in feature_info whether features are of type:
        fixed, meaning cannot change for the counterfactual
        unique, meaning can only take existing categorical values
        increase, meaning their value can only increase and not decrease
        range, meaning their new value can take a range of values

    Returns:
        dataframe and feature configuration dictionary
    """
    df = pd.read_csv(data_filepath)
    df = df.drop('Loan_ID', axis=1)
    df = df.dropna()

    feature_config = {
        "categorical": ["Gender", "Married", "Education", "Self_Employed", "Property_Area", "Loan_Status"],

        "feature_info": [
            ('Gender', 'unique'),
            ('Married', 'unique'),
            ('Dependents', 'fixed'),
            ('Education', 'unique'),
            ('Self_Employed', 'unique'),
            ('ApplicantIncome', 'range'),
            ('CoapplicantIncome', 'range'),
            ('LoanAmount', 'range'),
            ('Loan_Amount_Term', 'range'),
            ('Credit_History', 'unique'),
            ('Property_Area', 'unique'),
        ],

        "categorical_features": ["Gender", "Married", "Education", "Self_Employed", "Property_Area"]
    }

    return df, feature_config

## Model

In [20]:
# Load the model from the saved file
model = xgboost.XGBClassifier()
model.load_model("../models/xgboost_model.json")

## TODO: Code for counterfactual search

In [21]:
def misfit(x_prime, y_target, model):
    """
    Optimisation criterion 1
    Calculate absolute difference between y_target and y_prime_prediction.

    This measures the desirability of the counterfactual
    """

    #TODO
    y_predict = model.predict(x_prime)
    closeness = abs(y_predict - y_target)

    return closeness

In [22]:
def distance(X, x, x_prime, numerical, categorical):
    """
    Optimisation criterion 2
    Calculate distance between x_prime and x.

    This measures the closeness of the counterfactual
    """
    # Normalize data
    scaler = MinMaxScaler()
    scaler.fit(X[numerical])
    x_normalized = scaler.transform(x[numerical])
    x_prime_normalized = scaler.transform(x_prime[numerical])

    # Compute distances
    #TODO
    gower_distance_matrix = gower.gower_matrix(np.vstack((
        x_normalized, 
        x_prime_normalized
        )))
    distance = gower_distance_matrix[0, 1]

    return distance

In [23]:
def sparsity(x, x_prime):
    """
    Optimisation criterion 3
    Return number of unchanged features.

    This measures the sparsity of changes producing the counterfactual
    """
    #TODO

    idx_no_diff = np.where(x == x_prime)[0]
    num_no_diff = len(idx_no_diff)

    return num_no_diff

In [24]:
def closest_real(X, x_prime, categorical, numerical):
    """
    Optimisation criterion 4
    Return the minimum distance between x_prime and any point in X.

    This measures the actionability of the counterfactual
    """
    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X[numerical])
    x_prime_normalized = scaler.transform(x_prime[numerical])

    # Compute total distance
    #TODO
    category_idxs = np.where(X[:, -1] == categorical)
    anchor_idx = np.random.choice(category_idxs, size=(1,))
        
    # distance = (np.sqrt(np.sum(x_prime - X[anchor_idx])**2))     # Euclidean distance between x_prime and any point in x
    gower_dist_matrix =  gower.gower_matrix(np.vstack((            # Gower distance between x_prime and any point in x
        x_prime,
        X[anchor_idx]
    )))
    distance = gower_dist_matrix[0, 1]

    return distance

In [25]:
def objective(trial, X, x, features, model, y_target, numerical, categorical):
    x_prime = x.copy()
    for feature in features:
        feature.sample(trial)
        x_prime[feature.name] = feature.value
    epsilon_rounding(x, x_prime, 1e-1)

    obj1 = misfit(x_prime, y_target, model)
    obj2 = distance(X, x, x_prime, numerical, categorical)
    obj3 = sparsity(x, x_prime)
    obj4 = closest_real(X, x_prime, categorical, numerical)

    return obj1, obj2, obj3, obj4

In [26]:
def get_counterfactuals(X, x, y_target, model,
                        numerical, categorical, features,
                        tol, optimization_steps, timeout):

    study = optuna.create_study(directions=['minimize', 'minimize', 'maximize', 'minimize'],
                                sampler=optuna.samplers.NSGAIISampler(seed=42))

    study.optimize(lambda trial: objective(trial, X, x, features, model,
                                           y_target,
                                           numerical,
                                           categorical),
                   n_trials=optimization_steps,
                   timeout=timeout)

    candidates_df = get_relevant_candidates(study, x, model, y_target, tol)

    return candidates_df

## Provided datapoint and data

In [27]:
X_obs, feat_conf = load_data("../data/Loan_data_extracted.csv")
X_obs = encode_features(X_obs, feat_conf["categorical"])

--- 
Encoded categorical features as follows:
Gender :  {'Female': 0, 'Male': 1}
Married :  {'No': 0, 'Yes': 1}
Education :  {'Not Graduate': 0, 'Graduate': 1}
Self_Employed :  {'No': 0, 'Yes': 1}
Property_Area :  {'Rural': 0, 'Semiurban': 1, 'Urban': 2}
Loan_Status :  {'N': 0, 'Y': 1}
---


In [28]:
customer = np.array([0,1,0,0,0,2000,1500,1000,480,0,1])
x = pd.DataFrame([customer], columns=X_obs.columns[:-1].tolist())

In [29]:
# Check that our customer x did not get the loan

# TODO
y_pred = model.predict_proba(x)[:, 1].item()
print(f"Probability of user x getting loan: {y_pred:.4f}")

# and help her find out what she has to do in order to get the loan
# If you have implemented everything above correctly, the code below
# will find the counterfactuals

Probability of user x getting loan: 0.0030


## Search for counterfactuals

In [30]:
feat_conf["feature_info"]

[('Gender', 'unique'),
 ('Married', 'unique'),
 ('Dependents', 'fixed'),
 ('Education', 'unique'),
 ('Self_Employed', 'unique'),
 ('ApplicantIncome', 'range'),
 ('CoapplicantIncome', 'range'),
 ('LoanAmount', 'range'),
 ('Loan_Amount_Term', 'range'),
 ('Credit_History', 'unique'),
 ('Property_Area', 'unique')]

In [31]:
# Make a list of Feature objects containing information about how
# each feature is allowed to change when generating counterfactuals
change_features = generate_individual(X_obs, x, feat_conf["feature_info"])

In [32]:
# Set the desired new model prediction
y_CF = 0.7
print(f"Searching for counterfactuals with y_CF = {y_CF}...\n")
numerical_features = [x for x in df.columns if x not in feat_conf["categorical"]]
CFS = get_counterfactuals(X_obs, x, y_CF, model,
                          numerical_features,
                          feat_conf["categorical_features"],
                          change_features,
                          tol=0.05,
                          optimization_steps=500,
                          timeout=None)

Searching for counterfactuals with y_CF = 0.7...



NameError: name 'df' is not defined

In [None]:
x

In [None]:
CFS