In [None]:
# coding=utf-8
#
# The copyright of this file belongs to Feedzai. The file cannot be
# reproduced in whole or in part, stored in a retrieval system,
# transmitted in any form, or by any means electronic, mechanical,
# photocopying, or otherwise, without the prior permission of the owner.
#
# (c) 2022 Feedzai, Strictly Confidential

In [None]:
import lightgbm as lgbm  # Tested ML method
import xgboost as xgb
import numpy as np       # Random number generation
import seaborn as sns    # Plotting library
import pandas as pd      # Read/write data
import yaml              # Read hyperparameter space configuration

from aequitas.group import Group                # Fairness metrics
from matplotlib import pyplot as plt            # Plotting method
from sklearn.preprocessing import LabelEncoder  # Categorical encoding for LGBM models
from sklearn import metrics                     # ROC metrics
from random_search import RandomValueTrial, suggest_callable_hyperparams  # Random search wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the hyperparameter search space of every algorithm from its YAML file.
# `hyperparam_spaces` follows the same order as `ALGORITHMS`.
ALGORITHMS = ["LGBM", "XGB", "LR", "RF", "DT"]

hyperparam_spaces = []
for algo_name in ALGORITHMS:
    space_path = f"../hyperparameter_spaces/{algo_name}.yaml"
    with open(space_path, "r") as space_file:
        # NOTE(review): `yaml.safe_load` would be enough if the spaces only use plain YAML types - confirm.
        space = yaml.load(space_file, Loader=yaml.FullLoader)
    hyperparam_spaces.append(space)

In [None]:
# Read the desired dataset. To reproduce each scenario in the paper, choose:
#   - Base dataset for the baseline
#   - Variant I for Scenario 1
#   - Variant II for Scenario 2
#   - Variant III for Scenario 3
#   - Variant V for Scenario 5
# Reproducibility for Scenarios 4 and 6 is a work in progress.
# The path below is a placeholder: replace it with the location of the chosen CSV file.
dataset = pd.read_csv("</path/to/dataset>")

In [None]:
# Name of the label (target) column.
label = "fraud_bool"

# Names of the categorical columns of the dataset.
categorical_features = [
    "payment_type", "employment_status", "housing_status", "source", "device_os",
]

In [None]:
# Create the train and test sets. Shuffle data with `sample` method.
# The split was done by month. The first 6 months as the train, the last 2 months as test.
# The shuffle is given an explicit `random_state`: the global NumPy seed is only set in a
# later cell, so without it the row order would change on every run of the notebook.
SHUFFLE_SEED = 42
train_df = dataset[dataset["month"] < 6].sample(frac=1, replace=False, random_state=SHUFFLE_SEED)
test_df = dataset[dataset["month"] >= 6].sample(frac=1, replace=False, random_state=SHUFFLE_SEED)

In [None]:
# Integer-encode the categorical columns of both splits.
# This is expected by LGBM (or columns with the `categorical` data type).
for column in categorical_features:
    label_encoder = LabelEncoder()
    # Learn the category -> integer mapping on the train set and encode it in one step.
    train_df[column] = label_encoder.fit_transform(train_df[column])
    # Reuse the mapping learned on the train set to encode the test set.
    test_df[column] = label_encoder.transform(test_df[column])


In [None]:
# Cell with train loop.
# For every algorithm, runs `n_trials` random-search trials: sample hyperparameters,
# fit on the train set, score the test set, and record recall plus group-wise ratios.

# Define number of trials in Random search.
n_trials = 50
# Seeds for the random search sampling algorithm are the same as the paper.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
seeds = np.random.randint(10**6, size=n_trials)

# FPR bound used to select the decision threshold (5%, as in the paper).
TARGET_FPR = 0.05

# Classifier class of each entry in ALGORITHMS (same order).
CLASSIFIERS = [
    lgbm.LGBMClassifier,
    xgb.XGBClassifier,
    LogisticRegression,
    RandomForestClassifier,
    DecisionTreeClassifier,
]

# The feature/label split is identical for every trial, so it is built once.
X_train = train_df.drop(columns=[label])
y_train = train_df[label]
X_test = test_df.drop(columns=[label])
y_test = test_df[label]

# Protected group of every test instance.
# Here, we select age>=50 as threshold as in the paper.
age_group = (X_test["customer_age"] >= 50).map({True: "Older", False: "Younger"})

# Variable to store the results: algorithm name -> list with one results dict per trial.
# Every list has `n_trials` entries, so `runs` can be turned into a DataFrame directly.
runs = {}

for algo, hyperparam_space, clf in zip(ALGORITHMS, hyperparam_spaces, CLASSIFIERS):
    # Not every estimator has an `n_jobs` parameter (DecisionTreeClassifier does not),
    # so it is only passed to the classes that expose it.
    parallel_kwargs = {"n_jobs": -1} if "n_jobs" in clf().get_params() else {}
    runs[algo] = []

    for seed in seeds:
        rs_trial = RandomValueTrial(seed=seed)
        # Hyperparameters for the random search trial.
        test_hyperparams = suggest_callable_hyperparams(rs_trial, hyperparam_space)
        test_hyperparams.pop("classpath", None)  # Remove unnecessary key in hyperparameters.

        # Instantiate model. Sampled hyperparameters take precedence over the defaults above.
        model = clf(**{**parallel_kwargs, **test_hyperparams})

        # Fit model to training data.
        if algo == "LGBM":
            model.fit(X_train, y_train, categorical_feature=categorical_features)
        else:
            model.fit(X_train, y_train)
        # Obtain predictions in test data.
        predictions = model.predict_proba(X_test)[:, 1]

        # Obtain ROC curve for the predictions.
        fprs, tprs, thresholds = metrics.roc_curve(y_test, predictions)
        # Operating point: the highest FPR that is still below the target.
        at_target = fprs == fprs[fprs < TARGET_FPR].max()
        threshold = thresholds[at_target].min()
        recall = tprs[at_target].max()

        # Binarize predictions for Aequitas.
        # `roc_curve` computes each point with `score >= threshold`, so `>=` is required
        # for the binary predictions to match the recall stored above.
        preds_binary = (predictions >= threshold).astype(int)

        # Create a dataframe with protected group column, predictions and labels.
        aequitas_df = pd.DataFrame(
            {
                "age": age_group,
                "preds": preds_binary,
                "y": y_test.values,
            }
        )

        # Obtain FPR results for different groups.
        g = Group()
        aequitas_results = g.get_crosstabs(aequitas_df, attr_cols=["age"], score_col="preds", label_col="y")[0]
        is_older = aequitas_results["attribute_value"] == "Older"
        is_younger = aequitas_results["attribute_value"] == "Younger"

        # Store the results for the trained model, together with the hyperparameters
        # that produced them. Recall is the performance metric used throughout the paper.
        results = {"hyperparams": test_hyperparams, "recall": recall}
        # In the paper, we also collect group-wise comparisons of FNR and Precision.
        for m in ["fpr", "fnr", "ppv"]:
            m_older = aequitas_results.loc[is_older, m].iloc[0]
            m_young = aequitas_results.loc[is_younger, m].iloc[0]
            # Ratio between the lower and the higher group value.
            results[f"{m}_ratio"] = min(m_older, m_young) / max(m_older, m_young)

        # Store the results in the runs variable.
        runs[algo].append(results)

In [None]:
# Create a dataframe with the results for each model: one column per algorithm,
# one row per trial, each cell holding the results dict of that trial.
# Only the per-algorithm entries of `runs` are used: an entry accumulated across all
# algorithms (such as a "hyperparams" list) has a different length, which makes the
# DataFrame constructor raise.
rs_results = pd.DataFrame(
    {algo_name: runs[algo_name] for algo_name in ALGORITHMS if algo_name in runs}
)

In [None]:
# Helper method to obtain the metric values for a given model.
def get_results(results, metric, algo_name):
    """Collect the values of one metric across all trials of an algorithm.

    Args:
        results: DataFrame with one column per algorithm; each cell is a
            mapping from metric name to value.
        metric: Key of the metric to extract from each cell.
        algo_name: Name of the column (algorithm) to read.

    Returns:
        List with the metric value of each row, in row order.

    Raises:
        KeyError: If `algo_name` is not a column or a cell lacks `metric`.
    """
    # Iterating the column directly avoids `Series.iteritems`, removed in pandas 2.0.
    return [trial_results[metric] for trial_results in results[algo_name]]

In [None]:
# Obtain the relevant metrics to plot from the dataframe.
FAIRNESS_METRIC_TO_PLOT = "fpr_ratio"
plot_results = {"Algorithm": [], "Recall": [], "FPR Ratio": []}

for a in ALGORITHMS:
    # Metric values of this algorithm only (one per trial).
    algo_recalls = get_results(rs_results, "recall", a)
    algo_fairness = get_results(rs_results, FAIRNESS_METRIC_TO_PLOT, a)

    plot_results["Recall"] += algo_recalls
    plot_results["FPR Ratio"] += algo_fairness
    # Extend (not overwrite) the labels so every row keeps the name of its own algorithm.
    plot_results["Algorithm"] += [a] * len(algo_fairness)

# Create a dataframe for easier plots.
plot_results = pd.DataFrame(plot_results)

In [None]:
# Create a plot with the full results of the random search algorithm:
# one point per trained model, coloured by algorithm.
sns.set()
sns.set_style("whitegrid", {"grid.linestyle": "--"})

sns.jointplot(data=plot_results, x="Recall", y="FPR Ratio", hue="Algorithm")
# Set the y-axis limits to (0, 1) on the current axes; must run after the plot is created.
plt.ylim((0,1))

The plot output by the cell above should be an example of one of the experiments run in the paper, covering all the algorithms and hyperparameter configurations used.