## Implementation of the dual gradient method for COMPAS, Adult Credit, and FICO

In [1]:
import os
import sys
import numpy as np
module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from src.util import mean_difference, get_list_of_seeds
from src.feature_map import IdentityFeatureMap
from src.functions import cost_utility
from src.plotting import plot_mean, plot_median
from src.training import train
from src.training_evaluation import UTILITY, COVARIANCE_OF_DECISION_DP
from src.policy import LogisticPolicy
from src.distribution import AdultCreditDistribution, COMPASDistribution, FICODistribution
from src.optimization import ManualGradientLagrangianOptimizationTarget

unable to import 'smart_open.gcs', disabling that module


## Fairness Functions

In [2]:
def calc_benefit(decisions, ips_weights):
    """Return the per-sample benefit, optionally scaled by IPS weights.

    Args:
        decisions: Array of decisions.
        ips_weights: Array of weights broadcastable against ``decisions``,
            or ``None`` to leave the decisions unweighted.

    Returns:
        ``decisions * ips_weights`` as a new array when weights are given,
        otherwise ``decisions`` itself.
    """
    if ips_weights is None:
        return decisions

    # Multiply out of place: an in-place ``*=`` would overwrite the caller's
    # decision array and fails for integer decisions with float weights.
    return decisions * ips_weights


def calc_covariance(s, decisions, ips_weights):
    """Return the per-sample product of the centred, re-coded ``s`` and the decisions.

    ``s`` is re-coded as ``1 - 2 * s`` and centred by its mean over axis 0.
    When ``ips_weights`` is given, both that mean and the decisions are
    computed from the weighted values.

    Args:
        s: Array holding the sensitive attribute.
        decisions: Array of decisions (or any per-sample score).
        ips_weights: Array of weights, or ``None`` for the unweighted case.

    Returns:
        Array ``(recoded_s - mean) * decisions`` with the weights applied as
        described above.
    """
    recoded_s = 1 - (2 * s)
    use_weights = ips_weights is not None

    mean_input = recoded_s * ips_weights if use_weights else recoded_s
    scaled_decisions = decisions * ips_weights if use_weights else decisions

    centre = np.mean(mean_input, axis=0)
    return (recoded_s - centre) * scaled_decisions


def fairness_function_gradient(type, **fairness_kwargs):
    """Compute the gradient of the fairness function selected by ``type``.

    The per-sample fairness term is multiplied with the policy's log-policy
    gradient and then aggregated according to ``type``.

    Args:
        type: One of ``"BD_DP"``, ``"BD_EOP"``, ``"COV_DP"`` or
            ``"COV_DP_DIST"``.
        **fairness_kwargs: Must contain the keys ``policy``, ``x``, ``s``,
            ``y``, ``decisions`` and ``ips_weights``.

    Returns:
        For ``"BD_DP"`` the result of ``mean_difference(grad, s)``; for
        ``"BD_EOP"`` the same restricted to samples with ``y == 1``; for the
        covariance types the mean of the gradient over axis 0.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
        KeyError: If a required keyword argument is missing.
    """
    # Reject unknown types before doing any work; otherwise the failure only
    # surfaces later as an UnboundLocalError on ``result``.
    if type not in ("BD_DP", "BD_EOP", "COV_DP", "COV_DP_DIST"):
        raise ValueError("Unsupported fairness type: {!r}".format(type))

    policy = fairness_kwargs["policy"]
    x = fairness_kwargs["x"]
    s = fairness_kwargs["s"]
    y = fairness_kwargs["y"]
    decisions = fairness_kwargs["decisions"]
    ips_weights = fairness_kwargs["ips_weights"]

    if type in ("BD_DP", "BD_EOP"):
        result = calc_benefit(decisions, ips_weights)
    elif type == "COV_DP":
        result = calc_covariance(s, decisions, ips_weights)
    else:
        # COV_DP_DIST: use phi(x, s) @ theta as the score instead of the
        # decisions themselves.
        phi = policy.feature_map(policy._extract_features(x, s))
        distance = np.matmul(phi, policy.theta).reshape(-1, 1)
        result = calc_covariance(s, distance, ips_weights)

    grad = policy.log_policy_gradient(x, s) * result

    if type == "BD_DP":
        return mean_difference(grad, s)
    if type == "BD_EOP":
        # Only samples with a positive label enter the difference.
        y1_indices = np.where(y == 1)
        return mean_difference(grad[y1_indices], s[y1_indices])
    # COV_DP and COV_DP_DIST share the same aggregation.
    return np.mean(grad, axis=0)


def fairness_function(type, **fairness_kwargs):
    """Evaluate the fairness function selected by ``type``.

    Args:
        type: One of ``"BD_DP"``, ``"BD_EOP"``, ``"COV_DP"`` or
            ``"COV_DP_DIST"``.
        **fairness_kwargs: Must contain the keys ``x``, ``s``, ``y``,
            ``decisions``, ``ips_weights`` and ``policy``. ``x`` and
            ``policy`` are only used for ``"COV_DP_DIST"``.

    Returns:
        For ``"BD_DP"`` the result of ``mean_difference(benefit, s)``; for
        ``"BD_EOP"`` the same restricted to samples with ``y == 1``; for the
        covariance types the mean of the covariance term over axis 0.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
        KeyError: If a required keyword argument is missing.
    """
    x = fairness_kwargs["x"]
    s = fairness_kwargs["s"]
    y = fairness_kwargs["y"]
    decisions = fairness_kwargs["decisions"]
    ips_weights = fairness_kwargs["ips_weights"]
    policy = fairness_kwargs["policy"]

    if type == "BD_DP":
        benefit = calc_benefit(decisions, ips_weights)
        return mean_difference(benefit, s)
    if type == "COV_DP":
        covariance = calc_covariance(s, decisions, ips_weights)
        return np.mean(covariance, axis=0)
    if type == "COV_DP_DIST":
        # Use phi(x, s) @ theta as the score instead of the decisions.
        phi = policy.feature_map(policy._extract_features(x, s))
        distance = np.matmul(phi, policy.theta).reshape(-1, 1)
        covariance = calc_covariance(s, distance, ips_weights)
        return np.mean(covariance, axis=0)
    if type == "BD_EOP":
        benefit = calc_benefit(decisions, ips_weights)
        # Only samples with a positive label enter the difference.
        y1_indices = np.where(y == 1)
        return mean_difference(benefit[y1_indices], s[y1_indices])

    # Fail loudly instead of silently returning None for a mistyped name.
    raise ValueError("Unsupported fairness type: {!r}".format(type))

# COMPAS

In [None]:
# Shared configuration for the COMPAS experiments: data distribution, initial
# policy, parameter-optimisation settings and evaluation measures.
bias = True
# NOTE(review): test_percentage=0.2 presumably holds out 20% of the data for
# testing -- confirm against COMPASDistribution.
distribution = COMPASDistribution(bias=bias, test_percentage=0.2)
dim_theta = distribution.feature_dim

def util_func(**util_params):
    """Return ``cost_utility`` evaluated with a fixed ``cost_factor`` of 0.5."""
    util = cost_utility(cost_factor=0.5, **util_params)
    return util

training_parameters = {
    # Logistic policy with an all-zero initial theta and an identity feature map.
    # NOTE(review): the meaning of the trailing positional ``False`` is not
    # visible here -- confirm against LogisticPolicy.
    'model': LogisticPolicy(np.zeros(dim_theta), IdentityFeatureMap(dim_theta), False),
    'distribution': distribution,
    'parameter_optimization': {
        'time_steps':20,
        'epochs': 10,
        'batch_size':128,
        'learning_rate': 0.1,
        'learn_on_entire_history': False,
        'clip_weights': True
    },
    'data': {
        'num_train_samples': 4096,
        'num_test_samples': 1024
    },
    'evaluation': {
        # Mean utility over the evaluated samples.
        UTILITY: {
            'measure_function': lambda s, y, decisions : np.mean(util_func(s=s,
                                                                           y=y,
                                                                           decisions=decisions)),
            'detailed': False
        },
        # Unweighted covariance measure; x and policy are not needed for
        # "COV_DP", so None is passed for them.
        COVARIANCE_OF_DECISION_DP: {
            'measure_function': lambda s, y, decisions : fairness_function(
                type="COV_DP",
                x=None,
                s=s,
                y=y,
                decisions=decisions,
                ips_weights=None,
                policy=None),
            'detailed': False
        }
    }
}


# Reuse previously stored seeds when available (presumably so that runs are
# reproducible); otherwise draw new seeds and store them for later runs.
seeds_path = './cluster_experiments/seeds.npz'
if os.path.isfile(seeds_path):
    # np.load keeps the archive open; copy both arrays out and close it.
    with np.load(seeds_path) as seed_file:
        seeds = {"train": seed_file["train"], "test": seed_file["test"]}
    training_parameters['data']["training_seeds"] = seeds["train"]
    training_parameters['data']["test_seed"] = seeds["test"]
else:
    train_seeds = get_list_of_seeds(200)
    test_seeds = get_list_of_seeds(1)
    seeds = {"train": train_seeds, "test": test_seeds}
    training_parameters['data']["training_seeds"] = train_seeds
    training_parameters['data']["test_seed"] = test_seeds
    # np.savez does not create missing parent directories.
    os.makedirs(os.path.dirname(seeds_path), exist_ok=True)
    np.savez(seeds_path, train=train_seeds, test=test_seeds)

### Benefit Difference: DP

In [None]:
# COMPAS, benefit difference (DP): build the Lagrangian target, train, and plot.
# The target receives the utility function, the "BD_DP" fairness function and
# its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("BD_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("BD_DP", **util_params))

training_parameters['lagrangian_optimization'] = {
    'epochs': 1,
    'batch_size':128,
    'learning_rate': 0.1,
}
training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/COMPAS/BD_DP"
# Unlike the other experiment cells in this notebook, this one runs
# synchronously with only 2 iterations.
statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=2,
    asynchronous=False)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))

### Covariance of decision: DP

In [None]:
# COMPAS, covariance of decision (DP): build the Lagrangian target, train, and
# plot. The target receives the utility function, the "COV_DP" fairness
# function and its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("COV_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("COV_DP", **util_params))

training_parameters['lagrangian_optimization'] = {
    'epochs': 50,
    'batch_size':64,
    'learning_rate': 0.00001,
}

training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/COMPAS/COV_DP"
statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=15,
    asynchronous=True)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))

# Adult Credit Data

In [None]:
# Shared configuration for the Adult Credit experiments: data distribution,
# initial policy, parameter-optimisation settings and evaluation measures.
bias = True
# NOTE(review): test_percentage=0.2 presumably holds out 20% of the data for
# testing -- confirm against AdultCreditDistribution.
distribution = AdultCreditDistribution(bias=bias, test_percentage=0.2)
dim_theta = distribution.feature_dim

def util_func(**util_params):
    """Return ``cost_utility`` evaluated with a fixed ``cost_factor`` of 0.5."""
    util = cost_utility(cost_factor=0.5, **util_params)
    return util

training_parameters = {
    # Logistic policy with an all-zero initial theta and an identity feature map.
    # NOTE(review): the meaning of the trailing positional ``False`` is not
    # visible here -- confirm against LogisticPolicy.
    'model': LogisticPolicy(np.zeros(dim_theta), IdentityFeatureMap(dim_theta), False),
    'distribution': distribution,
    'parameter_optimization': {
        'time_steps':200,
        'epochs': 50,
        'batch_size':512,
        'learning_rate': 0.05,
        'learn_on_entire_history': False,
        'clip_weights': True
    },
    'data': {
        'num_train_samples': 16384,
        'num_test_samples': 4096
    },
    'evaluation': {
        # Mean utility over the evaluated samples.
        UTILITY: {
            'measure_function': lambda s, y, decisions : np.mean(util_func(s=s,
                                                                           y=y,
                                                                           decisions=decisions)),
            'detailed': False
        },
        # Unweighted covariance measure; x and policy are not needed for
        # "COV_DP", so None is passed for them.
        COVARIANCE_OF_DECISION_DP: {
            'measure_function': lambda s, y, decisions : fairness_function(
                type="COV_DP",
                x=None,
                s=s,
                y=y,
                decisions=decisions,
                ips_weights=None,
                policy=None),
            'detailed': False
        }
    }
}

# Reuse previously stored seeds when available (presumably so that runs are
# reproducible); otherwise draw new seeds and store them for later runs.
seeds_path = './cluster_experiments/seeds.npz'
if os.path.isfile(seeds_path):
    # np.load keeps the archive open; copy both arrays out and close it.
    with np.load(seeds_path) as seed_file:
        seeds = {"train": seed_file["train"], "test": seed_file["test"]}
    training_parameters['data']["training_seeds"] = seeds["train"]
    training_parameters['data']["test_seed"] = seeds["test"]
else:
    train_seeds = get_list_of_seeds(200)
    test_seeds = get_list_of_seeds(1)
    seeds = {"train": train_seeds, "test": test_seeds}
    training_parameters['data']["training_seeds"] = train_seeds
    training_parameters['data']["test_seed"] = test_seeds
    # np.savez does not create missing parent directories.
    os.makedirs(os.path.dirname(seeds_path), exist_ok=True)
    np.savez(seeds_path, train=train_seeds, test=test_seeds)

### Benefit Difference: DP

In [None]:
# Adult Credit, benefit difference (DP): build the Lagrangian target, train,
# and plot. The target receives the utility function, the "BD_DP" fairness
# function and its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("BD_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("BD_DP", **util_params))
training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/ADULT/BD_DP"

training_parameters['lagrangian_optimization'] = {
    'epochs': 50,
    'batch_size':512,
    'learning_rate': 0.4
}

statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=15,
    asynchronous=True)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))

### Covariance of decision: DP

In [None]:
# Adult Credit, covariance of decision (DP): build the Lagrangian target,
# train, and plot. The target receives the utility function, the "COV_DP"
# fairness function and its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("COV_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("COV_DP", **util_params))
training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/ADULT/COV_DP"

training_parameters['lagrangian_optimization'] = {
    'epochs': 1,
    'batch_size':256,
    'learning_rate': 0.000001
}

statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=15,
    asynchronous=True)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))

# FICO Credit Score Data

In [3]:
# Shared configuration for the FICO experiments: data distribution, initial
# policy, parameter-optimisation settings and evaluation measures.
bias = True
distribution = FICODistribution(bias=bias, fraction_protected=0.5)
# NOTE(review): the COMPAS and Adult cells read ``distribution.feature_dim``,
# this one reads ``feature_dimension`` -- confirm that FICODistribution really
# exposes the attribute under this name.
dim_theta = distribution.feature_dimension

def util_func(**util_params):
    """Return ``cost_utility`` evaluated with a fixed ``cost_factor`` of 0.5."""
    util = cost_utility(cost_factor=0.5, **util_params)
    return util

training_parameters = {
    # Logistic policy with an all-zero initial theta and an identity feature map.
    # NOTE(review): the meaning of the trailing positional ``False`` is not
    # visible here -- confirm against LogisticPolicy.
    'model': LogisticPolicy(np.zeros(dim_theta), IdentityFeatureMap(dim_theta), False),
    'distribution': distribution,
    'parameter_optimization': {
        'time_steps':200,
        'epochs': 50,
        'batch_size':128,
        'learning_rate': 0.01,
        'learn_on_entire_history': False,
        'clip_weights': True
    },
    'data': {
        'num_train_samples': 4096,
        'num_test_samples': 1024
    },
    'evaluation': {
        # Mean utility over the evaluated samples.
        UTILITY: {
            'measure_function': lambda s, y, decisions : np.mean(util_func(s=s,
                                                                           y=y,
                                                                           decisions=decisions)),
            'detailed': False
        },
        # Unweighted covariance measure; x and policy are not needed for
        # "COV_DP", so None is passed for them.
        COVARIANCE_OF_DECISION_DP: {
            'measure_function': lambda s, y, decisions : fairness_function(
                type="COV_DP",
                x=None,
                s=s,
                y=y,
                decisions=decisions,
                ips_weights=None,
                policy=None),
            'detailed': False
        }
    }
}

# Reuse previously stored seeds when available (presumably so that runs are
# reproducible); otherwise draw new seeds and store them for later runs.
seeds_path = './cluster_experiments/seeds.npz'
if os.path.isfile(seeds_path):
    # np.load keeps the archive open; copy both arrays out and close it.
    with np.load(seeds_path) as seed_file:
        seeds = {"train": seed_file["train"], "test": seed_file["test"]}
    training_parameters['data']["training_seeds"] = seeds["train"]
    training_parameters['data']["test_seed"] = seeds["test"]
else:
    train_seeds = get_list_of_seeds(200)
    test_seeds = get_list_of_seeds(1)
    seeds = {"train": train_seeds, "test": test_seeds}
    training_parameters['data']["training_seeds"] = train_seeds
    training_parameters['data']["test_seed"] = test_seeds
    # np.savez does not create missing parent directories.
    os.makedirs(os.path.dirname(seeds_path), exist_ok=True)
    np.savez(seeds_path, train=train_seeds, test=test_seeds)

### Benefit Difference: DP

In [4]:
# FICO, benefit difference (DP): build the Lagrangian target, train, and plot.
# The target receives the utility function, the "BD_DP" fairness function and
# its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("BD_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("BD_DP", **util_params))

training_parameters['lagrangian_optimization'] = {
    'epochs': 50,
    'batch_size':64,
    'learning_rate': 0.001,
}
training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/FICO/BD_DP"
# The output recorded for this cell ends in a KeyboardInterrupt raised while
# waiting on the worker pool, so the plots below were not produced in that run.
statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=15,
    asynchronous=True)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))

## STARTED // LR = 0.01 // TS = 200 // E = 50 // BS = 128 // FR = 0.0 ##
Traceback (most recent call last):
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-3b6720e9bff1>", line 16, in <module>
    asynchronous=True)
  File "/home/fkretschmar/Documents/master-thesis/src/training.py", line 246, in train
    asynchronous)
  File "/home/fkretschmar/Documents/master-thesis/src/training.py", line 199, in train_over_iterations
    results_per_iterations.append(result.get())
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/multiprocess/pool.py", line 651, in get
    self.wait(timeout)
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/multiprocess/pool.py", line 648, in wait
    self._event.wait(timeout)
  File "/home/fkretschmar/miniconda3/envs/masters-

Process ForkPoolWorker-7:
Process ForkPoolWorker-16:
Process ForkPoolWorker-15:
Process ForkPoolWorker-5:
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
Process ForkPoolWorker-14:
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-6:
Process ForkPoolWorker-13:
Process ForkPoolWorker-8:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
  File "/home/fkretschmar/miniconda3/envs/masters-thesis/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent c

KeyboardInterrupt: 

### Covariance of decision: DP

In [None]:
# FICO, covariance of decision (DP): build the Lagrangian target, train, and
# plot. The target receives the utility function, the "COV_DP" fairness
# function and its manually implemented gradient.
# NOTE(review): the leading 0.0 is presumably the initial Lagrange multiplier
# -- confirm against ManualGradientLagrangianOptimizationTarget.
training_parameters["optimization_target"] = \
    ManualGradientLagrangianOptimizationTarget(0.0,
                                            lambda **util_params: cost_utility(cost_factor=0.5, **util_params),
                                            lambda **util_params: fairness_function("COV_DP", **util_params),
                                            lambda **util_params: fairness_function_gradient("COV_DP", **util_params))

training_parameters['lagrangian_optimization'] = {
    'epochs': 50,
    'batch_size':128,
    'learning_rate': 0.0001,
}

training_parameters["save_path"] = "../res/local_experiments/DUAL_GRADIENT/FICO/COV_DP"
statistics, model_parameters, run_path = train(
    training_parameters,
    iterations=15,
    asynchronous=True)

# One x value per time step plus one extra point (presumably the evaluation
# before the first update -- confirm against train).
plot_mean(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_mean_time.png".format(run_path))
# Same measures again, plotted with plot_median into a separate file.
plot_median(x_values=range(training_parameters["parameter_optimization"]["time_steps"] + 1),
          x_label="Time steps",
          x_scale="linear",
          performance_measures=[statistics.get_additonal_measure(UTILITY, "Utility"),
                                statistics.demographic_parity(),
                                statistics.equality_of_opportunity(),
                                statistics.get_additonal_measure(COVARIANCE_OF_DECISION_DP,
                                                              "Covariance of Decision (DP)")],
          fairness_measures=[],
          file_path="{}/results_median_time.png".format(run_path))