# Fair and Robust Sample Selection on the Synthetic Dataset
## With Label Flipping

#### This Jupyter Notebook simulates the proposed fair and robust sample selection on the synthetic data.
#### We use two fairness metrics: equalized odds and demographic parity.

## Import libraries

In [2]:
!pwd

/Users/lodino/Desktop/work/workspace/newest_data_error/data-err-experiment/robust_algorithms/fair-robust-selection


In [3]:
import sys, os
import numpy as np
import math
import random
import itertools
import copy

from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
import torch

from models import LogisticRegression, weights_init_normal, test_model
from FairRobustSampler import FairRobust, CustomDataset

from argparse import Namespace

import warnings
warnings.filterwarnings("ignore")

In [4]:
os.chdir('../../')
from load_dataset import load

## Load and process the data
In the synthetic_data directory, there are a total of 11 numpy files including training data (both clean and noisy), validation data, and test data. Note that the validation data is utilized for another method in the paper (i.e., FR-Train), so the data is not used in this program.

In [4]:
from API_Design_a import MissingValueError, SamplingError, LabelError, Injector
# create pattern function given subpopulation
def create_pattern(col_list, lb_list, ub_list):
    # Check if inputs are valid
    try:
        assert len(col_list) == len(lb_list) == len(ub_list)
    except:
        print(col_list, lb_list, ub_list)
        raise SyntaxError

    def pattern(data_X, data_y):
        # Initialize a mask of all True values
        mask = np.ones(len(data_X), dtype=bool)

        # Iterate over each condition in col_list, lb_list, and ub_list
        for col, lb, ub in zip(col_list, lb_list, ub_list):
            if col == 'Y':
                mask &= (data_y >= lb) & (data_y <= ub)
            else:
                mask &= (data_X[col] >= lb) & (data_X[col] <= ub)

        # Convert Boolean mask to binary indicators (1 for True, 0 for False)
        binary_indicators = mask.astype(int)
        
        return binary_indicators

    return pattern


# lb_list = [0, 0]
# ub_list = [0, 0]
# X_train, X_test, y_train, y_test = load(dataset)
# X_train_orig, X_test_orig = X_train.copy(), X_test.copy()

# X_train_orig.reset_index(drop=True, inplace=True)
# X_test_orig.reset_index(drop=True, inplace=True)
# y_train.reset_index(drop=True, inplace=True)
# y_test.reset_index(drop=True, inplace=True)

# mv_pattern = create_pattern(['race', 'Y'], lb_list, ub_list)
# mv_pattern_len = np.sum(mv_pattern(X_train_orig, y_train))
# mv_num = min(mv_pattern_len, int(0.4*len(X_train_orig)))
# mv_err = MissingValueError(list(X_train_orig.columns).index('race'), mv_pattern, mv_num / mv_pattern_len)

# injecter = Injector(error_seq=[mv_err])
# dirty_X_train_orig, dirty_y_train, _, _ = injecter.inject(X_train_orig.copy(), y_train.copy(), 
#                                                           X_train_orig, y_train, seed=seed)

In [5]:
X_train, X_test, y_train, y_test = load('adult')

lb_list = [6, 1, 0]
ub_list = [8, 1, 0]

mv_pattern = create_pattern(['education', 'gender', 'Y'], lb_list, ub_list)
mv_pattern_len = np.sum(mv_pattern(X_train, y_train))
poi_ratio = 0.1
mv_num = min(mv_pattern_len, int(poi_ratio*len(X_train)))
mv_err = LabelError(mv_pattern, mv_num / mv_pattern_len)
injector = Injector(error_seq=[mv_err])
X_train, y_train, _, _ = injector.inject(X_train, y_train, X_train, y_train, seed=0)

y_train = y_train.replace({0: -1, 1: 1})
y_test = y_test.replace({0: -1, 1: 1})

xz_train = X_train.copy()
z_train = X_train.gender.copy()
y_noise = y_train.copy()

xz_test = X_test.copy()
z_test = X_test.gender.copy()

# y_train = y_train*2-1
# y_test = y_test*2-1

xz_train = torch.FloatTensor(xz_train.to_numpy())
y_train = torch.FloatTensor(y_train.to_numpy())
z_train = torch.FloatTensor(z_train.to_numpy())

y_noise = torch.FloatTensor(y_noise.to_numpy())

xz_test = torch.FloatTensor(xz_test.to_numpy())
y_test = torch.FloatTensor(y_test.to_numpy())
z_test = torch.FloatTensor(z_test.to_numpy())

# os.chdir('robust_algorithms/fair-robust-selection')
# xz_train = np.load('./synthetic_data/xz_train.npy')
# y_train = np.load('./synthetic_data/y_train.npy')
# z_train = np.load('./synthetic_data/z_train.npy')
# 
# y_noise = np.load('./synthetic_data/y_noise_general.npy') # Labels with the general label flipping (details are in the paper)
# poi_ratio = 0.0
# 
# xz_test = np.load('./synthetic_data/xz_test.npy')
# y_test = np.load('./synthetic_data/y_test.npy') 
# z_test = np.load('./synthetic_data/z_test.npy')
# 
# xz_train = torch.FloatTensor(xz_train)
# y_train = torch.FloatTensor(y_train)
# z_train = torch.FloatTensor(z_train)
# 
# y_noise = torch.FloatTensor(y_noise)
# 
# xz_test = torch.FloatTensor(xz_test)
# y_test = torch.FloatTensor(y_test)
# z_test = torch.FloatTensor(z_test)
# os.chdir('../../')

In [6]:
print("---------- Number of Data ----------" )
print(
    "Train data : %d, Test data : %d "
    % (len(y_train), len(y_test))
)       
print("------------------------------------")

---------- Number of Data ----------
Train data : 30162, Test data : 15060 
------------------------------------


## Training function

In [7]:
def run_epoch(model, train_features, labels, optimizer, criterion):
    """Trains the model with the given train data.

    Args:
        model: A torch model to train.
        train_features: A torch tensor indicating the train features.
        labels: A torch tensor indicating the true labels.
        optimizer: A torch optimizer.
        criterion: A torch criterion.

    Returns:
        loss values.
    """
    
    optimizer.zero_grad()

    label_predicted = model.forward(train_features)
    loss  = criterion((F.tanh(label_predicted.squeeze())+1)/2, (labels.squeeze()+1)/2)
    loss.backward()

    optimizer.step()
    
    return loss.item()

# 1. Fair and Robust Sample Selection w.r.t. Equalized Odds
### The results are in the Experiments section of the paper.

In [7]:
full_tests = []

parameters = Namespace(warm_start=100, tau=1-poi_ratio, alpha = 0.001, batch_size = 100)

# Set the train data
train_data = CustomDataset(xz_train, y_noise, z_train)

seed  = 0
# ---------------------
#  Initialize model, optimizer, and criterion
# ---------------------

model = LogisticRegression(xz_train.shape[1],1)

torch.manual_seed(seed)
model.apply(weights_init_normal)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.999))
criterion = torch.nn.BCELoss()

losses = []

# ---------------------
#  Define FairRobust and DataLoader
# ---------------------

sampler = FairRobust (model, train_data.x, train_data.y, train_data.z, target_fairness = 'eqodds', parameters = parameters, replacement = False, seed = seed)
train_loader = torch.utils.data.DataLoader (train_data, sampler=sampler, num_workers=0)

# ---------------------
#  Model training
# ---------------------
for epoch in range(400):
    print(epoch, end="\r")
    
    tmp_loss = []
    
    for batch_idx, (data, target, z) in enumerate (train_loader):
        loss = run_epoch (model, data, target, optimizer, criterion)
        tmp_loss.append(loss)
    
    losses.append(sum(tmp_loss)/len(tmp_loss))

tmp_test = test_model(model, xz_test, y_test, z_test)
full_tests.append(tmp_test)

# print("  Test accuracy: {}, EO disparity: {}".format(tmp_test['Acc'], tmp_test['EqOdds_diff']))
print("----------------------------------------------------------------------")

  Test accuracy: 0.756905734539032, EO disparity: 0.11056257194221342
----------------------------------------------------------------------


In [8]:
pred_digits = model(xz_test).detach().numpy()
from sklearn.metrics import roc_auc_score
roc_auc_score((y_test + 1) / 2, 1/(1+np.exp(-pred_digits)))

0.8280654739246289

In [9]:
# measure equal opportunity, i.e. difference in true positive rates for the two groups
tpr_privileged = np.mean((pred_digits>0)[X_test.gender == 1])
tpr_unprivileged = np.mean((pred_digits>0)[X_test.gender == 0])
eq_opp = tpr_privileged - tpr_unprivileged
eq_opp

0.2694776165494446

# AIF360 fairness algorithms

### Reweighing

In [5]:
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset
import pandas as pd

# use the same dataset
privileged_groups = [{'gender': 1}]
unprivileged_groups = [{'gender': 0}]
X_train_reweighed = BinaryLabelDataset(df=pd.concat([X_train, pd.Series(y_train.detach().numpy(), name='Y')], axis=1), 
                                       label_names=['Y'], protected_attribute_names=['gender'],
                                       favorable_label=1, unfavorable_label=-1)

RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
RW.fit(X_train_reweighed)
X_train_reweighed = RW.transform(X_train_reweighed)

SystemError: initialization of _internal failed without raising an exception

In [None]:
from sklearn.linear_model import LogisticRegression as SKLR
clf = SKLR(random_state=42)
clf.fit(X_train, y_train, sample_weight=X_train_reweighed.instance_weights)
y_pred = clf.predict(X_test)

# measure equal opportunity
tpr_privileged = np.mean((y_pred>0.5)[X_test.gender == 1])
tpr_unprivileged = np.mean((y_pred>0.5)[X_test.gender == 0])
eq_opp = tpr_privileged - tpr_unprivileged

eq_opp, roc_auc_score(y_test, y_pred)


### LFR

In [None]:
from aif360.algorithms.preprocessing.lfr import LFR

# use the same dataset
privileged_groups = [{'gender': 1}]
unprivileged_groups = [{'gender': 0}]
X_train_wrapped = BinaryLabelDataset(df=pd.concat([X_train, pd.Series(y_train.detach().numpy(), name='Y')], axis=1), 
                                     label_names=['Y'], protected_attribute_names=['gender'],
                                     favorable_label=1, unfavorable_label=-1)
X_train_wrapped = BinaryLabelDataset(df=pd.concat([X_train, pd.Series(y_train.detach().numpy(), name='Y')], axis=1), 
                                     label_names=['Y'], protected_attribute_names=['gender'],
                                     favorable_label=1, unfavorable_label=-1)
preproc = LFR(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups, k=10, Ax=0.1, Ay=2.0, Az=1.0, verbose=1, seed=1)
preproc.fit(X_train_wrapped, maxiter=3000, maxfun=3000)
X_train_transformed = preproc.transform(X_train_wrapped)


In [None]:
clf = SKLR(random_state=42)
clf.fit(X_train_transformed.to_dataframe()[0], y_train)
y_pred = clf.predict(X_test)

# measure equal opportunity
tpr_privileged = np.mean((y_pred>0.5)[X_test.gender == 1])
tpr_unprivileged = np.mean((y_pred>0.5)[X_test.gender == 0])
eq_opp = tpr_privileged - tpr_unprivileged

eq_opp, roc_auc_score(y_test, y_pred)


# Fair and Robust Sample Selection w.r.t. Demographic Parity
### The results are in the Experiments section of the paper.

In [8]:
full_tests = []

parameters = Namespace(warm_start=100, tau=1-poi_ratio, alpha = 0.001, batch_size = 100)

# Set the train data
train_data = CustomDataset(xz_train, y_noise, z_train)

seeds = [0,1,2,3,4]

for seed in seeds:
    
    print("< Seed: {} >".format(seed))
    
    # ---------------------
    #  Initialize model, optimizer, and criterion
    # ---------------------
    
    model = LogisticRegression(3,1).cuda()

    torch.manual_seed(seed)
    model.apply(weights_init_normal)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.999))
    criterion = torch.nn.BCELoss()

    losses = []
    
    # ---------------------
    #  Define FairRobust and DataLoader
    # ---------------------

    sampler = FairRobust (model, train_data.x, train_data.y, train_data.z, target_fairness = 'dp', parameters = parameters, replacement = False, seed = seed)
    train_loader = torch.utils.data.DataLoader (train_data, sampler=sampler, num_workers=0)

    # ---------------------
    #  Model training
    # ---------------------
    for epoch in range(400):
        print(epoch, end="\r")
        
        tmp_loss = []
        
        for batch_idx, (data, target, z) in enumerate (train_loader):
            loss = run_epoch (model, data, target, optimizer, criterion)
            tmp_loss.append(loss)
            
        losses.append(sum(tmp_loss)/len(tmp_loss))
        
    tmp_test = test_model(model, xz_test, y_test, z_test)
    full_tests.append(tmp_test)
    
    print("  Test accuracy: {}, DP disparity: {}".format(tmp_test['Acc'], tmp_test['DP_diff']))
    print("----------------------------------------------------------------------")

< Seed: 0 >
  Test accuracy: 0.7190000414848328, DP disparity: 0.005151515151515129
----------------------------------------------------------------------
< Seed: 1 >
  Test accuracy: 0.7200000286102295, DP disparity: 0.00615151515151513
----------------------------------------------------------------------
< Seed: 2 >
  Test accuracy: 0.7210000157356262, DP disparity: 0.00410101010101005
----------------------------------------------------------------------
< Seed: 3 >
  Test accuracy: 0.7200000286102295, DP disparity: 0.00615151515151513
----------------------------------------------------------------------
< Seed: 4 >
  Test accuracy: 0.718000054359436, DP disparity: 0.0072020202020202095
----------------------------------------------------------------------


In [9]:
tmp_acc = []
tmp_dp = []
for i in range(len(seeds)):
    tmp_acc.append(full_tests[i]['Acc'])
    tmp_dp.append(full_tests[i]['DP_diff'])

print("Test accuracy (avg): {}".format(sum(tmp_acc)/len(tmp_acc)))
print("DP disparity  (avg): {}".format(sum(tmp_dp)/len(tmp_dp)))

Test accuracy (avg): 0.7196000337600708
DP disparity  (avg): 0.00575151515151513
