# Accuracy vs. Hamming Loss Trials

This notebook walks through sample code for testing the difference between optimizing for accuracy and optimizing for Hamming loss. Note that this notebook uses pre-mined rules (included in the `rules` folder) and our pre-split, binarized data (included in the `data_split` folder).

In [None]:
import pandas as pd
import numpy as np
import time
from master_model import *
from DNFRuleModel import DNFRuleModel
from scipy.stats import bernoulli
from fairness_modules import *
import seaborn as sns

### Experiments

In [None]:
#Helper functions for computing fairness/accuracy
def compute_TPR_GAP(preds, Y, group):
    """Compute the true positive rate overall, per group, and the group gap.

    Args:
        preds: Binary predictions, one per sample (bool or 0/1).
        Y: Binary ground-truth labels, one per sample (bool or 0/1).
        group: Binary group-membership indicator, one per sample (bool or 0/1).

    Returns:
        dict with keys 'TPR' (all positives), 'TPR_1' (positives inside
        ``group``), 'TPR_2' (positives outside ``group``) and 'TPR_GAP'
        (absolute difference between 'TPR_1' and 'TPR_2'). A rate is NaN
        when there are no positive samples to compute it over.
    """
    # Coerce everything to positional boolean arrays. With integer 0/1 inputs
    # `preds[Y]` would be integer fancy-indexing and `~group` a bitwise NOT
    # (-1/-2), both of which silently produce wrong rates.
    preds = np.asarray(preds).astype(bool)
    Y = np.asarray(Y).astype(bool)
    group = np.asarray(group).astype(bool)

    def _positive_rate(mask):
        # Share of positive predictions among the samples selected by `mask`;
        # NaN instead of a ZeroDivisionError when the selection is empty.
        return preds[mask].mean() if mask.any() else np.nan

    res = {}
    res['TPR'] = _positive_rate(Y)
    res['TPR_1'] = _positive_rate(Y & group)
    res['TPR_2'] = _positive_rate(Y & ~group)
    res['TPR_GAP'] = abs(res['TPR_1'] - res['TPR_2'])
    return res

def compute_TNR_GAP(preds, Y, group):
    """Compute the true negative rate overall, per group, and the group gap.

    Args:
        preds: Binary predictions, one per sample (bool or 0/1).
        Y: Binary ground-truth labels, one per sample (bool or 0/1).
        group: Binary group-membership indicator, one per sample (bool or 0/1).

    Returns:
        dict with keys 'TNR' (all negatives), 'TNR_1' (negatives inside
        ``group``), 'TNR_2' (negatives outside ``group``) and 'TNR_GAP'
        (absolute difference between 'TNR_1' and 'TNR_2'). A rate is NaN
        when there are no negative samples to compute it over.
    """
    # Coerce to positional boolean arrays: on integer 0/1 inputs `~Y` is a
    # bitwise NOT (-1/-2) and would be used as fancy indices, not as a mask.
    preds = np.asarray(preds).astype(bool)
    Y = np.asarray(Y).astype(bool)
    group = np.asarray(group).astype(bool)

    def _negative_rate(mask):
        # Share of negative predictions among the samples selected by `mask`;
        # NaN instead of a ZeroDivisionError when the selection is empty.
        return (~preds[mask]).mean() if mask.any() else np.nan

    res = {}
    res['TNR'] = _negative_rate(~Y)
    res['TNR_1'] = _negative_rate(~Y & group)
    res['TNR_2'] = _negative_rate(~Y & ~group)
    res['TNR_GAP'] = abs(res['TNR_1'] - res['TNR_2'])
    return res

def compute_ACC_GAP(preds, Y, group):
    """Compute accuracy overall, per group, and the gap between groups.

    Args:
        preds: Binary predictions, one per sample (bool or 0/1).
        Y: Binary ground-truth labels, one per sample (bool or 0/1).
        group: Binary group-membership indicator, one per sample (bool or 0/1).

    Returns:
        dict with keys 'ACC' (all samples), 'ACC_1' (samples inside
        ``group``), 'ACC_2' (samples outside ``group``) and 'ACC_GAP'
        (absolute difference between 'ACC_1' and 'ACC_2'). An accuracy is
        NaN when there are no samples to compute it over.
    """
    # Coerce to positional boolean arrays: an integer 0/1 `group` would be
    # treated as fancy indices and `~group` as a bitwise NOT (-1/-2).
    preds = np.asarray(preds).astype(bool)
    Y = np.asarray(Y).astype(bool)
    group = np.asarray(group).astype(bool)
    correct = preds == Y

    def _accuracy(mask):
        # Share of correct predictions among the samples selected by `mask`;
        # NaN instead of a ZeroDivisionError when the selection is empty.
        return correct[mask].mean() if mask.any() else np.nan

    res = {}
    res['ACC'] = correct.mean() if correct.size else np.nan
    res['ACC_1'] = _accuracy(group)
    res['ACC_2'] = _accuracy(~group)
    res['ACC_GAP'] = abs(res['ACC_1'] - res['ACC_2'])
    return res
    
def compute_EqOpp(preds, Y, group):
    """Return the TPR statistics used as the EqOpp metric.

    Thin wrapper: forwards all arguments to ``compute_TPR_GAP`` and returns
    its dict unchanged.
    """
    tpr_stats = compute_TPR_GAP(preds, Y, group)
    return tpr_stats

def compute_EqOd(preds, Y, group):
    """Return the TPR and TNR statistics merged into a single dict.

    Args:
        preds: Predictions, forwarded to the TPR/TNR helpers.
        Y: Ground-truth labels, forwarded to the TPR/TNR helpers.
        group: Group-membership indicator, forwarded to the TPR/TNR helpers.

    Returns:
        dict containing every key produced by ``compute_TPR_GAP`` and
        ``compute_TNR_GAP``.
    """
    # dict.update() mutates in place and returns None, so its result must not
    # be returned directly; update first, then return the merged dict.
    res = compute_TPR_GAP(preds, Y, group)
    res.update(compute_TNR_GAP(preds, Y, group))
    return res

def compute_AccDisp(preds, Y, group):
    """Return the accuracy statistics used as the AccDisp metric.

    Thin wrapper: forwards all arguments to ``compute_ACC_GAP`` and returns
    its dict unchanged.
    """
    acc_stats = compute_ACC_GAP(preds, Y, group)
    return acc_stats

def compute_fairness(preds,Y,group):
    """Collect the TPR, TNR and accuracy statistics into one dict.

    Calls ``compute_TPR_GAP``, ``compute_TNR_GAP`` and ``compute_ACC_GAP``
    in that order with the same arguments and merges their results.
    """
    res = {}
    for metric_fn in (compute_TPR_GAP, compute_TNR_GAP, compute_ACC_GAP):
        res.update(metric_fn(preds, Y, group))
    return res

In [None]:
#Helper function for evaluating different IP models
def eval_model(model, ruleMod, rules, X_tr, Y_tr, X_tst, Y_tst):
    """Solve a master model on a fixed rule pool and score the chosen rules.

    Args:
        model: Master problem object exposing ``addRule(rules)`` and
            ``solve(verbose=..., relax=...)``; ``solve`` must return a
            mapping with 'ruleSet' and 'obj' entries.
        ruleMod: Rule model exposing ``predict(X, ruleSet)``.
        rules: Candidate rules handed to ``model.addRule``.
        X_tr, Y_tr: Training features and labels.
        X_tst, Y_tst: Test features and labels.

    Returns:
        Tuple ``(res, preds)``. ``res`` is a dict with 'time' (wall-clock
        seconds spent in ``solve``), 'tr_acc', 'tst_acc', 'complexity'
        (number of selected rules plus the sum of their entries) and 'obj'.
        ``preds`` holds the test-set predictions.
    """
    # Initialize the model with the rule set
    model.addRule(rules)

    # Solve the model as an integer program (relax=False) and time it
    start = time.perf_counter()
    results = model.solve(verbose = True, relax = False)
    elapsed = time.perf_counter() - start

    fitRules = results['ruleSet']

    if len(fitRules) > 0:
        preds_tr = ruleMod.predict(X_tr, fitRules)
        preds = ruleMod.predict(X_tst, fitRules)
    else:
        # No rule selected: predict the negative class everywhere. The builtin
        # `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
        preds_tr = np.zeros(np.shape(Y_tr), dtype=bool)
        preds = np.zeros(np.shape(Y_tst), dtype=bool)

    tr_acc = np.mean(preds_tr == Y_tr)
    tst_acc = np.mean(preds == Y_tst)
    complexity = len(fitRules) + np.sum(fitRules)

    res = {'time': elapsed,
           'tr_acc': tr_acc,
           'tst_acc': tst_acc,
           'complexity': complexity,
           'obj': results['obj']
          }
    return res, preds

In [None]:
# Experiment: for every COMPAS fold, solve both master formulations on the
# same pool of pre-mined rules over a sweep of epsilon values, and record
# accuracy, complexity, solve time and fairness metrics for each run.
results = []
test_params = {
            'price_limit': 45,
            'train_limit': 300,
            'fixed_model_params': {
                'ruleGenerator': 'Hybrid',
                'masterSolver':'barrierCrossover',
                'numRulesToReturn': 100,
                'fairness_module': 'EqOfOp',
                'IP_time_limit': 600
            },
        }

# NOTE(review): this mapping is not referenced again in this cell; the group
# column is hard-coded through group_var below.
protected_features = {'compas': 'race', 
                      'adult': 'gender',
                      'default': 'X2'
                     }
group_var = 'race'

for i in range(6):
    print('***** FOLD %d ******'%i)
    # Load the pre-split, binarized train/test data for this fold
    train  = pd.read_csv('data_split/bin_'+'compas'+'_train_%d.csv'%i)
    test = pd.read_csv('data_split/bin_'+'compas'+'_test_%d.csv'%i)
    X_tr = train.drop('Y',axis=1).to_numpy()
    Y_tr = train['Y'].to_numpy()
    # NOTE(review): X_tst / Y_tst are not used again in this cell; the
    # eval_model calls below re-derive the arrays from `train` / `test`.
    X_tst = test.drop('Y',axis=1).to_numpy()
    Y_tst = test['Y'].to_numpy()
    
    # Pool the pre-mined rules of both rule sources into one candidate array
    isFirst = True
    for ruleset in ['faircg','rf']:
        print('***** Ruleset %s ******'%ruleset)
        new_rules = np.load('rules/'+ruleset+'_%s_fold_%d.npy'%('compas', i)).astype(int)
        if isFirst:
            rules = new_rules
            isFirst = False
        else:
            rules = np.concatenate([rules,new_rules])
    
    for eps in [0.01, 0.05, 0.1, 0.2, 1]:
        for C in [30]:
            print('***** EPS %f ******'%eps)

            # NOTE(review): dict.copy() is shallow, so 'fixed_model_params' is
            # still the same nested dict and the assignments below mutate it
            # in place across iterations.
            test_params = test_params.copy()
            # presumably eps is the fairness tolerance and C the complexity
            # budget -- confirm against the fairness module / master models
            test_params['fixed_model_params']['epsilon'] = eps
            test_params['fixed_model_params']['ruleComplexity'] = C
            test_params['fixed_model_params']['group'] = train[group_var].to_numpy()

            ruleMod = DNFRuleModel(X_tr, Y_tr)
            fairMod = EqualityOfOpportunity.EqualityOfOpportunity(test_params['fixed_model_params'])

            # Run 1: CompactDoubleSidedMaster, recorded under method 'hamming loss'
            res,preds = eval_model(CompactDoubleSidedMaster.CompactDoubleSidedMaster(ruleMod, fairMod, 
                                                                                     test_params['fixed_model_params']),
                                   ruleMod, 
                                   rules, 
                                   train.drop('Y',axis=1).to_numpy(), train['Y'].to_numpy(),
                                   test.drop('Y',axis=1).to_numpy(), test['Y'].to_numpy())

            res['dataset'] = 'compas'
            res['fold'] = i
            res['eps'] = eps
            res['fairMet'] = 'EqOp'
            res['C'] = C
            res['method'] = 'hamming loss'
            # Add the test-set TPR/TNR/ACC statistics and their group gaps
            res.update(compute_fairness(preds,test['Y'],test[group_var]))
            results.append(res)

            # Run 2: fresh rule model but the same fairMod instance, solved with
            # ZeroOneDoubleSidedMaster and recorded under method 'accuracy'
            ruleMod = DNFRuleModel(X_tr, Y_tr)
            res,preds = eval_model(ZeroOneDoubleSidedMaster.ZeroOneDoubleSidedMaster(ruleMod, fairMod, 
                                                                                     test_params['fixed_model_params']),
                                   ruleMod, 
                                   rules, 
                                   train.drop('Y',axis=1).to_numpy(), train['Y'].to_numpy(),
                                   test.drop('Y',axis=1).to_numpy(), test['Y'].to_numpy())
            


            res['dataset'] = 'compas'
            res['fold'] = i
            res['eps'] = eps
            res['fairMet'] = 'EqOp'
            res['C'] = C
            res['method'] = 'accuracy'
            res.update(compute_fairness(preds,test['Y'],test[group_var]))
            results.append(res)
            
# One row per (fold, eps, method) run; `results` is rebound from list to DataFrame
results = pd.DataFrame.from_records(results)
results.to_csv('01_vs_Hamming_fair2.csv')

In [None]:
# Experiment: for every COMPAS fold, solve both master formulations on the
# same pool of pre-mined rules at a single epsilon (0.025), and record
# accuracy, complexity, solve time and fairness metrics for each run.
# Rebinding `results` here discards whatever it held before this cell ran.
results = []
test_params = {
            'price_limit': 45,
            'train_limit': 300,
            'fixed_model_params': {
                'ruleGenerator': 'Hybrid',
                'masterSolver':'barrierCrossover',
                'numRulesToReturn': 100,
                'fairness_module': 'EqOfOp',
                'IP_time_limit': 600
            },
        }

# NOTE(review): this mapping is not referenced again in this cell; the group
# column is hard-coded through group_var below.
protected_features = {'compas': 'race', 
                      'adult': 'gender',
                      'default': 'X2'
                     }
group_var = 'race'

for i in range(6):
    print('***** FOLD %d ******'%i)
    # Load the pre-split, binarized train/test data for this fold
    train  = pd.read_csv('data_split/bin_'+'compas'+'_train_%d.csv'%i)
    test = pd.read_csv('data_split/bin_'+'compas'+'_test_%d.csv'%i)
    X_tr = train.drop('Y',axis=1).to_numpy()
    Y_tr = train['Y'].to_numpy()
    # NOTE(review): X_tst / Y_tst are not used again in this cell; the
    # eval_model calls below re-derive the arrays from `train` / `test`.
    X_tst = test.drop('Y',axis=1).to_numpy()
    Y_tst = test['Y'].to_numpy()
    
    # Pool the pre-mined rules of both rule sources into one candidate array
    isFirst = True
    for ruleset in ['faircg','rf']:
        print('***** Ruleset %s ******'%ruleset)
        new_rules = np.load('rules/'+ruleset+'_%s_fold_%d.npy'%('compas', i)).astype(int)
        if isFirst:
            rules = new_rules
            isFirst = False
        else:
            rules = np.concatenate([rules,new_rules])
    
    for eps in [0.025]:
        for C in [30]:
            print('***** EPS %f ******'%eps)

            # NOTE(review): dict.copy() is shallow, so 'fixed_model_params' is
            # still the same nested dict and the assignments below mutate it
            # in place across iterations.
            test_params = test_params.copy()
            # presumably eps is the fairness tolerance and C the complexity
            # budget -- confirm against the fairness module / master models
            test_params['fixed_model_params']['epsilon'] = eps
            test_params['fixed_model_params']['ruleComplexity'] = C
            test_params['fixed_model_params']['group'] = train[group_var].to_numpy()

            ruleMod = DNFRuleModel(X_tr, Y_tr)
            fairMod = EqualityOfOpportunity.EqualityOfOpportunity(test_params['fixed_model_params'])

            # Run 1: CompactDoubleSidedMaster, recorded under method 'hamming loss'
            res,preds = eval_model(CompactDoubleSidedMaster.CompactDoubleSidedMaster(ruleMod, fairMod, 
                                                                                     test_params['fixed_model_params']),
                                   ruleMod, 
                                   rules, 
                                   train.drop('Y',axis=1).to_numpy(), train['Y'].to_numpy(),
                                   test.drop('Y',axis=1).to_numpy(), test['Y'].to_numpy())

            res['dataset'] = 'compas'
            res['fold'] = i
            res['eps'] = eps
            res['fairMet'] = 'EqOp'
            res['C'] = C
            res['method'] = 'hamming loss'
            # Add the test-set TPR/TNR/ACC statistics and their group gaps
            res.update(compute_fairness(preds,test['Y'],test[group_var]))
            results.append(res)

            # Run 2: fresh rule model but the same fairMod instance, solved with
            # ZeroOneDoubleSidedMaster and recorded under method 'accuracy'
            ruleMod = DNFRuleModel(X_tr, Y_tr)
            res,preds = eval_model(ZeroOneDoubleSidedMaster.ZeroOneDoubleSidedMaster(ruleMod, fairMod, 
                                                                                     test_params['fixed_model_params']),
                                   ruleMod, 
                                   rules, 
                                   train.drop('Y',axis=1).to_numpy(), train['Y'].to_numpy(),
                                   test.drop('Y',axis=1).to_numpy(), test['Y'].to_numpy())
            


            res['dataset'] = 'compas'
            res['fold'] = i
            res['eps'] = eps
            res['fairMet'] = 'EqOp'
            res['C'] = C
            res['method'] = 'accuracy'
            res.update(compute_fairness(preds,test['Y'],test[group_var]))
            results.append(res)
            
# One row per (fold, method) run; `results` is rebound from list to DataFrame
results = pd.DataFrame.from_records(results)
results.to_csv('01_vs_Hamming_fair_feb3.csv')

# BONUS (OLD PLOTS)

In [None]:
# Average accuracy, complexity and solve time for every (method, eps) pair
agg_results = (
    results
    .groupby(['method', 'eps'])
    .agg(dict.fromkeys(['tr_acc', 'tst_acc', 'complexity', 'time'], 'mean'))
    .reset_index()
)

In [None]:
# Line up both methods per eps: after the merge the `_x` columns come from the
# 'accuracy' rows and the `_y` columns from the 'hamming loss' rows.
acc_rows = agg_results.query("method == 'accuracy'")
ham_rows = agg_results.query("method == 'hamming loss'")
side_by_side = acc_rows.merge(ham_rows, on='eps')
side_by_side = side_by_side.assign(
    acc_diff=side_by_side['tr_acc_x'] - side_by_side['tr_acc_y'],
    acc_diff_tst=side_by_side['tst_acc_x'] - side_by_side['tst_acc_y'],
)

In [None]:
# Mean training accuracy against eps, one line per method
sns.relplot(
    kind='line',
    data=agg_results,
    x='eps',
    y='tr_acc',
    hue='method',
)

In [None]:
# Mean test accuracy against eps, one line per method
sns.relplot(
    kind='line',
    data=agg_results,
    x='eps',
    y='tst_acc',
    hue='method',
)

In [None]:
# Mean solve time against eps, one line per method
sns.relplot(
    kind='line',
    data=agg_results,
    x='eps',
    y='time',
    hue='method',
)

In [None]:
# Training-accuracy difference ('accuracy' minus 'hamming loss') against eps
sns.relplot(
    kind='line',
    data=side_by_side,
    x='eps',
    y='acc_diff',
)

In [None]:
# Test-accuracy difference ('accuracy' minus 'hamming loss') against eps
sns.relplot(
    kind='line',
    data=side_by_side,
    x='eps',
    y='acc_diff_tst',
)

## Integrality Gap Experiments

In [None]:
def eval_model2(method, model, ruleMod, rules, X_tr, Y_tr, X_tst, Y_tst, relax):
    """Time one solve of ``model`` on ``rules`` and report its objective.

    Args:
        method: Label stored under 'model' in the result.
        model: Object exposing ``addRule(rules)`` and
            ``solve(verbose=..., relax=...)``; ``solve`` must return a
            mapping with an 'obj' entry.
        ruleMod, X_tr, Y_tr, X_tst, Y_tst: Accepted but not used here.
        rules: Candidate rules handed to ``model.addRule``.
        relax: Forwarded to ``model.solve`` as its ``relax`` argument.

    Returns:
        dict with 'model' (the label), 'time' (wall-clock seconds spent in
        ``solve``) and 'obj' (the reported objective value).
    """
    model.addRule(rules)

    t0 = time.perf_counter()
    solution = model.solve(verbose=True, relax=relax)
    elapsed = time.perf_counter() - t0

    return {'model': method, 'time': elapsed, 'obj': solution['obj']}

In [None]:
# Settings for the integrality gap experiment.
# presumably n / n_test are the train / test sample counts -- confirm
# against generate_data
num_features, n, n_test = 10, 2000, 500
num_rounds = 50
# Ten evenly spaced class-imbalance levels from 0.01 to 0.99
class_imbalances = np.linspace(start=0.01, stop=0.99, num=10)

In [None]:
# Integrality gap experiment: for every class-imbalance level and trial, draw
# a fresh dataset and solve the same master problem twice on a fixed rule
# pool -- once as the LP relaxation (relax=True), once as the IP (relax=False).
rules = generate_rules(num_features)
results = []
for imb in class_imbalances:
    print('**** CLASS IMBALANCE %f ****'%imb)
    for i in range(num_rounds):
        train_data, test_data = generate_data(n, num_features, imb, n_test)
        X_tr, Y_tr = train_data.drop('Y', axis=1).to_numpy(), train_data['Y'].to_numpy()
        X_tst, Y_tst = test_data.drop('Y', axis=1).to_numpy(), test_data['Y'].to_numpy()

        # One fairness module per trial, shared by the LP and the IP solve
        fairMod = NoFair.NoFair({})

        for method, relax in (('LP', True), ('IP', False)):
            # Each solve gets its own rule model built from the same data
            ruleMod = DNFRuleModel(X_tr, Y_tr)
            master = CompactOneSidedMaster.CompactOneSidedMaster(ruleMod, fairMod, {'ruleComplexity': 20})
            res = eval_model2(method, master, ruleMod, rules,
                              X_tr, Y_tr, X_tst, Y_tst, relax)

            res['imbalance'] = imb
            res['trial'] = i
            results.append(res)

# Two rows (LP, IP) per trial; `results` is rebound from list to DataFrame
results = pd.DataFrame.from_records(results)
results.to_csv('integrality_gap.csv')

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Reload the saved integrality gap runs from disk
results = pd.read_csv(filepath_or_buffer='integrality_gap.csv')

In [None]:
# Pair each trial's LP and IP runs: after the merge the `_x` columns come from
# the LP rows and the `_y` columns from the IP rows, so the ratio is IP / LP.
lp_runs = results.query("model == 'LP'")
ip_runs = results.query("model == 'IP'")
gap = lp_runs.merge(ip_runs, on=['trial', 'imbalance'])
gap = gap.assign(integrality_gap=gap['obj_y'] / gap['obj_x'])

# Mean integrality gap per class-imbalance level
imb_init = gap.groupby('imbalance')['integrality_gap'].mean().reset_index()

In [None]:
# Distribution of the integrality gap over all trials.
# `distplot` has been deprecated since seaborn 0.11 and is scheduled for
# removal; `histplot` with a KDE overlay on a density scale is its documented
# replacement. Fall back to `distplot` on older seaborn versions without it.
if hasattr(sns, 'histplot'):
    sns.histplot(gap['integrality_gap'], kde=True, stat='density')
else:
    sns.distplot(gap['integrality_gap'])

In [None]:
# Mean integrality gap against class imbalance. The aggregated frame is named
# `imb_init` where it is built; the previous `imb_int` raised a NameError.
sns.relplot(data = imb_init, x = 'imbalance', y='integrality_gap',kind='line')