In [31]:
import pandas as pd
import joblib
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [32]:
model_reg = joblib.load('all_features_xgb.joblib')

In [79]:
data = pd.read_csv('data/all_features.csv', index_col=[0,1])

In [80]:
data[["statistical_parity", "equal_opportunity", "average_odds"]] = data[
    ["statistical_parity", "equal_opportunity", "average_odds"]
].abs()

In [81]:
train, test = train_test_split(data, test_size=0.2, shuffle=False)

In [82]:
model_reg.fit(train.drop(columns=['statistical_parity', 'equal_opportunity', 'average_odds']).values, train[['statistical_parity', 'equal_opportunity', 'average_odds']].values)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [84]:
test = test[test.index.get_level_values(1) != 'german']

In [85]:
predictions = model_reg.predict(test.drop(columns=['statistical_parity', 'equal_opportunity', 'average_odds']).values)

In [86]:
true_val = test[['statistical_parity', 'equal_opportunity', 'average_odds']]

In [87]:
true_val.loc[:,'stat_par_pred'] = predictions[:, 0]
true_val.loc[:,'eq_opp_pred'] = predictions[:, 1]
true_val.loc[:,'avg_odds_pred'] = predictions[:, 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [88]:
true_val.index.get_level_values(1).unique()

Index(['health', 'hearth', 'law', 'medical', 'obesity', 'park', 'resyduo',
       'student', 'wine'],
      dtype='object', name='data')

In [98]:
true_val[true_val.index.get_level_values(1) == 'law'].iloc[:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,statistical_parity,equal_opportunity,average_odds,stat_par_pred,eq_opp_pred,avg_odds_pred
variable,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
race,law,0.295092,0.166591,0.534723,0.236775,0.083782,0.478464
gender,law,0.270576,0.014035,0.506689,0.212589,0.102538,0.219868
parttime,law,0.332319,0.075809,0.709497,0.162121,0.089413,0.391281
pass_bar,law,0.349575,0.204255,0.732304,0.284753,0.153597,0.424489


In [89]:
r2_score(true_val['statistical_parity'], true_val['stat_par_pred'])

0.6349051626154033

In [90]:
r2_score(true_val['equal_opportunity'], true_val['eq_opp_pred'])

0.15499645238966775

In [91]:
r2_score(true_val['average_odds'], true_val['avg_odds_pred'])

0.22822383541575042

## Change the problem into classification with threshold

In [99]:
model = XGBClassifier()
logreg = LogisticRegression()

In [100]:
train.loc[train["statistical_parity"] > 0.2, "statistical_parity"] = 1
train.loc[train["statistical_parity"] != 1, "statistical_parity"] = 0
train.loc[train["equal_opportunity"] > 0.2, "equal_opportunity"] = 1
train.loc[train["equal_opportunity"] != 1, "equal_opportunity"] = 0
train.loc[train["average_odds"] > 0.2, "average_odds"] = 1
train.loc[train["average_odds"] != 1, "average_odds"] = 0

In [101]:
test.loc[test["statistical_parity"] > 0.2, "statistical_parity"] = 1
test.loc[test["statistical_parity"] != 1, "statistical_parity"] = 0
test.loc[test["equal_opportunity"] > 0.2, "equal_opportunity"] = 1
test.loc[test["equal_opportunity"] != 1, "equal_opportunity"] = 0
test.loc[test["average_odds"] > 0.2, "average_odds"] = 1
test.loc[test["average_odds"] != 1, "average_odds"] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [102]:
test = test[test.index.get_level_values(1) != 'german']

In [103]:
model.fit(train.drop(columns=['statistical_parity', 'equal_opportunity', 'average_odds']).values, train[['statistical_parity', 'equal_opportunity', 'average_odds']].values)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [104]:
predictions = model.predict(
    test.drop(
        columns=["statistical_parity", "equal_opportunity", "average_odds"]
    ).values
)

In [105]:
true_val = test[["statistical_parity", "equal_opportunity", "average_odds"]]
true_val.loc[:, "stat_par_pred"] = predictions[:, 0]
true_val.loc[:, "eq_opp_pred"] = predictions[:, 1]
true_val.loc[:, "avg_odds_pred"] = predictions[:, 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [106]:
from sklearn.metrics import accuracy_score

accuracy_score(true_val["statistical_parity"], true_val["stat_par_pred"])

0.9290681502086231

In [107]:
accuracy_score(true_val["equal_opportunity"], true_val["eq_opp_pred"])

0.8303198887343533

In [108]:
accuracy_score(true_val["average_odds"], true_val["avg_odds_pred"])

0.5910987482614742

In [92]:
true_val.index.get_level_values(1).unique()

Index(['health', 'hearth', 'law', 'medical', 'obesity', 'park', 'resyduo',
       'student', 'wine'],
      dtype='object', name='data')

In [113]:
true_val['stat_par_pred'].value_counts()

0.0    513
1.0    206
Name: stat_par_pred, dtype: int64

In [109]:
true_val[true_val.index.get_level_values(1) == "law"].iloc[:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,statistical_parity,equal_opportunity,average_odds,stat_par_pred,eq_opp_pred,avg_odds_pred
variable,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
race,law,1.0,0.0,1.0,1.0,0.0,1.0
gender,law,1.0,0.0,1.0,1.0,0.0,0.0
parttime,law,1.0,0.0,1.0,0.0,0.0,1.0
pass_bar,law,1.0,1.0,1.0,1.0,0.0,1.0
