In [1]:
import numpy as np
import pandas as pd
from collections import Counter

######################## PATH CONFIGURATION: edit `folder` only #####################
import sys

## SET `folder` to the project directory path. It must end with a path separator,
## because the notebook builds every data path by plain concatenation (folder+"data/...").
folder = "C:\\evdoxiataka\\effi_user_study1\\"

## The code directory sits inside the project folder, so it is derived from `folder`
## instead of being configured as a second hard-coded path that has to be kept in sync.
code_path = folder + "code"

## Make the code directory importable (the `utils` imports that follow rely on it).
## The membership check keeps sys.path free of duplicates when this cell is re-run.
if code_path not in sys.path:
    sys.path.insert(1, code_path)

from utils.data_loading import get_all_participants_logs, get_all_participants_feedback
from utils.utils import attributes_names_mapping, manipulate_categ_values, binning
from utils.training import oneoff_training_evaluation, iml_training_evaluation

import warnings
from pandas.errors import SettingWithCopyWarning

## Silence pandas' SettingWithCopyWarning for the rest of the session.
warnings.simplefilter("ignore", SettingWithCopyWarning)

## Settings shared by every training cell below.
## `fs` is handed unchanged to each *_training_evaluation call; its meaning is
## defined in utils.training and is not visible from this notebook.
fs = 0.5
## Sensitive attributes = the keys of the attribute-name mapping.
sensitive_attrs = [*attributes_names_mapping]

## Data loading

In [2]:
## Read the pre-processed training, validation and test sets from disk.
train_df_train = pd.read_csv(folder+'data/processed_data/train_df_train.csv', sep=',')
train_df_test = pd.read_csv(folder+'data/processed_data/train_df_test.csv', sep=',')
test_df = pd.read_csv(folder+'data/processed_data/test_df_final.csv', sep=',')

## Split each frame into its feature columns (everything except the label
## 'TARGET' and the application id 'SK_ID_CURR') and a one-column label frame.
train_is_feature = ~train_df_train.columns.isin(['TARGET', 'SK_ID_CURR'])
train_is_label = train_df_train.columns == 'TARGET'
X_train_original = train_df_train.loc[:, train_is_feature]
y_train_original = train_df_train.loc[:, train_is_label]

test_is_feature = ~train_df_test.columns.isin(['TARGET', 'SK_ID_CURR'])
test_is_label = train_df_test.columns == 'TARGET'
X_test_original = train_df_test.loc[:, test_is_feature] ## dataset used for accuracy
y_test_original = train_df_test.loc[:, test_is_label]

## Feature columns, in the exact order they appear in the training frame.
training_attr_in_order = list(X_train_original.columns)

## Descriptive name of each feature, in the same order. Columns ending in '_LE'
## are looked up in the mapping under their name without that 3-character suffix.
training_attr_descriptive_in_order = []
for attr in training_attr_in_order:
    lookup_key = attr[:-3] if attr.endswith('_LE') else attr
    training_attr_descriptive_in_order.append(attributes_names_mapping[lookup_key])

## Load the original test set to be used for the fairness metrics.
train_df = pd.read_csv(folder+'data/processed_data/train_df.csv', sep=',')
test_df_app_ids = train_df_test['SK_ID_CURR'].tolist()

## Keep only the applications whose id appears in train_df_test ...
in_test_split = train_df['SK_ID_CURR'].isin(test_df_app_ids)
train_df_test_or = train_df[in_test_split]
## ... then put them in the same id order as train_df_test while keeping the
## row labels they had in train_df:
train_df_test_or = train_df_test_or.reset_index()            ## row labels -> 'index' column
train_df_test_or = train_df_test_or.set_index('SK_ID_CURR')  ## index by application id
train_df_test_or = train_df_test_or.loc[test_df_app_ids]     ## reorder to the test-id order
train_df_test_or = train_df_test_or.reset_index()            ## application id back to a column
train_df_test_or = train_df_test_or.set_index('index')       ## restore the original row labels
train_df_test_or = train_df_test_or.rename_axis(None)        ## drop the leftover axis name

## The return value is discarded, so this presumably edits the frame in place
## (confirm in utils.utils).
manipulate_categ_values(train_df_test_or)
train_df_test_bin = binning(train_df_test_or, train_df_test)

## Load the applications' data shown to participants through the UI.
applications_df = pd.read_csv(folder+"data/processed_data/Applications_Converted.csv", index_col=0)

## LOAD FEEDBACK
## Forward slashes are used here, as for every other data path in this notebook,
## so the paths still resolve when `folder` points to a non-Windows location
## (Windows accepts '/' as a separator as well).
prolific_export_filePath = folder+"data/collected_data/prolific_export_demographics.csv"
interaction_logs_filePath = folder+"data/collected_data/LOGS/"
logs_df = get_all_participants_logs(prolific_export_filePath, interaction_logs_filePath)
feedback_df = get_all_participants_feedback(logs_df)
## Remove both bound columns in a single, non-in-place call.
feedback_df = feedback_df.drop(columns=["Lower bound", "Upper bound"])

## set new proposed distribution of feature weights in Value column
## get initial distribution
## applications_df holds (name, weight) pairs in the columns ft_name_<i> / ft_weight_<i>;
## only the first row of each column is read here.
n_ui_features = 49  ## number of ft_name_<i> / ft_weight_<i> column pairs that are read
feature_weights_dict = {}
for i in range(n_ui_features):
    feat_name = applications_df['ft_name_'+str(i)].tolist()[0]
    feature_weights_dict[feat_name] = applications_df['ft_weight_'+str(i)].tolist()[0]
## initial weights, reordered to follow the training column order
feature_weights = [feature_weights_dict[feat] for feat in training_attr_descriptive_in_order]

## calculate new distribution
all_new_values = []
pred_decision = []
## Walk the two columns positionally. A label lookup (`.loc[i]`) would return a
## Series instead of a scalar if the index of feedback_df contained repeated labels.
for app_id, value in zip(feedback_df['App ID'], feedback_df['Value']):
    ## PREDICTED DECISION: first row of applications_df with a matching application id
    is_this_app = applications_df["Application_id"] == app_id
    pred_decision.append(applications_df[is_this_app]["Predicted_decision"].tolist()[0])
    ##
    if value:
        ## start from the initial weights and overwrite the ones the participant changed
        new_value = feature_weights.copy()
        for v_dict in value:
            attr = v_dict['attribute']
            new_value[training_attr_descriptive_in_order.index(attr)] = np.float64(v_dict['value'])
        ## normalize; the total is computed once instead of once per element
        total = sum(new_value)
        new_value = [v/total for v in new_value]
    else:
        ## no proposed weight changes for this feedback instance
        new_value = []
    all_new_values.append(new_value)
feedback_df['Value'] = all_new_values
feedback_df['PredictedDecision'] = pred_decision

## FEEDBACK without duplicate instances of same application per participant
## For every (participant 'ID', 'App ID') pair only the last row, in the current
## row order of feedback_df, is kept. drop_duplicates returns a new frame, so
## feedback_df itself is left untouched.
feedback_df_no_app_dupl_per_part = feedback_df.drop_duplicates(subset=['ID', 'App ID'], keep='last')
## NOTE(review): none of the training cells visible in this notebook reads this
## frame; they all receive feedback_df. Confirm whether that is intended.

## TRAINING

### Global Model - Labels ('fair'+'unfair')
The XGBoost classifier is retrained. The feedback instances of all participants are added to the training set all at once.

**All the 'fair' and 'unfair'** instances labelled by participants are used.

We set fs = 0.5.

In [3]:
## Global model trained on the labels only.
oneoff_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    False, False,   ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/global/Labels/",
    'global-Labels',
)

participant 0
predictedDecision Accepted
feed_label fair


### Global Model - Labels ('unfair')
The XGBoost classifier is retrained. The feedback instances of all participants are added to the training set all at once.

**Only the 'unfair'** instances labelled by participants are used.

We set fs = 0.5.

In [None]:
## Global model trained on the 'unfair' labels only.
oneoff_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    True, False,    ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/global/Labels_Unfair/",
    'global-Labels_Unfair',
)

### Global Model - Labels+Weights ('fair'+'unfair')
The XGBoost classifier is retrained. The feedback instances of all participants are added to the training set all at once.

**All the 'fair' and 'unfair'** instances labelled by participants **with proposed weight changes** are used.

We set fs = 0.5.

In [None]:
## Global model trained on the labels plus the proposed weight changes.
oneoff_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    False, True,    ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/global/Labels+Weights/",
    'global-Labels+Weights',
)

### Personalized Models - Labels ('fair'+'unfair')
58 XGBoost classifiers will be trained, one for each participant independently. The feedback instances of each participant will be added to the training set incrementally, one at a time, in order of increasing timestamp (the previous feedback instances of a participant remain in the training set as we add a new one). The XGBoost classifier is retrained from scratch in every incremental step (iteration).

**All the 'fair' and 'unfair'** instances labelled by participants are used.

We set fs = 0.5.

In [None]:
## Personalized models trained on the labels only.
iml_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    False, False,   ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/personalized/Labels/",
    'personalized-Labels',
)

### Personalized Models - Labels ('unfair')
58 XGBoost classifiers will be trained, one for each participant independently. The feedback instances of each participant will be added to the training set incrementally, one at a time, in order of increasing timestamp (the previous feedback instances of a participant remain in the training set as we add a new one). The XGBoost classifier is retrained from scratch in every incremental step (iteration).

**Only the 'unfair'** instances labelled by participants are used.

We set fs = 0.5.

In [None]:
## Personalized models trained on the 'unfair' labels only.
iml_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    True, False,    ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/personalized/Labels_Unfair/",
    'personalized-Labels_Unfair',
)

### Personalized Models - Labels+Weights ('fair'+'unfair')
The feedback instances of each participant will be added to the training set incrementally, one at a time, in order of increasing timestamp (the previous feedback instances of a participant remain in the training set as we add a new one). The XGBoost classifier is retrained from scratch in every incremental step (iteration).

**All the 'fair' and 'unfair'** labelled instances with proposed weight changes are used. The participant's weights for each instance are used to set the feature weights for training.

We set fs = 0.5.

In [None]:
## Personalized models trained on the labels plus the proposed weight changes.
iml_training_evaluation(
    X_train_original, y_train_original,   ## training features / labels
    X_test_original, y_test_original,     ## dataset used for accuracy
    train_df_test_bin, test_df,
    sensitive_attrs, fs, feedback_df,
    False, True,    ## presumably (only-'unfair', use-proposed-weights) flags; confirm in utils.training
    folder+"data/results/personalized/Labels+Weights/",
    'personalized-Labels+Weights',
)