In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fairlearn.metrics import MetricFrame, selection_rate

########################SET folder to project directory path#####################
folder=""

## Loading Training - Test Sets

In [None]:
# Model predictions for the test applications.
test_df = pd.read_csv(folder+'data/processed_data/Model_Predictions.csv', delimiter=',')

# Encode the predicted label as 1 (Accepted) / 0 (Rejected).
# An explicit lookup is used so that an unexpected label fails loudly here instead of
# silently ending up in the fairness metrics.
label_codes = {'Accepted': 1, 'Rejected': 0}
unexpected_labels = set(test_df['Predicted_Result'].unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(f"Unexpected values in Predicted_Result: {sorted(map(str, unexpected_labels))}")
predictions_test = [label_codes[label] for label in test_df['Predicted_Result']]

# Additional processed inputs: the final test frame and the feature-importance table.
test_df_final = pd.read_csv(folder+'data/processed_data/test_df_final.csv', delimiter=',')
ft_df = pd.read_csv(folder+'data/processed_data/FeatureImportance.csv', delimiter=',')

## Preparation of data (1000 applications) that will be shown to participants through the UI

The applications in test set (test_df) constitute the data that will be shown to participants in user study 1.

The predicted label, the prediction confidence, and the importance weight, indegree and outdegree of each variable (feature) are added here, for each application, to the data that will be shown to participants.

In [None]:
## Recode raw flags/codes into the human-readable labels (one replace call for all columns)
no_yes_letters = {"N": 'No', "Y": 'Yes'}
no_yes_digits = {0: 'No', 1: 'Yes'}
same_different = {0: 'Same', 1: 'Different'}

column_recodings = {
    'NAME_CONTRACT_TYPE': {'Cash loans': 'Fixed', 'Revolving loans': 'Not Fixed'},
    'FLAG_OWN_CAR': no_yes_letters,
    'FLAG_OWN_REALTY': no_yes_letters,
    'FLAG_MOBIL': no_yes_digits,
    'FLAG_EMP_PHONE': no_yes_digits,
    'FLAG_WORK_PHONE': no_yes_digits,
    'FLAG_CONT_MOBILE': no_yes_digits,
    'FLAG_PHONE': no_yes_digits,
    'FLAG_EMAIL': no_yes_digits,
    'REG_REGION_NOT_LIVE_REGION': same_different,
    'REG_REGION_NOT_WORK_REGION': same_different,
    'LIVE_REGION_NOT_WORK_REGION': same_different,
    'REG_CITY_NOT_LIVE_CITY': same_different,
    'REG_CITY_NOT_WORK_CITY': same_different,
    'LIVE_CITY_NOT_WORK_CITY': same_different,
    'CODE_GENDER': {"F": 'Female', "M": 'Male'},
    # -1001 in YEARS_EMPLOYED is turned into a missing value.
    'YEARS_EMPLOYED': {-1001: np.nan},
}

# Nested mapping: for each column, replace the listed values only within that column.
test_df.replace(column_recodings, inplace=True)

test_df

In [None]:
# Save the recoded frame as CSV, leaving out the row index.
test_df.to_csv(folder+"data/processed_data/Model_Predictions_Converted.csv", index=False)

### Some checks

In [None]:
## Sanity check: print the distinct values of every non-numeric column
df = test_df.select_dtypes(exclude=["number"])
for col, column_values in df.items():
    print(column_values.unique())

In [None]:
test_df

In [None]:
# Names of the non-numeric columns, ignoring the first four columns of test_df
# (presumably identifier/prediction columns rather than features — TODO confirm).
categorical_features = test_df.iloc[:,4:].select_dtypes(exclude=["number"]).columns
categorical_features

In [None]:
test_df.iloc[:,4:].select_dtypes(include=["number"])

In [None]:
# Names of the numeric columns, ignoring the first four columns of test_df
# (presumably identifier/prediction columns rather than features — TODO confirm).
numerical_features = test_df.iloc[:,4:].select_dtypes(include=["number"]).columns

In [None]:
test_df.iloc[:,4:].columns

### Fairness Metrics (Demographic Parity - Consistency)
Binning is used for the calculation of the fairness metrics.

Fairlearn library is used.

The fairness metrics are calculated per feature.

In [None]:
# Maps feature name -> demographic-parity ratio; one entry is added per feature section.
dp_ratio_features = {}

#### 1. NAME_CONTRACT_TYPE

In [None]:
test_df["NAME_CONTRACT_TYPE"].unique()

In [None]:
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate

# Selection rate (fraction of predictions equal to 1) for each NAME_CONTRACT_TYPE value.
# selection_rate ignores y_true, so the predictions are passed for both arguments.
sr_contract_type = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["NAME_CONTRACT_TYPE"])
display(sr_contract_type.by_group)

In [None]:
# Selection rate per contract type, in whole percent.
# Keys are the exact category names (no trailing spaces) so the tick labels match the data.
contract_type_risk_groups = {
    contract_type: round(100 * sr_contract_type.by_group.loc[contract_type])
    for contract_type in ("Fixed", "Not Fixed")
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(contract_type_risk_groups), list(contract_type_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # print the value on top of each bar
# NOTE(review): the title says "Selection Rate" while the y-axis says "Risk Rate";
# the plotted value is the rate of predictions equal to 1 — confirm the intended wording.
ax.set_title("Selection Rate vs NAME_CONTRACT_TYPE")
ax.set_xlabel('NAME_CONTRACT_TYPE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
contract_type_rates = list(contract_type_risk_groups.values())
print("The dp difference for NAME_CONTRACT_TYPE is ", max(contract_type_rates) - min(contract_type_rates))

contract_type_dp = round(min(contract_type_rates) / max(contract_type_rates), 2)
print("The dp ratio for NAME_CONTRACT_TYPE is ", contract_type_dp)

dp_ratio_features["NAME_CONTRACT_TYPE"] = contract_type_dp

In [None]:
dp_ratio_features

#### 2. CODE_GENDER

In [None]:
test_df["CODE_GENDER"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each CODE_GENDER value.
sr_gender = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["CODE_GENDER"])
display(sr_gender.by_group)

In [None]:
# Selection rate per gender, in whole percent.
# Keys are the exact category names (no trailing space) so the tick labels match the data.
gender_risk_groups = {
    gender: round(100 * sr_gender.by_group.loc[gender])
    for gender in ("Female", "Male")
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(gender_risk_groups), list(gender_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs CODE_GENDER")
ax.set_xlabel('CODE_GENDER')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
gender_rates = list(gender_risk_groups.values())
print("The dp difference for CODE_GENDER is ", max(gender_rates) - min(gender_rates))

gender_dp = round(min(gender_rates) / max(gender_rates), 2)
print("The dp ratio for CODE_GENDER is ", gender_dp)

dp_ratio_features["CODE_GENDER"] = gender_dp

In [None]:
dp_ratio_features

#### 3. FLAG_OWN_CAR

In [None]:
test_df["FLAG_OWN_CAR"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each FLAG_OWN_CAR value.
sr_car = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_OWN_CAR"])
display(sr_car.by_group)

In [None]:
# Selection rate per car-ownership answer, in whole percent.
car_risk_groups = {
    answer: round(100 * sr_car.by_group.loc[answer])
    for answer in ("No", "Yes")
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(car_risk_groups), list(car_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs FLAG_OWN_CAR")
ax.set_xlabel('FLAG_OWN_CAR')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
car_rates = list(car_risk_groups.values())
print("The dp difference for FLAG_OWN_CAR is ", max(car_rates) - min(car_rates))

car_dp = round(min(car_rates) / max(car_rates), 2)
print("The dp ratio for FLAG_OWN_CAR is ", car_dp)

dp_ratio_features["FLAG_OWN_CAR"] = car_dp

In [None]:
dp_ratio_features

####  4. FLAG_OWN_REALTY

In [None]:
test_df["FLAG_OWN_REALTY"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each FLAG_OWN_REALTY value.
sr_realty = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_OWN_REALTY"])
display(sr_realty.by_group)

In [None]:
# Selection rate per realty-ownership answer, in whole percent.
realty_risk_groups = {
    answer: round(100 * sr_realty.by_group.loc[answer])
    for answer in ("No", "Yes")
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(realty_risk_groups), list(realty_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs FLAG_OWN_REALTY")
ax.set_xlabel('FLAG_OWN_REALTY')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
realty_rates = list(realty_risk_groups.values())
print("The dp difference for FLAG_OWN_REALTY is ", max(realty_rates) - min(realty_rates))

realty_dp = round(min(realty_rates) / max(realty_rates), 2)
print("The dp ratio for FLAG_OWN_REALTY is ", realty_dp)

dp_ratio_features["FLAG_OWN_REALTY"] = realty_dp

In [None]:
dp_ratio_features

#### 5. CNT_CHILDREN

Each Selection

In [None]:
# Selection rate (fraction of predictions equal to 1) for each distinct CNT_CHILDREN value.
sr_children = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["CNT_CHILDREN"])
display(sr_children.by_group)

In [None]:
# Selection rate (in whole percent) per number-of-children bin.
# The bins are built on the raw column so that every application falls into exactly one
# bin (upper edges inclusive: 0-2, 3-5, 6-8) and a bin's rate is the mean prediction of
# the applications in it, i.e. larger groups weigh more than smaller ones.
# Applications with more than 8 children are outside the bins and are left out.
children_bins = pd.cut(
    test_df["CNT_CHILDREN"],
    bins=[-np.inf, 2, 5, 8],
    labels=["0-2", "3-5", "6-8"],
)
children_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(children_bins, observed=False)  # keep empty bins (rate = NaN)
    .mean()
)

cnt_children_risk_groups = (100 * children_rates).round().to_dict()

cnt_children_risk_groups

In [None]:
# Bar chart of the rate per children bin, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(cnt_children_risk_groups), list(cnt_children_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Count Children")
ax.set_xlabel('Count Children')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the bins.
cnt_children_rates = list(cnt_children_risk_groups.values())
print("The dp difference for Count Children is ", max(cnt_children_rates) - min(cnt_children_rates))

cnt_children_dp = round(min(cnt_children_rates) / max(cnt_children_rates), 2)
print("The dp ratio for Count Children is ", cnt_children_dp)

dp_ratio_features["CNT_CHILDREN"] = cnt_children_dp

In [None]:
dp_ratio_features

#### 6. AMT_INCOME_TOTAL

In [None]:
test_df["AMT_INCOME_TOTAL"].min(), test_df["AMT_INCOME_TOTAL"].max()

In [None]:
# <100,000
# 100,000 - 150,000
# 150,000 - 200,000
# 200,000 - 250,000
# >250,000

In [None]:
# Selection rate for each distinct AMT_INCOME_TOTAL value (every distinct amount is its own group).
sr_income = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["AMT_INCOME_TOTAL"])
display(sr_income.by_group)

In [None]:
# Selection rate (in whole percent) per income bracket.
# The brackets are built on the raw column so that every application falls into exactly
# one bracket (upper edges inclusive, e.g. 100,000 belongs to "<100K") and a bracket's
# rate is the mean prediction of the applications in it.
income_brackets = pd.cut(
    test_df["AMT_INCOME_TOTAL"],
    bins=[-np.inf, 100000, 150000, 200000, 250000, np.inf],
    labels=["<100K", "100K-150K", "150K-200K", "200K-250K", ">250K"],
)
income_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(income_brackets, observed=False)  # keep empty brackets (rate = NaN)
    .mean()
)

income_risk_groups = (100 * income_rates).round().to_dict()

income_risk_groups

In [None]:

# Bar chart of the rate per income bracket, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(income_risk_groups), list(income_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Income")
ax.set_xlabel('Income')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the income brackets.
income_bracket_rates = list(income_risk_groups.values())
print("The dp difference for Amount Income is ", max(income_bracket_rates) - min(income_bracket_rates))

income_dp = round(min(income_bracket_rates) / max(income_bracket_rates), 2)
# The message names the income feature (it previously said "Count Children").
print("The dp ratio for Amount Income is ", income_dp)

dp_ratio_features["AMT_INCOME_TOTAL"] = income_dp

In [None]:
dp_ratio_features

#### 7. AMT_CREDIT

In [None]:
test_df["AMT_CREDIT"].min(), test_df["AMT_CREDIT"].max()

In [None]:
# <250,000
# 250,000 - 500,000
# 500,000 - 750,000
# 750,000 - 1,000,000
# >1,000,000 

In [None]:
# Selection rate for each distinct AMT_CREDIT value (every distinct amount is its own group).
sr_credit = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["AMT_CREDIT"])
display(sr_credit.by_group)

In [None]:
len(sr_credit.by_group)

In [None]:
# Selection rate (in whole percent) per credit-amount bracket.
# The brackets are built on the raw column so that every application falls into exactly
# one bracket (upper edges inclusive) and a bracket's rate is the mean prediction of
# the applications in it.
credit_brackets = pd.cut(
    test_df["AMT_CREDIT"],
    bins=[-np.inf, 250000, 500000, 750000, 1000000, np.inf],
    labels=["<250K", "250K-500K", "500K-750K", "750K-1M", ">1M"],
)
credit_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(credit_brackets, observed=False)  # keep empty brackets (rate = NaN)
    .mean()
)

credit_risk_groups = (100 * credit_rates).round().to_dict()

credit_risk_groups

In [None]:
# Bar chart of the rate per credit bracket, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(credit_risk_groups), list(credit_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Credit")
ax.set_xlabel('Credit')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the brackets.
credit_bracket_rates = list(credit_risk_groups.values())
print("The dp difference for Credit is ", max(credit_bracket_rates) - min(credit_bracket_rates))

credit_dp = round(min(credit_bracket_rates) / max(credit_bracket_rates), 2)
print("The dp ratio for Credit is ", credit_dp)

dp_ratio_features["AMT_CREDIT"] = credit_dp

In [None]:
dp_ratio_features

#### 8. 'AMT_ANNUITY'

In [None]:
test_df["AMT_ANNUITY"].min(), test_df["AMT_ANNUITY"].max()

In [None]:
# Selection rate for each distinct AMT_ANNUITY value (every distinct amount is its own group).
sr_annunity = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["AMT_ANNUITY"])
display(sr_annunity.by_group)

In [None]:
# Selection rate (in whole percent) per annuity bracket.
# The brackets are built on the raw column so that every application falls into exactly
# one bracket (upper edges inclusive) and a bracket's rate is the mean prediction of
# the applications in it.
annuity_brackets = pd.cut(
    test_df["AMT_ANNUITY"],
    bins=[-np.inf, 10000, 25000, 50000, np.inf],
    labels=["<10K", "10K-25K", "25K-50K", ">50K"],
)
annuity_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(annuity_brackets, observed=False)  # keep empty brackets (rate = NaN)
    .mean()
)

annunity_risk_groups = (100 * annuity_rates).round().to_dict()

annunity_risk_groups

In [None]:
# Bar chart of the rate per annuity bracket, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(annunity_risk_groups), list(annunity_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Annunity")
ax.set_xlabel('Annunity')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the brackets.
annunity_bracket_rates = list(annunity_risk_groups.values())
print("The dp difference for Annunity is ", max(annunity_bracket_rates) - min(annunity_bracket_rates))

annunity_dp = round(min(annunity_bracket_rates) / max(annunity_bracket_rates), 2)
print("The dp ratio for Annunity is ", annunity_dp)

dp_ratio_features["AMT_ANNUITY"] = annunity_dp

In [None]:
dp_ratio_features

#### 9. 'AMT_GOODS_PRICE'

In [None]:
test_df["AMT_GOODS_PRICE"].min(), test_df["AMT_GOODS_PRICE"].max()

In [None]:
# <100,000
# 100,000 - 500,000
# 500,000 - 1,000,000
# >1,000,000

In [None]:
# Selection rate for each distinct AMT_GOODS_PRICE value (every distinct price is its own group).
sr_goods = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["AMT_GOODS_PRICE"])
display(sr_goods.by_group)

In [None]:
# Selection rate (in whole percent) per goods-price bracket.
# The brackets are built on the raw column so that every application falls into exactly
# one bracket (upper edges inclusive) and a bracket's rate is the mean prediction of
# the applications in it.
goods_brackets = pd.cut(
    test_df["AMT_GOODS_PRICE"],
    bins=[-np.inf, 100000, 500000, 1000000, np.inf],
    labels=["<100K", "100K-500K", "500K-1M", ">1M"],
)
goods_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(goods_brackets, observed=False)  # keep empty brackets (rate = NaN)
    .mean()
)

goods_risk_groups = (100 * goods_rates).round().to_dict()

goods_risk_groups

In [None]:
# Bar chart of the rate per goods-price bracket, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(goods_risk_groups), list(goods_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Goods Price")
ax.set_xlabel('Goods Price')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the brackets.
goods_bracket_rates = list(goods_risk_groups.values())
print("The dp difference for Goods Price is ", max(goods_bracket_rates) - min(goods_bracket_rates))

goods_dp = round(min(goods_bracket_rates) / max(goods_bracket_rates), 2)
print("The dp ratio for Goods Price is ", goods_dp)

dp_ratio_features["AMT_GOODS_PRICE"] = goods_dp

In [None]:
dp_ratio_features

#### 10. NAME_TYPE_SUITE

In [None]:
test_df["NAME_TYPE_SUITE"].unique()

In [None]:
test_df_final["NAME_TYPE_SUITE_LE"].unique()

# 0 - Children
# 1 - Family
# 2 - Group of people
# 3 - Other_A
# 4 - Other_B
# 5 - Spouse, partner
# 6 - Unaccompanied
# 7 - Nan

In [None]:
# Selection rate for each label-encoded NAME_TYPE_SUITE code, taken from test_df_final.
# NOTE(review): this relies on test_df_final rows being in the same order as the predictions — confirm.
sr_type_suite = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features= test_df_final["NAME_TYPE_SUITE_LE"])
display(sr_type_suite.by_group)

In [None]:
# Codes of NAME_TYPE_SUITE_LE:
# 0 - Children
# 1 - Family
# 2 - Group of people
# 3 - Other_A
# 4 - Other_B
# 5 - Spouse, partner
# 6 - Unaccompanied
# 7 - Nan
suite_rates = sr_type_suite.by_group

# "Others" pools Other_A (3) and Other_B (4). Its rate is the mean prediction over all
# applications in either category, so each application counts once regardless of how
# the two categories differ in size.
in_others = test_df_final["NAME_TYPE_SUITE_LE"].isin([3, 4]).to_numpy()
others_rate = np.asarray(predictions_test)[in_others].mean()

# Selection rate per displayed category, in whole percent.
type_suite_groups = {
    "Children": round(100 * suite_rates.loc[0]),
    "Family": round(100 * suite_rates.loc[1]),
    "Group of people": round(100 * suite_rates.loc[2]),
    "Others": round(100 * others_rate),
    "Spouse, partner": round(100 * suite_rates.loc[5]),
    "Unaccompanied": round(100 * suite_rates.loc[6]),
    "Unknown": round(100 * suite_rates.loc[7]),
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(type_suite_groups), list(type_suite_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs NAME_TYPE_SUITE")
ax.set_xlabel('NAME_TYPE_SUITE')
ax.set_ylabel('Risk Rate in %')
fig.autofmt_xdate()  # rotate the long tick labels so they do not overlap
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
type_suite_rates = list(type_suite_groups.values())
print("The dp difference for NAME_TYPE_SUITE is ", max(type_suite_rates) - min(type_suite_rates))

type_suite_dp = round(min(type_suite_rates) / max(type_suite_rates), 2)
print("The dp ratio for NAME_TYPE_SUITE is ", type_suite_dp)

dp_ratio_features["NAME_TYPE_SUITE"] = type_suite_dp

In [None]:
dp_ratio_features

#### 11. NAME_INCOME_TYPE

In [None]:
test_df["NAME_INCOME_TYPE"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each NAME_INCOME_TYPE value.
sr_income_type = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features= test_df["NAME_INCOME_TYPE"])
display(sr_income_type.by_group)

In [None]:
# Selection rate per income type, in whole percent (only the listed types are shown).
income_type_groups = {
    income_type: round(100 * sr_income_type.by_group.loc[income_type])
    for income_type in ("Commercial associate", "Pensioner", "State servant", "Working")
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(income_type_groups), list(income_type_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs NAME_INCOME_TYPE")
ax.set_xlabel('NAME_INCOME_TYPE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
income_type_rates = list(income_type_groups.values())
print("The dp difference for NAME_INCOME_TYPE is ", max(income_type_rates) - min(income_type_rates))

income_type_dp = round(min(income_type_rates) / max(income_type_rates), 2)
print("The dp ratio for NAME_INCOME_TYPE is ", income_type_dp)

dp_ratio_features["NAME_INCOME_TYPE"] = income_type_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 12. NAME_EDUCATION_TYPE

In [None]:
test_df["NAME_EDUCATION_TYPE"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each NAME_EDUCATION_TYPE value.
sr_education_type = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features= test_df["NAME_EDUCATION_TYPE"])
display(sr_education_type.by_group)

In [None]:
# Selection rate per education level, in whole percent (only the listed levels are shown).
education_levels = (
    "Higher education",
    "Incomplete higher",
    "Secondary / secondary special",
    "Lower secondary",
)
education_type_groups = {
    level: round(100 * sr_education_type.by_group.loc[level])
    for level in education_levels
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(education_type_groups), list(education_type_groups.values()), width=0.3, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs NAME_EDUCATION_TYPE")
ax.set_xlabel('NAME_EDUCATION_TYPE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
education_type_rates = list(education_type_groups.values())
print("The dp difference for NAME_EDUCATION_TYPE is ", max(education_type_rates) - min(education_type_rates))

education_type_dp = round(min(education_type_rates) / max(education_type_rates), 2)
print("The dp ratio for NAME_EDUCATION_TYPE is ", education_type_dp)

dp_ratio_features["NAME_EDUCATION_TYPE"] = education_type_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 13.	NAME_FAMILY_STATUS

In [None]:
test_df["NAME_FAMILY_STATUS"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each NAME_FAMILY_STATUS value.
sr_family = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features= test_df["NAME_FAMILY_STATUS"])
display(sr_family.by_group)

In [None]:
# Selection rate per family status, in whole percent.
family_statuses = (
    "Married",
    "Civil marriage",
    "Single / not married",
    "Separated",
    "Widow",
)
family_groups = {
    status: round(100 * sr_family.by_group.loc[status])
    for status in family_statuses
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(family_groups), list(family_groups.values()), width=0.3, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs NAME_FAMILY_STATUS")
ax.set_xlabel('NAME_FAMILY_STATUS')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
family_rates = list(family_groups.values())
print("The dp difference for NAME_FAMILY_STATUS is ", max(family_rates) - min(family_rates))

family_status_dp = round(min(family_rates) / max(family_rates), 2)
print("The dp ratio for NAME_FAMILY_STATUS is ", family_status_dp)

dp_ratio_features["NAME_FAMILY_STATUS"] = family_status_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 14. NAME_HOUSING_TYPE

In [None]:
test_df["NAME_HOUSING_TYPE"].unique()

In [None]:
# Selection rate (fraction of predictions equal to 1) for each NAME_HOUSING_TYPE value.
sr_housing = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features= test_df["NAME_HOUSING_TYPE"])
display(sr_housing.by_group)

In [None]:
# Selection rate per housing type, in whole percent.
# Maps the label shown on the chart to the category name used in the data.
housing_categories = {
    "Co-op apartment": "Co-op apartment",
    "House/apartment": "House / apartment",
    "Municipal apartment": "Municipal apartment",
    "Office apartment": "Office apartment",
    "Rented apartment": "Rented apartment",
    "With parents": "With parents",
}
housing_groups = {
    label: round(100 * sr_housing.by_group.loc[category])
    for label, category in housing_categories.items()
}

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(housing_groups), list(housing_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # print the value on top of each bar
ax.set_title("Risk Rate vs NAME_HOUSING_TYPE")
ax.set_xlabel('NAME_HOUSING_TYPE')
ax.set_ylabel('Risk Rate in %')
fig.autofmt_xdate()  # rotate the long tick labels so they do not overlap
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the groups.
housing_rates = list(housing_groups.values())
print("The dp difference for NAME_HOUSING_TYPE is ", max(housing_rates) - min(housing_rates))

housing_status_dp = round(min(housing_rates) / max(housing_rates), 2)
print("The dp ratio for NAME_HOUSING_TYPE is ", housing_status_dp)

dp_ratio_features["NAME_HOUSING_TYPE"] = housing_status_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 15. 'REGION_POPULATION_RELATIVE'

In [None]:
test_df["REGION_POPULATION_RELATIVE"].min(), test_df["REGION_POPULATION_RELATIVE"].max()

In [None]:
# <0.01
# 0.01 - 0.020
# 0.020 - 0.030
# >0.030

In [None]:
# Selection rate for each distinct REGION_POPULATION_RELATIVE value (every distinct value is its own group).
sr_region_population = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REGION_POPULATION_RELATIVE"])
display(sr_region_population.by_group)

In [None]:
# Selection rate (in whole percent) per region-population bin.
# The bins are built on the raw column so that every application falls into exactly one
# bin (upper edges inclusive) and a bin's rate is the mean prediction of the
# applications in it.
region_population_bins = pd.cut(
    test_df["REGION_POPULATION_RELATIVE"],
    bins=[-np.inf, 0.01, 0.02, 0.03, np.inf],
    labels=["<0.01", "0.01-0.02", "0.02-0.03", ">0.03"],
)
region_population_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(region_population_bins, observed=False)  # keep empty bins (rate = NaN)
    .mean()
)

region_population_risk_groups = (100 * region_population_rates).round().to_dict()

region_population_risk_groups

In [None]:
# Bar chart of the rate per region-population bin, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(region_population_risk_groups), list(region_population_risk_groups.values()), width=0.4, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Region Population")
ax.set_xlabel('Region Population')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the bins.
region_population_bin_rates = list(region_population_risk_groups.values())
print("The dp difference for Region Population is ", max(region_population_bin_rates) - min(region_population_bin_rates))

region_population_dp = round(min(region_population_bin_rates) / max(region_population_bin_rates), 2)
print("The dp ratio for Region Population is ", region_population_dp)

dp_ratio_features["REGION_POPULATION_RELATIVE"] = region_population_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 16. AGE

In [None]:
test_df["AGE"].min(), test_df["AGE"].max()

In [None]:
# Selection rate for each distinct AGE value (every distinct age is its own group).
sr_age = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["AGE"])
display(sr_age.by_group)

In [None]:
# Selection rate (in whole percent) per age group.
# The groups are built on the raw column so that every application falls into exactly
# one group (upper edges inclusive, no gaps for fractional ages) and a group's rate is
# the mean prediction of the applications in it.
age_bins = pd.cut(
    test_df["AGE"],
    bins=[-np.inf, 25, 40, 60, np.inf],
    labels=["Young Adults (18-25)", "Adults (26-40)", "Middle Age Adults (41 - 60)", "Older Adults (60+)"],
)
age_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(age_bins, observed=False)  # keep empty groups (rate = NaN)
    .mean()
)

age_risk_groups = (100 * age_rates).round().to_dict()

In [None]:
# Bar chart of the rate per age group, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(age_risk_groups), list(age_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs Age Group")
ax.set_xlabel('Age Group')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the age groups.
age_group_rates = list(age_risk_groups.values())
print("The dp difference for Age is ", max(age_group_rates) - min(age_group_rates))

age_dp = round(min(age_group_rates) / max(age_group_rates), 2)
print("The dp ratio for Age is ", age_dp)

dp_ratio_features["AGE"] = age_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 17. YEARS_EMPLOYED

In [None]:
test_df["YEARS_EMPLOYED"].min(), test_df["YEARS_EMPLOYED"].max()

In [None]:
# <10 
# 10 - 20
# 20 - 30 
# >30

In [None]:
# Selection rate for each distinct YEARS_EMPLOYED value (every distinct value is its own group).
# NOTE(review): the recoding step turns -1001 into NaN in this column; confirm how
# MetricFrame treats rows whose sensitive feature is missing.
sr_employed = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["YEARS_EMPLOYED"])

display(sr_employed.by_group)

In [None]:
print(len(sr_employed.by_group))

In [None]:
# Selection rate (in whole percent) per years-employed bin.
# The bins are built on the raw column so that every application with a known value
# falls into exactly one bin (upper edges inclusive, no gaps for fractional years) and
# a bin's rate is the mean prediction of the applications in it. Rows with a missing
# YEARS_EMPLOYED are not assigned to any bin.
years_employed_bins = pd.cut(
    test_df["YEARS_EMPLOYED"],
    bins=[-np.inf, 10, 20, 30, np.inf],
    labels=["<10", "10 - 20", "20 - 30", ">30"],
)
years_employed_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(years_employed_bins, observed=False)  # keep empty bins (rate = NaN)
    .mean()
)

years_employed_risk_groups = (100 * years_employed_rates).round().to_dict()

years_employed_risk_groups

In [None]:
# Bar chart of the rate per years-employed bin, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(years_employed_risk_groups), list(years_employed_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs YEARS_EMPLOYED")
ax.set_xlabel('YEARS_EMPLOYED')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the bins.
years_employed_bin_rates = list(years_employed_risk_groups.values())
print("The dp difference for YEARS_EMPLOYED is ", max(years_employed_bin_rates) - min(years_employed_bin_rates))

employement_dp = round(min(years_employed_bin_rates) / max(years_employed_bin_rates), 2)
print("The dp ratio for YEARS_EMPLOYED is ", employement_dp)

dp_ratio_features["YEARS_EMPLOYED"] = employement_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 18. YEARS_REGISTRATION

In [None]:
test_df["YEARS_REGISTRATION"].min(), test_df["YEARS_REGISTRATION"].max()

In [None]:
# <10 
# 10 - 20
# 20 - 30 
# >30

In [None]:
# Selection rate for each distinct YEARS_REGISTRATION value (every distinct value is its own group).
sr_registration = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["YEARS_REGISTRATION"])

display(sr_registration.by_group)

In [None]:
# Selection rate (in whole percent) per years-since-registration bin.
# The bins are built on the raw column so that every application falls into exactly one
# bin (upper edges inclusive, no gaps for fractional years) and a bin's rate is the
# mean prediction of the applications in it.
years_registration_bins = pd.cut(
    test_df["YEARS_REGISTRATION"],
    bins=[-np.inf, 10, 20, 30, np.inf],
    labels=["<10", "10 - 20", "20 - 30", ">30"],
)
years_registration_rates = (
    pd.Series(predictions_test, index=test_df.index)
    .groupby(years_registration_bins, observed=False)  # keep empty bins (rate = NaN)
    .mean()
)

years_registration_risk_groups = (100 * years_registration_rates).round().to_dict()

years_registration_risk_groups

In [None]:
# Bar chart of the rate per years-since-registration bin, each bar labelled with its value.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure

bars = ax.bar(list(years_registration_risk_groups), list(years_registration_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)
ax.set_title("Risk Rate vs YEARS_REGISTRATION")
ax.set_xlabel('YEARS_REGISTRATION')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity difference (max - min) and ratio (min / max) across the bins.
years_registration_bin_rates = list(years_registration_risk_groups.values())
print("The dp difference for YEARS_REGISTRATION is ", max(years_registration_bin_rates) - min(years_registration_bin_rates))

registration_dp = round(min(years_registration_bin_rates) / max(years_registration_bin_rates), 2)
print("The dp ratio for YEARS_REGISTRATION is ", registration_dp)

dp_ratio_features["YEARS_REGISTRATION"] = registration_dp

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 19. YEARS_ID_PUBLISH

In [None]:
test_df["YEARS_ID_PUBLISH"].min(), test_df["YEARS_ID_PUBLISH"].max()

In [None]:
# <5
# 5-10
# >10

In [None]:
# Selection rate for each distinct YEARS_ID_PUBLISH value (every distinct value is its own group).
sr_id_publish = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["YEARS_ID_PUBLISH"])
display(sr_id_publish.by_group)

In [None]:
lessthan_5 = sr_id_publish.by_group.loc[:5]
btw_5_10 = sr_id_publish.by_group.loc[6:10]
gt_10 = sr_id_publish.by_group.loc[11:]

lessthan_5_risk_rate = 100*round(np.mean(lessthan_5),2)
btw_5_10_risk_rate = 100*round(np.mean(btw_5_10),2)
gt_10_risk_rate = 100*round(np.mean(gt_10),2)


id_publish_risk_groups= {"<5" : lessthan_5_risk_rate, "5-10" : btw_5_10_risk_rate, ">10": gt_10_risk_rate}
id_publish_risk_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(id_publish_risk_groups.keys(),id_publish_risk_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs YEARS_ID_PUBLISH")
plt.xlabel('YEARS_ID_PUBLISH')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for YEARS_ID_PUBLISH is ", max(id_publish_risk_groups.values()) - min(id_publish_risk_groups.values()))

id_publish_dp = round(min(id_publish_risk_groups.values()) / max(id_publish_risk_groups.values()),2)
print("The dp ratio for YEARS_ID_PUBLISH is ", id_publish_dp)

dp_ratio_features.update({"YEARS_ID_PUBLISH":id_publish_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 20. FLAG_MOBIL

In [None]:
sr_mobile = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_MOBIL"])
# display(sr.overall)
display(sr_mobile.by_group)

In [None]:
# print("The dp difference for FLAG_OWN_REALTY is ", max(realty_risk_groups.values()) - min(realty_risk_groups.values()))

# realty_dp = round(min(realty_risk_groups.values()) / max(realty_risk_groups.values()),2)
# print("The dp ratio for FLAG_OWN_REALTY is ", realty_dp)

dp_ratio_features.update({"FLAG_MOBIL": 0.315})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 21. FLAG_EMP_PHONE

In [None]:
test_df["FLAG_EMP_PHONE"].unique()

In [None]:
sr_emp_mobile = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_EMP_PHONE"])
# display(sr.overall)
display(sr_emp_mobile.by_group)

In [None]:
emp_mobile_groups= {"No" : round(100*sr_emp_mobile.by_group.loc["No"]), 
                           "Yes" : round(100*sr_emp_mobile.by_group.loc["Yes"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(emp_mobile_groups.keys(),emp_mobile_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs FLAG_EMP_PHONE")
plt.xlabel('FLAG_EMP_PHONE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for FLAG_EMP_PHONE is ", max(emp_mobile_groups.values()) - min(emp_mobile_groups.values()))

emp_phone_dp = round(min(emp_mobile_groups.values()) / max(emp_mobile_groups.values()),2)
print("The dp ratio for FLAG_EMP_PHONE is ", emp_phone_dp)

dp_ratio_features.update({"FLAG_EMP_PHONE": emp_phone_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 22. FLAG_WORK_PHONE

In [None]:
test_df["FLAG_WORK_PHONE"].unique()

In [None]:
sr_work_phone = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_WORK_PHONE"])
# display(sr.overall)
display(sr_work_phone.by_group)

In [None]:
work_phone_groups= {"No" : round(100*sr_work_phone.by_group.loc["No"]), 
                           "Yes" : round(100*sr_work_phone.by_group.loc["Yes"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(work_phone_groups.keys(),work_phone_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs FLAG_WORK_PHONE")
plt.xlabel('FLAG_WORK_PHONE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for FLAG_WORK_PHONE is ", max(work_phone_groups.values()) - min(work_phone_groups.values()))

work_phone_dp = round(min(work_phone_groups.values()) / max(work_phone_groups.values()),2)
print("The dp ratio for FLAG_WORK_PHONE is ", work_phone_dp)

dp_ratio_features.update({"FLAG_WORK_PHONE": work_phone_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 23. FLAG_CONT_MOBILE

In [None]:
test_df["FLAG_CONT_MOBILE"].unique()

In [None]:
sr_cont_mobile = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_CONT_MOBILE"])
# display(sr.overall)
display(sr_cont_mobile.by_group)

In [None]:
cont_mobile_groups= {"No" : round(100*sr_cont_mobile.by_group.loc["No"]), 
                           "Yes" : round(100*sr_cont_mobile.by_group.loc["Yes"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(cont_mobile_groups.keys(),cont_mobile_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs FLAG_CONT_MOBILE")
plt.xlabel('FLAG_CONT_MOBILE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for FLAG_CONT_MOBILE is ", max(cont_mobile_groups.values()) - min(cont_mobile_groups.values()))

cont_mobile_dp = round(min(cont_mobile_groups.values()) / max(cont_mobile_groups.values()),2)
print("The dp ratio for FLAG_CONT_MOBILE is ", cont_mobile_dp)

dp_ratio_features.update({"FLAG_CONT_MOBILE": cont_mobile_dp})

In [None]:
dp_ratio_features.update({"FLAG_MOBIL": 0.32})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 24. FLAG_PHONE

In [None]:
test_df["FLAG_PHONE"].unique()

In [None]:
sr_phone = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_PHONE"])
# display(sr.overall)
display(sr_phone.by_group)

In [None]:
phone_groups= {"No" : round(100*sr_phone.by_group.loc["No"]), 
                           "Yes" : round(100*sr_phone.by_group.loc["Yes"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(phone_groups.keys(),phone_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs FLAG_PHONE")
plt.xlabel('FLAG_PHONE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for FLAG_PHONE is ", max(phone_groups.values()) - min(phone_groups.values()))

phone_dp = round(min(phone_groups.values()) / max(phone_groups.values()),2)
print("The dp ratio for FLAG_PHONE is ", phone_dp)

dp_ratio_features.update({"FLAG_PHONE": phone_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 25. FLAG_EMAIL

In [None]:
test_df["FLAG_EMAIL"].unique()

In [None]:
sr_email = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["FLAG_EMAIL"])

display(sr_email.by_group)

In [None]:
email_groups= {"No" : round(100*sr_email.by_group.loc["No"]), 
                           "Yes" : round(100*sr_email.by_group.loc["Yes"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(email_groups.keys(),email_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs FLAG_EMAIL")
plt.xlabel('FLAG_EMAIL')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for FLAG_EMAIL is ", max(email_groups.values()) - min(email_groups.values()))

email_dp = round(min(email_groups.values()) / max(email_groups.values()),2)
print("The dp ratio for FLAG_EMAIL is ", email_dp)

dp_ratio_features.update({"FLAG_EMAIL": email_dp})

In [None]:
# del dp_ratio_features["email_dp"]

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 26. OCCUPATION_TYPE

In [None]:
# Treat a missing occupation as its own "Unknown" category.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a Series pulled out of the frame only updates test_df on some pandas
# versions (it does not under Copy-on-Write), which made the later
# test_df["OCCUPATION_TYPE"] inspection version-dependent.
test_df["OCCUPATION_TYPE"] = test_df["OCCUPATION_TYPE"].fillna("Unknown")
test_df_fornull = test_df["OCCUPATION_TYPE"]

In [None]:
sr_occupation = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df_fornull)

display(sr_occupation.by_group)

In [None]:
test_df["OCCUPATION_TYPE"].unique()

In [None]:
occupation_groups= {"Accountants" : round(100*sr_occupation.by_group.loc["Accountants"]), 
                    "Cleaning staff" : round(100*sr_occupation.by_group.loc["Cleaning staff"]),
                   "Cooking staff" : round(100*sr_occupation.by_group.loc["Cooking staff"]),
                   "Core staff" : round(100*sr_occupation.by_group.loc["Core staff"]),
                    "Drivers" : round(100*sr_occupation.by_group.loc["Drivers"]),
                   "HR staff" : round(100*sr_occupation.by_group.loc["HR staff"]),
                   "High skill tech staff" : round(100*sr_occupation.by_group.loc["High skill tech staff"]),
                   "IT staff" : round(100*sr_occupation.by_group.loc["IT staff"]),
                   "Laborers" : round(100*sr_occupation.by_group.loc["Laborers"]),
                   "Low-skill Laborers" : round(100*sr_occupation.by_group.loc["Low-skill Laborers"]),
                   "Managers" : round(100*sr_occupation.by_group.loc["Managers"]),
                   "Medicine staff" : round(100*sr_occupation.by_group.loc["Medicine staff"]),
                   "Private service staff" : round(100*sr_occupation.by_group.loc["Private service staff"]),
                   "Realty agents" : round(100*sr_occupation.by_group.loc["Realty agents"]),
                   "Sales staff" : round(100*sr_occupation.by_group.loc["Sales staff"]),
                   "Secretaries" : round(100*sr_occupation.by_group.loc["Secretaries"]),
                   "Security staff" : round(100*sr_occupation.by_group.loc["Security staff"]),
                   "Waiters/barmen staff" : round(100*sr_occupation.by_group.loc["Waiters/barmen staff"]),
#                     "Unknown" : round(100*sr_occupation.by_group.loc["Unknown"])
                   }

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(occupation_groups.keys(),occupation_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs OCCUPATION_TYPE")
plt.xlabel('OCCUPATION_TYPE')
plt.ylabel('Risk Rate in %')
fig.autofmt_xdate()
plt.show()

In [None]:
print("The dp difference for OCCUPATION_TYPE is ", max(occupation_groups.values()) - min(occupation_groups.values()))

occupation_dp = round(min(occupation_groups.values()) / max(occupation_groups.values()),2)
print("The dp ratio for OCCUPATION_TYPE is ", occupation_dp)

dp_ratio_features.update({"OCCUPATION_TYPE": occupation_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 27. CNT_FAM_MEMBERS

In [None]:
test_df["CNT_FAM_MEMBERS"].min(), test_df["CNT_FAM_MEMBERS"].max()

In [None]:
# <=2
# 3-5
# >=5

In [None]:
sr_family_members = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["CNT_FAM_MEMBERS"])
display(sr_family_members.by_group)

In [None]:
lessthan_2 = sr_family_members.by_group.loc[:2]
btw_3_5 = sr_family_members.by_group.loc[3:4]
gt_5 = sr_family_members.by_group.loc[5:]

lessthan_2_risk_rate = 100*round(np.mean(lessthan_2),2)
btw_3_5_risk_rate = 100*round(np.mean(btw_3_5),2)
gt_5_risk_rate = 100*round(np.mean(gt_5),2)


family_members_risk_groups= {"<=2" : lessthan_2_risk_rate, "3-5" : btw_3_5_risk_rate, ">=5": gt_5_risk_rate}
family_members_risk_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(family_members_risk_groups.keys(),family_members_risk_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs CNT_FAM_MEMBERS")
plt.xlabel('CNT_FAM_MEMBERS')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for CNT_FAM_MEMBERS is ", max(family_members_risk_groups.values()) - min(family_members_risk_groups.values()))

fam_mem_dp = round(min(family_members_risk_groups.values()) / max(family_members_risk_groups.values()),2)
print("The dp ratio for CNT_FAM_MEMBERS is ", fam_mem_dp)

dp_ratio_features.update({"CNT_FAM_MEMBERS":fam_mem_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 28. REGION_RATING_CLIENT

In [None]:
test_df["REGION_RATING_CLIENT"].unique()

In [None]:
sr_region_rating = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REGION_RATING_CLIENT"])

display(sr_region_rating.by_group)

In [None]:
region_rating_groups= {"1" : round(100*sr_region_rating.by_group.loc[1]), 
                        "2" : round(100*sr_region_rating.by_group.loc[2]),
                        "3" : round(100*sr_region_rating.by_group.loc[3])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(region_rating_groups.keys(),region_rating_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REGION_RATING_CLIENT")
plt.xlabel('REGION_RATING_CLIENT')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REGION_RATING_CLIENT is ", max(region_rating_groups.values()) - min(region_rating_groups.values()))

region_rating_dp = round(min(region_rating_groups.values()) / max(region_rating_groups.values()),2)
print("The dp ratio for REGION_RATING_CLIENT is ", region_rating_dp)

dp_ratio_features.update({"REGION_RATING_CLIENT": region_rating_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 29. REGION_RATING_CLIENT_W_CITY

In [None]:
test_df["REGION_RATING_CLIENT_W_CITY"].unique()

In [None]:
sr_region_rating_w_city = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REGION_RATING_CLIENT_W_CITY"])

display(sr_region_rating_w_city.by_group)

In [None]:
region_rating_w_city_groups= {"1" : round(100*sr_region_rating_w_city.by_group.loc[1]), 
                        "2" : round(100*sr_region_rating_w_city.by_group.loc[2]),
                        "3" : round(100*sr_region_rating_w_city.by_group.loc[3])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(region_rating_w_city_groups.keys(),region_rating_w_city_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REGION_RATING_CLIENT_W_CITY")
plt.xlabel('REGION_RATING_CLIENT_W_CITY')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REGION_RATING_CLIENT_W_CITY is ", max(region_rating_w_city_groups.values()) - min(region_rating_w_city_groups.values()))

region_rating_w_city_dp = round(min(region_rating_w_city_groups.values()) / max(region_rating_w_city_groups.values()),2)
print("The dp ratio for REGION_RATING_CLIENT_W_CITY is ", region_rating_w_city_dp)

dp_ratio_features.update({"REGION_RATING_CLIENT_W_CITY": region_rating_w_city_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 30. WEEKDAY_APPR_PROCESS_START

In [None]:
test_df["WEEKDAY_APPR_PROCESS_START"].unique()

In [None]:
sr_weekday = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["WEEKDAY_APPR_PROCESS_START"])

display(sr_weekday.by_group)

In [None]:
weekday_groups= {"SUNDAY" : round(100*sr_weekday.by_group.loc["SUNDAY"]),
                 "MONDAY" : round(100*sr_weekday.by_group.loc["MONDAY"]), 
                "TUESDAY" : round(100*sr_weekday.by_group.loc["TUESDAY"]),
                "WEDNESDAY" : round(100*sr_weekday.by_group.loc["WEDNESDAY"]),
                "THURSDAY" : round(100*sr_weekday.by_group.loc["THURSDAY"]),
                "FRIDAY" : round(100*sr_weekday.by_group.loc["FRIDAY"]),
                "SATURDAY" : round(100*sr_weekday.by_group.loc["SATURDAY"])
                   }

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(weekday_groups.keys(),weekday_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs WEEKDAY_APPR_PROCESS_START")
plt.xlabel('WEEKDAY_APPR_PROCESS_START')
plt.ylabel('Risk Rate in %')
fig.autofmt_xdate()
plt.show()

In [None]:
print("The dp difference for WEEKDAY_APPR_PROCESS_START is ", max(weekday_groups.values()) - min(weekday_groups.values()))

weekday_dp = round(min(weekday_groups.values()) / max(weekday_groups.values()),2)
print("The dp ratio for WEEKDAY_APPR_PROCESS_START is ", weekday_dp)

dp_ratio_features.update({"WEEKDAY_APPR_PROCESS_START": weekday_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 31. HOUR_APPR_PROCESS_START

In [None]:
test_df["HOUR_APPR_PROCESS_START"].min(), test_df["HOUR_APPR_PROCESS_START"].max()

In [None]:

# Before 9 am (Before Office Hours) : 0-9
# 9am to 1 pm (Office Hours: Morning): 9-13
# 1pm to 5 pm(Office Hours:Afternoon): 13 - 17   
# 5pm - 9 am (After Office Hours): >17
# <=2
# 3-5
# >=5

In [None]:
sr_hour_application = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["HOUR_APPR_PROCESS_START"])
display(sr_hour_application.by_group)

In [None]:
display(len(sr_hour_application.by_group))

In [None]:
before_9 = sr_hour_application.by_group.loc[0:8]
btw_9_13 = sr_hour_application.by_group.loc[9:13]
btw_13_17 = sr_hour_application.by_group.loc[14:17]
after_17 = sr_hour_application.by_group.loc[18:24]

# display(len(before_9))
# display(len(btw_9_13))
# display(len(btw_13_17))
# display(len(after_17))

before_9_risk_rate = 100*round(np.mean(before_9),2)
btw_9_13_risk_rate = 100*round(np.mean(btw_9_13),2)
btw_13_17_risk_rate = 100*round(np.mean(btw_13_17),2)
after_17_risk_rate = 100*round(np.mean(after_17),2)


hour_application_risk_groups= {"Midnight to 9 am (Before Office Hours) : 0-9" : before_9_risk_rate, 
                               "9am to 1 pm (Office Hours: Morning): 9-13" : btw_9_13_risk_rate, 
                               "1pm to 5 pm(Office Hours:Afternoon): 13 - 17": btw_13_17_risk_rate,
                                "5pm - Midnight (After Office Hours): >17": after_17_risk_rate}
hour_application_risk_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(hour_application_risk_groups.keys(),hour_application_risk_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs HOUR_APPR_PROCESS_START")
plt.xlabel('HOUR_APPR_PROCESS_START')
plt.ylabel('Risk Rate in %')
fig.autofmt_xdate()
plt.show()

In [None]:
print("The dp difference for HOUR_APPR_PROCESS_START is ", max(hour_application_risk_groups.values()) - min(hour_application_risk_groups.values()))

hour_application_dp = round(min(hour_application_risk_groups.values()) / max(hour_application_risk_groups.values()),2)
print("The dp ratio for HOUR_APPR_PROCESS_START is ", hour_application_dp)

dp_ratio_features.update({"HOUR_APPR_PROCESS_START":hour_application_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 32. REG_REGION_NOT_LIVE_REGION

In [None]:
# Distinct REG_REGION_NOT_LIVE_REGION values in test_df, the frame actually
# passed to MetricFrame in this section (it carries the "Same"/"Different" labels).
test_df["REG_REGION_NOT_LIVE_REGION"].unique()

In [None]:
sr_region_live = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REG_REGION_NOT_LIVE_REGION"])

display(sr_region_live.by_group)

In [None]:
region_live_groups= {"Same" : round(100*sr_region_live.by_group.loc["Same"]), 
                           "Different" : round(100*sr_region_live.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(region_live_groups.keys(),region_live_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REG_REGION_NOT_LIVE_REGION")
plt.xlabel('REG_REGION_NOT_LIVE_REGION')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REG_REGION_NOT_LIVE_REGION is ", max(region_live_groups.values()) - min(region_live_groups.values()))

region_live_dp = round(min(region_live_groups.values()) / max(region_live_groups.values()),2)
print("The dp ratio for REG_REGION_NOT_LIVE_REGION is ", region_live_dp)

# dp_ratio_features.update({"email_dp": email_dp})
dp_ratio_features.update({"REG_REGION_NOT_LIVE_REGION": region_live_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 33. REG_REGION_NOT_WORK_REGION

In [None]:
test_df["REG_REGION_NOT_WORK_REGION"].unique()

In [None]:
sr_region_work = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REG_REGION_NOT_WORK_REGION"])

display(sr_region_work.by_group)

In [None]:
region_work_groups= {"Same" : round(100*sr_region_work.by_group.loc["Same"]), 
                           "Different" : round(100*sr_region_work.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(region_work_groups.keys(),region_work_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REG_REGION_NOT_WORK_REGION")
plt.xlabel('REG_REGION_NOT_WORK_REGION')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REG_REGION_NOT_WORK_REGION is ", max(region_work_groups.values()) - min(region_work_groups.values()))

region_work_dp = round(min(region_work_groups.values()) / max(region_work_groups.values()),2)
print("The dp ratio for REG_REGION_NOT_WORK_REGION is ", region_work_dp)

dp_ratio_features.update({"REG_REGION_NOT_WORK_REGION": region_work_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 34. LIVE_REGION_NOT_WORK_REGION

In [None]:
test_df["LIVE_REGION_NOT_WORK_REGION"].unique()

In [None]:
sr_live_work = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["LIVE_REGION_NOT_WORK_REGION"])

display(sr_live_work.by_group)

In [None]:
live_work_groups= {"Same" : round(100*sr_live_work.by_group.loc["Same"]), 
                           "Different" : round(100*sr_live_work.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(live_work_groups.keys(),live_work_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs LIVE_REGION_NOT_WORK_REGION")
plt.xlabel('LIVE_REGION_NOT_WORK_REGION')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for LIVE_REGION_NOT_WORK_REGION is ", max(live_work_groups.values()) - min(live_work_groups.values()))

live_work_dp = round(min(live_work_groups.values()) / max(live_work_groups.values()),2)
print("The dp ratio for LIVE_REGION_NOT_WORK_REGION is ", live_work_dp)

dp_ratio_features.update({"LIVE_REGION_NOT_WORK_REGION": live_work_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 35. REG_CITY_NOT_LIVE_CITY

In [None]:
test_df["REG_CITY_NOT_LIVE_CITY"].unique()

In [None]:
sr_city_live = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REG_CITY_NOT_LIVE_CITY"])

display(sr_city_live.by_group)

In [None]:
city_live_groups= {"Same" : round(100*sr_city_live.by_group.loc["Same"]), 
                           "Different" : round(100*sr_city_live.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(city_live_groups.keys(),city_live_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REG_CITY_NOT_LIVE_CITY")
plt.xlabel('REG_CITY_NOT_LIVE_CITY')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REG_CITY_NOT_LIVE_CITY is ", max(city_live_groups.values()) - min(city_live_groups.values()))

city_live_dp = round(min(city_live_groups.values()) / max(city_live_groups.values()),2)
print("The dp ratio for REG_CITY_NOT_LIVE_CITY is ", city_live_dp)

dp_ratio_features.update({"REG_CITY_NOT_LIVE_CITY": city_live_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 36. REG_CITY_NOT_WORK_CITY

In [None]:
test_df["REG_CITY_NOT_WORK_CITY"].unique()

In [None]:
sr_city_work = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["REG_CITY_NOT_WORK_CITY"])

display(sr_city_work.by_group)

In [None]:
city_work_groups= {"Same" : round(100*sr_city_work.by_group.loc["Same"]), 
                           "Different" : round(100*sr_city_work.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(city_work_groups.keys(),city_work_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs REG_CITY_NOT_WORK_CITY")
plt.xlabel('REG_CITY_NOT_WORK_CITY')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for REG_CITY_NOT_WORK_CITY is ", max(city_work_groups.values()) - min(city_work_groups.values()))

city_work_dp = round(min(city_work_groups.values()) / max(city_work_groups.values()),2)
print("The dp ratio for REG_CITY_NOT_WORK_CITY is ", city_work_dp)

dp_ratio_features.update({"REG_CITY_NOT_WORK_CITY": city_work_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 37. LIVE_CITY_NOT_WORK_CITY

In [None]:
test_df["LIVE_CITY_NOT_WORK_CITY"].unique()

In [None]:
sr_live_work_city = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["LIVE_CITY_NOT_WORK_CITY"])

display(sr_live_work_city.by_group)

In [None]:
live_work_city_groups= {"Same" : round(100*sr_live_work_city.by_group.loc["Same"]), 
                           "Different" : round(100*sr_live_work_city.by_group.loc["Different"])}

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(live_work_city_groups.keys(),live_work_city_groups.values(), width=0.4, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs LIVE_CITY_NOT_WORK_CITY")
plt.xlabel('LIVE_CITY_NOT_WORK_CITY')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for LIVE_CITY_NOT_WORK_CITY is ", max(live_work_city_groups.values()) - min(live_work_city_groups.values()))

live_work_city_dp = round(min(live_work_city_groups.values()) / max(live_work_city_groups.values()),2)
print("The dp ratio for LIVE_CITY_NOT_WORK_CITY is ", live_work_city_dp)

dp_ratio_features.update({"LIVE_CITY_NOT_WORK_CITY": live_work_city_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 38. ORGANIZATION_TYPE

In [None]:
test_df["ORGANIZATION_TYPE"].unique()

In [None]:
sr_organisation = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test, sensitive_features=test_df["ORGANIZATION_TYPE"])

display(sr_organisation.by_group)

In [None]:
# Groups

Education = ['Kindergarten', 'School' , 'University']
Industry = ["Industry: type 1", "Industry: type 11", "Industry: type 12", "Industry: type 3", 
            "Industry: type 4", "Industry: type 5", "Industry: type 7", "Industry: type 9"]
Buisness_Entity = ["Business Entity Type 1","Business Entity Type 2","Business Entity Type 3"]
Trade= ["Trade: type 2","Trade: type 3","Trade: type 6","Trade: type 7"]
Transport= ["Transport: type 1","Transport: type 2","Transport: type 3", "Transport: type 4"]
Public_Sector=["Government", "Housing", "Military", "Police", "Postal", 'Security Ministries', "Medicine"]
Private_Sector= ['Advertising', 'Security', "Agriculture", "Electricity", "Hotel", "Mobile", "Restaurant", "Self-employed","Telecom"]
Unknown=["XNA"]
Others = ["Other", "Services", "Bank", "Construction", "Emergency", "Legal Services"]


In [None]:
print(len(sr_organisation.by_group))

In [None]:
Education_group = sr_organisation.by_group.loc[Education]
Industry_group = sr_organisation.by_group.loc[Industry]
Buisness_Entity_group = sr_organisation.by_group.loc[Buisness_Entity]
Trade_group = sr_organisation.by_group.loc[Trade]
Transport_group = sr_organisation.by_group.loc[Transport]
Public_Sector_group = sr_organisation.by_group.loc[Public_Sector]
Private_Sector_group = sr_organisation.by_group.loc[Private_Sector]
Others_group = sr_organisation.by_group.loc[Others]
Unknown_group = sr_organisation.by_group.loc[Unknown]


Education_risk_rate = 100*round(np.mean(Education_group),2)
Industry_risk_rate = 100*round(np.mean(Industry_group),2)
Buisness_Entity_risk_rate = 100*round(np.mean(Buisness_Entity_group),2)

Trade_risk_rate = 100*round(np.mean(Trade_group),2)
Transport_risk_rate = 100*round(np.mean(Transport_group),2)
Public_Sector_risk_rate = 100*round(np.mean(Public_Sector_group),2)
Private_Sector_risk_rate = 100*round(np.mean(Private_Sector_group),2)
Others_risk_rate = 100*round(np.mean(Others_group),2)
Unknown_risk_rate = 100*round(np.mean(Unknown_group),2)



organisation_groups= {"Education" : Education_risk_rate, 
                    "Industry" :Industry_risk_rate ,
                   "Buisness Entity" : Buisness_Entity_risk_rate,
                    "Trade" :round(Trade_risk_rate),
                   "Transport" : Transport_risk_rate,
                   "Public Sector" : Public_Sector_risk_rate,
                   "Private Sector" : Private_Sector_risk_rate,
                   "Others" : Others_risk_rate,
                "XNA" : Unknown_risk_rate,
                   }

organisation_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(organisation_groups.keys(),organisation_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs ORGANIZATION_TYPE")
plt.xlabel('ORGANIZATION_TYPE')
plt.ylabel('Risk Rate in %')
fig.autofmt_xdate()
plt.show()

In [None]:
print("The dp difference for ORGANIZATION_TYPE is ", max(organisation_groups.values()) - min(organisation_groups.values()))

organisation_dp = round(min(organisation_groups.values()) / max(organisation_groups.values()),2)
print("The dp ratio for ORGANIZATION_TYPE is ", organisation_dp)

dp_ratio_features.update({"ORGANIZATION_TYPE": organisation_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 39. OBS_30_CNT_SOCIAL_CIRCLE

In [None]:
test_df["OBS_30_CNT_SOCIAL_CIRCLE"].min(), test_df["OBS_30_CNT_SOCIAL_CIRCLE"].max()

In [None]:
# <=5
# 6-10
# >10

In [None]:
sr_OBS_30 = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["OBS_30_CNT_SOCIAL_CIRCLE"])
display(sr_OBS_30.by_group)

In [None]:
lessthan_5 = sr_OBS_30.by_group.loc[:5]
btw_6_10 = sr_OBS_30.by_group.loc[6:9]
gt_10 = sr_OBS_30.by_group.loc[10:]

lessthan_5_risk_rate = 100*round(np.mean(lessthan_5),2)
btw_6_10_risk_rate = 100*round(np.mean(btw_6_10),2)
gt_10_risk_rate = 100*round(np.mean(gt_10),2)


OBS_30_risk_groups= {"<=5" : lessthan_5_risk_rate, "5-9" : btw_6_10_risk_rate, ">=10": gt_10_risk_rate}
OBS_30_risk_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(OBS_30_risk_groups.keys(),OBS_30_risk_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs OBS_30_CNT_SOCIAL_CIRCLE")
plt.xlabel('OBS_30_CNT_SOCIAL_CIRCLE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
print("The dp difference for OBS_30_CNT_SOCIAL_CIRCLE is ", max(OBS_30_risk_groups.values()) - min(OBS_30_risk_groups.values()))

OBS_30_dp = round(min(OBS_30_risk_groups.values()) / max(OBS_30_risk_groups.values()),2)
print("The dp ratio for OBS_30_CNT_SOCIAL_CIRCLE is ", OBS_30_dp)

dp_ratio_features.update({"OBS_30_CNT_SOCIAL_CIRCLE":OBS_30_dp})

In [None]:
display(dp_ratio_features)
len(dp_ratio_features)

#### 40. DEF_30_CNT_SOCIAL_CIRCLE

In [None]:
test_df["DEF_30_CNT_SOCIAL_CIRCLE"].unique()

In [None]:
sr_DEF_30 = MetricFrame(metrics=selection_rate, y_true=predictions_test, y_pred=predictions_test,sensitive_features=test_df["DEF_30_CNT_SOCIAL_CIRCLE"])
display(sr_DEF_30.by_group)

In [None]:
# Groups
# 0-2
# =>2

In [None]:
lessthan_2 = sr_DEF_30.by_group.loc[0:1]
equal_gt_2 = sr_DEF_30.by_group.loc[2:]

lessthan_2_risk_rate = 100*round(np.mean(lessthan_2),2)
equal_gt_2_risk_rate = 100*round(np.mean(equal_gt_2),2)


DEF_30_risk_groups= {"0-2" : lessthan_2_risk_rate, ">=2" : equal_gt_2_risk_rate}
DEF_30_risk_groups

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

bars = ax.bar(DEF_30_risk_groups.keys(),DEF_30_risk_groups.values(), width=0.5, color = "green")
ax.bar_label(bars)
plt.title("Risk Rate vs DEF_30_CNT_SOCIAL_CIRCLE")
plt.xlabel('DEF_30_CNT_SOCIAL_CIRCLE')
plt.ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for DEF_30_CNT_SOCIAL_CIRCLE: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(DEF_30_risk_groups.values())
highest_rate = max(DEF_30_risk_groups.values())
print("The dp difference for DEF_30_CNT_SOCIAL_CIRCLE is ", highest_rate - lowest_rate)

DEF_30_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for DEF_30_CNT_SOCIAL_CIRCLE is ", DEF_30_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"DEF_30_CNT_SOCIAL_CIRCLE":DEF_30_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 41. OBS_60_CNT_SOCIAL_CIRCLE

In [None]:
# Value range of OBS_60_CNT_SOCIAL_CIRCLE in the test set (used to pick the buckets).
test_df["OBS_60_CNT_SOCIAL_CIRCLE"].min(), test_df["OBS_60_CNT_SOCIAL_CIRCLE"].max()

In [None]:
# <=5
# 6-9
# >=10

In [None]:
# Selection rate of the model predictions for every distinct OBS_60_CNT_SOCIAL_CIRCLE value.
# The same predictions are passed as y_true and y_pred.
sr_OBS_60 = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df["OBS_60_CNT_SOCIAL_CIRCLE"],
)
display(sr_OBS_60.by_group)

In [None]:
# Pool the per-value selection rates into three buckets
# (label-based slices on the by_group index: keys up to 5, 6..9, and 10 and above).
lessthan_5 = sr_OBS_60.by_group.loc[:5]
btw_6_10 = sr_OBS_60.by_group.loc[6:9]
gt_10 = sr_OBS_60.by_group.loc[10:]

# Unweighted mean of the per-value rates, expressed as a whole percent.
# Scale to percent *before* rounding: 100*round(x, 2) leaks float noise
# (e.g. 100*0.29 == 28.999999999999996) into the chart labels and the DP ratio.
lessthan_5_risk_rate = round(100*np.mean(lessthan_5), 0)
btw_6_10_risk_rate = round(100*np.mean(btw_6_10), 0)
gt_10_risk_rate = round(100*np.mean(gt_10), 0)


# The middle bucket is labelled "6-9" (it was "5-9"): that is the slice taken above,
# it no longer overlaps "<=5", and it matches the '6-9' label of the binning cell below.
OBS_60_risk_groups= {"<=5" : lessthan_5_risk_rate, "6-9" : btw_6_10_risk_rate, ">=10": gt_10_risk_rate}
OBS_60_risk_groups

In [None]:
# Bar chart: risk rate (%) per OBS_60_CNT_SOCIAL_CIRCLE bucket.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(OBS_60_risk_groups), list(OBS_60_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs OBS_60_CNT_SOCIAL_CIRCLE")
ax.set_xlabel('OBS_60_CNT_SOCIAL_CIRCLE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for OBS_60_CNT_SOCIAL_CIRCLE: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(OBS_60_risk_groups.values())
highest_rate = max(OBS_60_risk_groups.values())
print("The dp difference for OBS_60_CNT_SOCIAL_CIRCLE is ", highest_rate - lowest_rate)

OBS_60_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for OBS_60_CNT_SOCIAL_CIRCLE is ", OBS_60_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"OBS_60_CNT_SOCIAL_CIRCLE":OBS_60_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 42. DEF_60_CNT_SOCIAL_CIRCLE

In [None]:
# Distinct DEF_60_CNT_SOCIAL_CIRCLE values present in the test set (used to pick the buckets).
test_df["DEF_60_CNT_SOCIAL_CIRCLE"].unique()

In [None]:
# Selection rate of the model predictions for every distinct DEF_60_CNT_SOCIAL_CIRCLE value.
# The same predictions are passed as y_true and y_pred.
sr_DEF_60 = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df["DEF_60_CNT_SOCIAL_CIRCLE"],
)
display(sr_DEF_60.by_group)

In [None]:
# Groups
# 0-2
# =>2

In [None]:
# Pool the per-value selection rates into the two buckets of this feature
# (label-based slices on the by_group index: keys 0..1, and keys 2 and above).
# NOTE(review): the "0-2" label really covers only the keys 0 and 1 (key 2 falls in
# ">=2"); the label is kept because the binning cell further down uses the same text.
lessthan_2 = sr_DEF_60.by_group.loc[0:1]
equal_gt_2 = sr_DEF_60.by_group.loc[2:]

# Unweighted mean of the per-value rates, expressed as a whole percent.
# Scale to percent *before* rounding: 100*round(x, 2) leaks float noise
# (e.g. 100*0.29 == 28.999999999999996) into the chart labels and the DP ratio.
lessthan_2_risk_rate = round(100*np.mean(lessthan_2), 0)
equal_gt_2_risk_rate = round(100*np.mean(equal_gt_2), 0)


DEF_60_risk_groups= {"0-2" : lessthan_2_risk_rate, ">=2" : equal_gt_2_risk_rate}
DEF_60_risk_groups

In [None]:
# Bar chart: risk rate (%) per DEF_60_CNT_SOCIAL_CIRCLE bucket.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(DEF_60_risk_groups), list(DEF_60_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # write each value on top of its bar

ax.set_title("Risk Rate vs DEF_60_CNT_SOCIAL_CIRCLE")
ax.set_xlabel('DEF_60_CNT_SOCIAL_CIRCLE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for DEF_60_CNT_SOCIAL_CIRCLE: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(DEF_60_risk_groups.values())
highest_rate = max(DEF_60_risk_groups.values())
print("The dp difference for DEF_60_CNT_SOCIAL_CIRCLE is ", highest_rate - lowest_rate)

DEF_60_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for DEF_60_CNT_SOCIAL_CIRCLE is ", DEF_60_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"DEF_60_CNT_SOCIAL_CIRCLE":DEF_60_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 43. YEARS_LAST_PHONE_CHANGE

In [None]:
# Value range of YEARS_LAST_PHONE_CHANGE in the test set (used to pick the buckets).
test_df["YEARS_LAST_PHONE_CHANGE"].min(), test_df["YEARS_LAST_PHONE_CHANGE"].max()

In [None]:
# <=2
# 2-5
# >5

In [None]:
# Selection rate of the model predictions for every distinct YEARS_LAST_PHONE_CHANGE value.
# The same predictions are passed as y_true and y_pred.
sr_phone_change = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df["YEARS_LAST_PHONE_CHANGE"],
)
display(sr_phone_change.by_group)

In [None]:
# Pool the per-value selection rates into three buckets.
# Boolean masks on the group keys replace the label slices .loc[:2] / .loc[3:5] /
# .loc[6:]. For whole-year keys the selection is identical, but a fractional key
# (e.g. 2.5 or 5.5) is no longer silently left out, and the cut points now mirror the
# binning cell further down (<=2, >2 and <=5, >5).
phone_change_rates = sr_phone_change.by_group
phone_change_years = phone_change_rates.index
lessthan_2 = phone_change_rates.loc[phone_change_years <= 2]
btw_2_5 = phone_change_rates.loc[(phone_change_years > 2) & (phone_change_years <= 5)]
gt_5 = phone_change_rates.loc[phone_change_years > 5]

# Unweighted mean of the per-value rates, expressed as a whole percent.
# Scale to percent *before* rounding: 100*round(x, 2) leaks float noise
# (e.g. 100*0.29 == 28.999999999999996) into the chart labels and the DP ratio.
lessthan_2_risk_rate = round(100*np.mean(lessthan_2), 0)
btw_2_5_risk_rate = round(100*np.mean(btw_2_5), 0)
gt_5_risk_rate = round(100*np.mean(gt_5), 0)


phone_change_risk_groups= {"<=2" : lessthan_2_risk_rate, "3-5" : btw_2_5_risk_rate, ">5": gt_5_risk_rate}
phone_change_risk_groups

In [None]:
# Bar chart: risk rate (%) per YEARS_LAST_PHONE_CHANGE bucket.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(phone_change_risk_groups), list(phone_change_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs YEARS_LAST_PHONE_CHANGE")
ax.set_xlabel('YEARS_LAST_PHONE_CHANGE')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for YEARS_LAST_PHONE_CHANGE: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(phone_change_risk_groups.values())
highest_rate = max(phone_change_risk_groups.values())
print("The dp difference for YEARS_LAST_PHONE_CHANGE is ", highest_rate - lowest_rate)

phone_change_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for YEARS_LAST_PHONE_CHANGE is ", phone_change_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"YEARS_LAST_PHONE_CHANGE":phone_change_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 44. AMT_REQ_CREDIT_BUREAU_HOUR

In [None]:
# Raw AMT_REQ_CREDIT_BUREAU_HOUR column of the test set, shown for inspection only.
test_df["AMT_REQ_CREDIT_BUREAU_HOUR"]

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_HOUR value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_HOUR = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_HOUR"],
)

display(sr_AMT_REQ_CREDIT_BUREAU_HOUR.by_group)

In [None]:
# Whole-percent risk rate of the groups keyed 0 and 1. Any other key in by_group
# (an earlier version of this cell plotted 0.0033821871476888386 as "Unknown") is ignored.
hour_rates = sr_AMT_REQ_CREDIT_BUREAU_HOUR.by_group
AMT_REQ_CREDIT_BUREAU_HOUR_groups = {
    "0": round(100*hour_rates.loc[0]),
    "1": round(100*hour_rates.loc[1]),
}

# Bar chart of the two rates.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(AMT_REQ_CREDIT_BUREAU_HOUR_groups), list(AMT_REQ_CREDIT_BUREAU_HOUR_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_HOUR")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_HOUR')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_HOUR: gap and ratio between the
# lowest and the highest group risk rate.
lowest_rate = min(AMT_REQ_CREDIT_BUREAU_HOUR_groups.values())
highest_rate = max(AMT_REQ_CREDIT_BUREAU_HOUR_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_HOUR is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_HOUR_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_HOUR is ", AMT_REQ_CREDIT_BUREAU_HOUR_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_HOUR": AMT_REQ_CREDIT_BUREAU_HOUR_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 45. AMT_REQ_CREDIT_BUREAU_DAY

In [None]:
# Distinct AMT_REQ_CREDIT_BUREAU_DAY values present in the test set.
test_df["AMT_REQ_CREDIT_BUREAU_DAY"].unique()

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_DAY value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_DAY = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_DAY"],
)

display(sr_AMT_REQ_CREDIT_BUREAU_DAY.by_group)

In [None]:
# sr_AMT_REQ_CREDIT_BUREAU_DAY.by_group.keys()

In [None]:
# Whole-percent risk rate of the groups keyed 0 and 1. Any other key in by_group
# (an earlier version of this cell plotted 0.0011273957158962795 as "Unknown") is ignored.
day_rates = sr_AMT_REQ_CREDIT_BUREAU_DAY.by_group
AMT_REQ_CREDIT_BUREAU_DAY_groups = {
    "0": round(100*day_rates.loc[0]),
    "1": round(100*day_rates.loc[1]),
}

# Bar chart of the two rates.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(AMT_REQ_CREDIT_BUREAU_DAY_groups), list(AMT_REQ_CREDIT_BUREAU_DAY_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_DAY")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_DAY')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_DAY: gap and ratio between the
# lowest and the highest group risk rate.
lowest_rate = min(AMT_REQ_CREDIT_BUREAU_DAY_groups.values())
highest_rate = max(AMT_REQ_CREDIT_BUREAU_DAY_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_DAY is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_DAY_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_DAY is ", AMT_REQ_CREDIT_BUREAU_DAY_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_DAY": AMT_REQ_CREDIT_BUREAU_DAY_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 46. AMT_REQ_CREDIT_BUREAU_WEEK

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_WEEK value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_WEEK = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_WEEK"],
)

display(sr_AMT_REQ_CREDIT_BUREAU_WEEK.by_group)

In [None]:
# sr_AMT_REQ_CREDIT_BUREAU_DAY.by_group.keys()

In [None]:
# Whole-percent risk rate of the groups keyed 0 and 1; any other key in by_group is ignored.
week_rates = sr_AMT_REQ_CREDIT_BUREAU_WEEK.by_group
AMT_REQ_CREDIT_BUREAU_WEEK_groups = {
    "0": round(100*week_rates.loc[0]),
    "1": round(100*week_rates.loc[1]),
}

# Bar chart of the two rates.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(AMT_REQ_CREDIT_BUREAU_WEEK_groups), list(AMT_REQ_CREDIT_BUREAU_WEEK_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_WEEK")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_WEEK')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_WEEK: gap and ratio between the
# lowest and the highest group risk rate.
lowest_rate = min(AMT_REQ_CREDIT_BUREAU_WEEK_groups.values())
highest_rate = max(AMT_REQ_CREDIT_BUREAU_WEEK_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_WEEK is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_WEEK_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_WEEK is ", AMT_REQ_CREDIT_BUREAU_WEEK_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_WEEK": AMT_REQ_CREDIT_BUREAU_WEEK_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 47. AMT_REQ_CREDIT_BUREAU_MON

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_MON value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_MON = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_MON"],
)

display(sr_AMT_REQ_CREDIT_BUREAU_MON.by_group)

In [None]:
# sr_AMT_REQ_CREDIT_BUREAU_MON.by_group.keys()

In [None]:
# Whole-percent risk rate of the groups keyed 0 and 1. Any other key in by_group
# (an earlier version of this cell plotted 0.006764374295377677 as "Unknown") is ignored.
month_rates = sr_AMT_REQ_CREDIT_BUREAU_MON.by_group
AMT_REQ_CREDIT_BUREAU_MON_groups = {
    "0": round(100*month_rates.loc[0]),
    "1": round(100*month_rates.loc[1]),
}

# Bar chart of the two rates.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(AMT_REQ_CREDIT_BUREAU_MON_groups), list(AMT_REQ_CREDIT_BUREAU_MON_groups.values()), width=0.4, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_MON")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_MON')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_MON: gap and ratio between the
# lowest and the highest group risk rate.
lowest_rate = min(AMT_REQ_CREDIT_BUREAU_MON_groups.values())
highest_rate = max(AMT_REQ_CREDIT_BUREAU_MON_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_MON is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_MON_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_MON is ", AMT_REQ_CREDIT_BUREAU_MON_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_MON": AMT_REQ_CREDIT_BUREAU_MON_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 48. AMT_REQ_CREDIT_BUREAU_QRT

In [None]:
# Value range of AMT_REQ_CREDIT_BUREAU_QRT in the test set (used to pick the buckets).
test_df["AMT_REQ_CREDIT_BUREAU_QRT"].min(), test_df["AMT_REQ_CREDIT_BUREAU_QRT"].max()

In [None]:
# <=2  = [0,1]
# >2 = 

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_QRT value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_QRT = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_QRT"],
)
display(sr_AMT_REQ_CREDIT_BUREAU_QRT.by_group)

In [None]:
# Group keys available in by_group, listed to choose the bucket boundaries below.
sr_AMT_REQ_CREDIT_BUREAU_QRT.by_group.keys()

In [None]:
# Pool the per-value selection rates into the two buckets "<=2" and ">2".
# The key 2 now belongs to "<=2". Previously the first bucket was .loc[[0,1]] and the
# second .loc[2:], so key 2 was counted under ">2", contradicting both bucket labels
# and the binning cell further down (<= 2 -> first bin).
# Keys are matched explicitly with isin(), so the fractional key
# 0.5298759864712514 (the disabled "unknown" group) stays excluded, as before.
qrt_rates = sr_AMT_REQ_CREDIT_BUREAU_QRT.by_group
qrt_keys = qrt_rates.index
lessthan_2 = qrt_rates.loc[qrt_keys.isin([0, 1, 2])]
gt_2 = qrt_rates.loc[qrt_keys > 2]

# Unweighted mean of the per-value rates, expressed as a whole percent.
# Scale to percent *before* rounding: 100*round(x, 2) leaks float noise
# (e.g. 100*0.29 == 28.999999999999996); both buckets are now rounded the same way.
lessthan_2_risk_rate = round(100*np.mean(lessthan_2), 0)
gt_2_risk_rate = round(100*np.mean(gt_2), 0)


AMT_REQ_CREDIT_BUREAU_QRT_risk_groups= {"<=2" : lessthan_2_risk_rate, ">2" : gt_2_risk_rate}
AMT_REQ_CREDIT_BUREAU_QRT_risk_groups

In [None]:
# Bar chart: risk rate (%) per AMT_REQ_CREDIT_BUREAU_QRT bucket.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(AMT_REQ_CREDIT_BUREAU_QRT_risk_groups), list(AMT_REQ_CREDIT_BUREAU_QRT_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_QRT")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_QRT')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_QRT: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(AMT_REQ_CREDIT_BUREAU_QRT_risk_groups.values())
highest_rate = max(AMT_REQ_CREDIT_BUREAU_QRT_risk_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_QRT is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_QRT_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_QRT is ", AMT_REQ_CREDIT_BUREAU_QRT_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_QRT":AMT_REQ_CREDIT_BUREAU_QRT_dp})

In [None]:
# Running check: every DP ratio collected so far, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

#### 49. AMT_REQ_CREDIT_BUREAU_YEAR

In [None]:
# Value range of AMT_REQ_CREDIT_BUREAU_YEAR in the test set (used to pick the buckets).
test_df["AMT_REQ_CREDIT_BUREAU_YEAR"].min(), test_df["AMT_REQ_CREDIT_BUREAU_YEAR"].max()

In [None]:
# <=2  = [0,1]
# 3-5
# >5 

In [None]:
# Selection rate of the model predictions for every distinct AMT_REQ_CREDIT_BUREAU_YEAR value.
# Groups come from test_df_final here (not test_df); the same predictions are passed
# as y_true and y_pred.
sr_AMT_REQ_CREDIT_BUREAU_YEAR = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=test_df_final["AMT_REQ_CREDIT_BUREAU_YEAR"],
)
display(sr_AMT_REQ_CREDIT_BUREAU_YEAR.by_group)

In [None]:
# Group keys available in by_group, listed to choose the bucket boundaries below.
sr_AMT_REQ_CREDIT_BUREAU_YEAR.by_group.keys()

In [None]:
# Pool the per-value selection rates into the buckets "<=2", "3-5" and ">5".
# The key 2 is now part of "<=2". Previously the buckets were .loc[[0,1]], .loc[3:5]
# and .loc[6:], so applicants with the value 2 were in no bucket at all, although the
# label and the binning cell further down (<= 2 -> first bin) both include them.
# Keys are matched explicitly, so the fractional key 1.9526493799323563 (the disabled
# "unknown" group) stays excluded, as before.
year_rates = sr_AMT_REQ_CREDIT_BUREAU_YEAR.by_group
year_keys = year_rates.index
lessthan_2 = year_rates.loc[year_keys.isin([0, 1, 2])]
bt_3_5 = year_rates.loc[(year_keys >= 3) & (year_keys <= 5)]
gt_5 = year_rates.loc[year_keys >= 6]

# Unweighted mean of the per-value rates, expressed as a whole percent.
# Scale to percent *before* rounding: 100*round(x, 2) leaks float noise
# (e.g. 100*0.29 == 28.999999999999996) into the chart labels and the DP ratio.
lessthan_2_risk_rate = round(100*np.mean(lessthan_2), 0)
bt_3_5_risk_rate = round(100*np.mean(bt_3_5), 0)
gt_5_risk_rate = round(100*np.mean(gt_5), 0)


sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups= {"<=2" : lessthan_2_risk_rate, "3-5": bt_3_5_risk_rate,
                                            ">5" : gt_5_risk_rate}
sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups

In [None]:
# Bar chart: risk rate (%) per AMT_REQ_CREDIT_BUREAU_YEAR bucket.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])  # one axes spanning the whole figure

bars = ax.bar(list(sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups), list(sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups.values()), width=0.5, color="green")
ax.bar_label(bars)  # write each value on top of its bar
ax.set_title("Risk Rate vs AMT_REQ_CREDIT_BUREAU_YEAR")
ax.set_xlabel('AMT_REQ_CREDIT_BUREAU_YEAR')
ax.set_ylabel('Risk Rate in %')
plt.show()

In [None]:
# Demographic-parity summary for AMT_REQ_CREDIT_BUREAU_YEAR: gap and ratio between the
# lowest and the highest bucket risk rate.
lowest_rate = min(sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups.values())
highest_rate = max(sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups.values())
print("The dp difference for AMT_REQ_CREDIT_BUREAU_YEAR is ", highest_rate - lowest_rate)

AMT_REQ_CREDIT_BUREAU_YEAR_dp = round(lowest_rate / highest_rate,2)
print("The dp ratio for AMT_REQ_CREDIT_BUREAU_YEAR is ", AMT_REQ_CREDIT_BUREAU_YEAR_dp)

# Register the ratio under the feature name.
dp_ratio_features.update({"AMT_REQ_CREDIT_BUREAU_YEAR":AMT_REQ_CREDIT_BUREAU_YEAR_dp})

In [None]:
# Final check: every DP ratio collected, and how many features are covered.
display(dp_ratio_features)
len(dp_ratio_features)

In [None]:
# Tabulate the collected DP ratios: one row per feature.
dp_ratio_features_df = pd.DataFrame({
    "Features": list(dp_ratio_features.keys()),
    "DP_Ratio": list(dp_ratio_features.values()),
})
dp_ratio_features_df

In [None]:
# ft_vd.to_csv(folder+"data/processed_data/Value_Distributions.csv")

In [None]:
# ft_vd

In [None]:
# ft_df["REGION_POPULATION_RELATIVE"]

In [None]:
# a = len(ft_df["AMT_ANNUITY"]["SR"])
# a_3 = int(a/3)
# v_list = ft_df["AMT_ANNUITY"]["SR"].values.tolist()
# i_list = ft_df["AMT_ANNUITY"]["SR"].index.tolist()
# v_list_1 = ft_df["AMT_ANNUITY"]["SR"].values.tolist()[0:a_3]
# v_list_2 = ft_df["AMT_ANNUITY"]["SR"].values.tolist()[a_3:a_3+a_3]
# v_list_3 = ft_df["AMT_ANNUITY"]["SR"].values.tolist()[a_3+a_3:]
# print(np.mean(v_list_1))

### Indegree and Outdegree
These are measured based on the causal analysis presented in CausalAnalysis.ipynb

In [None]:
# Silence pandas' PerformanceWarning for the rest of the notebook.
# NOTE(review): presumably aimed at the column-by-column construction of dj_df below — confirm.
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Outputs of the causal analysis for the test set: adjacency matrix, edge list and node list.
am = pd.read_csv(folder+"data/processed_data/adjacency_matrix_TestSet.csv")
edges = pd.read_csv(folder+'data/processed_data/Edges_TestSet.csv')
nodes = pd.read_csv(folder+'data/processed_data/Nodes_TestSet.csv')

In [None]:
# First rows of the adjacency matrix as loaded.
am.head()

In [None]:
# Column labels of the adjacency matrix, skipping the first column.
am.iloc[:,1:].columns

In [None]:
# Same preview as above (the frame has not been modified in between).
am.head()

In [None]:
## checks
am.set_index("Edge_Start",inplace = True)
np.sum(am.loc["CNT_CHILDREN"],0)

In [None]:
## checks
df = {}

for i in test_df.iloc[:,4:].columns:
    df.update({i + "_indegree":round(np.sum(am[i]),2)})
    df.update({i + "_outdegree":round(np.sum(am.loc[i]),2)})
    
# df

### Final Composition of Data to be shown to Participants

In [None]:
# Sanity check of the contract-type values after the categorical recoding at the top of the notebook.
test_df['NAME_CONTRACT_TYPE'].unique()

In [None]:
# Assemble the per-application table shown to participants. For every application:
# id, predicted decision, the two confidence values, and for each feature k (columns of
# test_df from position 4 onwards) its name, value, importance weight, indegree,
# outdegree and DP ratio.
#
# All columns are collected in a dict and the frame is built once. Inserting the ~6
# columns per feature one at a time into an existing frame fragments it, which is what
# triggers pandas' PerformanceWarning.
dj_columns = {
    'Application_id': test_df['SK_ID_CURR'],
    'Predicted_decision': test_df['Predicted_Result'],
    'Prediction_Confidence_Accepted': test_df['Prediction_Confidence_Accepted'],
    'Prediction_Confidence_Rejected': test_df['Prediction_Confidence_Rejected'],
}

#Ft_values
for index,value in enumerate(test_df.columns[4:]):
    suffix = str(index)

    # Scalars below are broadcast to every row when the frame is built.
    dj_columns["ft_name_" + suffix] = value
    dj_columns["ft_value_" + suffix] = test_df[value].values
    # Row labelled 1 of the feature-importance file holds the weight used here.
    dj_columns["ft_weight_" + suffix] = round(float(ft_df[value].loc[1]),3)
    dj_columns["ft_indegree_" + suffix] = round(np.sum(am[value]),2)      # column sum
    dj_columns["ft_outdegree_" + suffix] = round(np.sum(am.loc[value]),2)  # row sum
    dj_columns["ft_dp_ratio_" + suffix] = dp_ratio_features[value]

# Same row index as test_df, as before.
dj_df = pd.DataFrame(dj_columns, index=test_df.index)

dj_df

In [None]:
# dj_df.to_csv(folder+"data/processed_data/Applications.csv")

### Value Distributions

In [None]:
# Column names of test_df, listed as a reference for the mapping built in the next cell.
test_df.columns

In [None]:
# Map every feature name to the {bucket label: risk rate in %} dict computed for it in
# the per-feature sections above.
# FIXME(review): 'YEARS_REGISTRATION' is mapped to years_employed_risk_groups, the same
#   dict as 'YEARS_EMPLOYED'. This looks like a copy-paste slip; confirm which
#   registration-specific dict was intended.
# NOTE(review): 'FLAG_MOBIL' uses the hard-coded literal {"Yes":32.0} instead of a
#   computed dict — confirm the value is still current.
feature_groups = {}
feature_groups.update({'NAME_CONTRACT_TYPE':contract_type_risk_groups, 'CODE_GENDER':gender_risk_groups,
       'FLAG_OWN_CAR':car_risk_groups, 'FLAG_OWN_REALTY':realty_risk_groups, 'CNT_CHILDREN':cnt_children_risk_groups, 
        'AMT_INCOME_TOTAL':income_risk_groups,'AMT_CREDIT':credit_risk_groups, 
        'AMT_ANNUITY':annunity_risk_groups, 'AMT_GOODS_PRICE':goods_risk_groups, 
        'NAME_TYPE_SUITE':type_suite_groups,'NAME_INCOME_TYPE':income_type_groups, 'NAME_EDUCATION_TYPE':education_type_groups, 'NAME_FAMILY_STATUS':family_groups,
       'NAME_HOUSING_TYPE':housing_groups, 'REGION_POPULATION_RELATIVE':region_population_risk_groups, 'AGE':age_risk_groups,
       'YEARS_EMPLOYED':years_employed_risk_groups, 'YEARS_REGISTRATION':years_employed_risk_groups, 'YEARS_ID_PUBLISH':id_publish_risk_groups,
       'FLAG_MOBIL':{"Yes":32.0}, 'FLAG_EMP_PHONE':emp_mobile_groups, 'FLAG_WORK_PHONE':work_phone_groups, 'FLAG_CONT_MOBILE':cont_mobile_groups,
       'FLAG_PHONE':phone_groups, 'FLAG_EMAIL':email_groups, 'OCCUPATION_TYPE':occupation_groups, 'CNT_FAM_MEMBERS':family_members_risk_groups,
       'REGION_RATING_CLIENT':region_rating_groups, 'REGION_RATING_CLIENT_W_CITY':region_rating_w_city_groups,
       'WEEKDAY_APPR_PROCESS_START':weekday_groups, 'HOUR_APPR_PROCESS_START':hour_application_risk_groups,
       'REG_REGION_NOT_LIVE_REGION':region_live_groups, 'REG_REGION_NOT_WORK_REGION':region_work_groups,
       'LIVE_REGION_NOT_WORK_REGION':live_work_groups, 'REG_CITY_NOT_LIVE_CITY':city_live_groups,
       'REG_CITY_NOT_WORK_CITY':city_work_groups, 'LIVE_CITY_NOT_WORK_CITY':live_work_city_groups,
       'ORGANIZATION_TYPE':organisation_groups, 'OBS_30_CNT_SOCIAL_CIRCLE':OBS_30_risk_groups,
       'DEF_30_CNT_SOCIAL_CIRCLE':DEF_30_risk_groups, 'OBS_60_CNT_SOCIAL_CIRCLE':OBS_60_risk_groups,
       'DEF_60_CNT_SOCIAL_CIRCLE':DEF_60_risk_groups, 'YEARS_LAST_PHONE_CHANGE':phone_change_risk_groups,
       'AMT_REQ_CREDIT_BUREAU_HOUR':AMT_REQ_CREDIT_BUREAU_HOUR_groups, 'AMT_REQ_CREDIT_BUREAU_DAY':AMT_REQ_CREDIT_BUREAU_DAY_groups,
       'AMT_REQ_CREDIT_BUREAU_WEEK':AMT_REQ_CREDIT_BUREAU_WEEK_groups, 'AMT_REQ_CREDIT_BUREAU_MON':AMT_REQ_CREDIT_BUREAU_MON_groups,
       'AMT_REQ_CREDIT_BUREAU_QRT':AMT_REQ_CREDIT_BUREAU_QRT_risk_groups, 'AMT_REQ_CREDIT_BUREAU_YEAR':sr_AMT_REQ_CREDIT_BUREAU_YEAR_risk_groups})

In [None]:
# One row per feature; "Groups" holds that feature's {bucket label: risk rate} dict.
dp_feature_bins_df = pd.DataFrame({
    "Features": list(feature_groups.keys()),
    "Groups": list(feature_groups.values()),
})
# dp_feature_bins_df.to_csv(folder+"data/processed_data/Value_Distributions.csv")

### Feature Combinations

#### Binning

In [None]:
# Feature columns only: everything from column position 4 onwards.
# The binning cell below re-creates this frame before modifying it.
test_df_fc = test_df.iloc[:,4:]

In [None]:
# Feature names that take part in the feature-combination analysis.
test_df_fc.columns

In [None]:
# dp_feature_bins_df.set_index('Features',inplace = True)
# dp_feature_bins_df.loc["AMT_ANNUITY"].values

In [None]:
# Largest AMT_ANNUITY in the test set; the binning cell below uses 103455 as the upper
# bound of the top annuity bin.
test_df["AMT_ANNUITY"].max()

In [None]:
# Older import location of the warning class, kept for reference:
# import warnings
# from pandas.core.common import SettingWithCopyWarning

# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Silence SettingWithCopyWarning for the rest of the notebook.
# NOTE(review): this hides the warning pandas raises for chained assignment on a
# slice, the pattern used by the binning cell below; prefer fixing the assignment
# (single .loc[mask, col] on an explicit copy) over suppressing the warning.
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Bin the continuous / high-cardinality feature columns into integer codes; the next
# cell turns the codes into display labels.
#
# Compared with a sequence of `frame[col].loc[mask] = code` statements on a slice:
#   * the frame is an explicit copy and every write is one `.loc[mask, col]` assignment,
#     so the binning still takes effect when pandas Copy-on-Write is enabled (chained
#     assignment does not update the frame there) and can never write through to test_df;
#   * interior bin edges are closed on the right, so values lying exactly on an edge
#     (e.g. AMT_INCOME_TOTAL == 150000 or 200000, AMT_CREDIT == 500000,
#     CNT_CHILDREN == 5) are binned instead of silently keeping their raw value;
#   * masks are evaluated on a snapshot of the raw column, so a code written by one
#     rule can never be matched again by a later rule.
# Outer limits (e.g. 1350000, 2156400, 103455, 1800000, 0.08, 8) are unchanged;
# values beyond them, and missing values, are still left as they are.
test_df_fc = test_df.iloc[:,4:].copy()


def _bin_numeric(frame, column, intervals):
    """Overwrite the numeric values of ``frame[column]`` with bin codes, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify.
    column : str
        Name of the numeric column to bin.
    intervals : iterable of tuple
        ``(lower, upper, code)`` or ``(lower, upper, code, inclusive)``.
        ``inclusive`` is forwarded to ``Series.between`` and defaults to ``"right"``,
        i.e. ``lower < value <= upper``.

    Values matching no interval (NaN included) are left unchanged.
    """
    raw_values = frame[column].copy()  # snapshot: rules never see already-written codes
    for lower, upper, code, *options in intervals:
        inclusive = options[0] if options else "right"
        frame.loc[raw_values.between(lower, upper, inclusive=inclusive), column] = code


_INF = float("inf")

numeric_bins = {
    # Codes 10/20/30 (not 1/2/3), as expected by the label mapping in the next cell.
    'CNT_CHILDREN': [(-_INF, 2, 10), (2, 5, 20), (5, 8, 30)],
    # NOTE(review): the label mapping in the next cell also defines code 5 (">250K"),
    # which is never assigned here; code 4 covers everything from 200000 to 1350000.
    'AMT_INCOME_TOTAL': [(-_INF, 100000, 1), (100000, 150000, 2), (150000, 200000, 3),
                         (200000, 1350000, 4)],
    'AMT_CREDIT': [(-_INF, 250000, 1), (250000, 500000, 2), (500000, 750000, 3),
                   (750000, 1000000, 4), (1000000, 2156400, 5)],
    'AMT_ANNUITY': [(-_INF, 10000, 1), (10000, 25000, 2), (25000, 50000, 3), (50000, 103455, 4)],
    'AMT_GOODS_PRICE': [(-_INF, 100000, 1), (100000, 500000, 2), (500000, 1000000, 3),
                        (1000000, 1800000, 4)],
    'REGION_POPULATION_RELATIVE': [(-_INF, 0.01, 1), (0.01, 0.02, 2), (0.02, 0.03, 3),
                                   (0.03, 0.08, 4, "neither")],
    'AGE': [(-_INF, 25, 1), (25, 40, 2), (40, 60, 3), (60, _INF, 4)],
    'YEARS_EMPLOYED': [(-_INF, 10, 1), (10, 20, 2), (20, 30, 3), (30, _INF, 4)],
    'YEARS_REGISTRATION': [(-_INF, 10, 1), (10, 20, 2), (20, 30, 3), (30, _INF, 4)],
    'YEARS_ID_PUBLISH': [(-_INF, 5, 1), (5, 10, 2), (10, _INF, 3)],
    'CNT_FAM_MEMBERS': [(-_INF, 2, 1), (2, 5, 2), (5, _INF, 3)],
    'HOUR_APPR_PROCESS_START': [(-_INF, 8, 1), (8, 13, 2), (13, 17, 3), (17, _INF, 4)],
    # <=5, strictly between 5 and 10, >=10
    'OBS_30_CNT_SOCIAL_CIRCLE': [(-_INF, 5, 1), (5, 10, 2, "neither"), (10, _INF, 3, "left")],
    # <2, >=2
    'DEF_30_CNT_SOCIAL_CIRCLE': [(-_INF, 2, 1, "neither"), (2, _INF, 2, "left")],
    'OBS_60_CNT_SOCIAL_CIRCLE': [(-_INF, 5, 1), (5, 10, 2, "neither"), (10, _INF, 3, "left")],
    'DEF_60_CNT_SOCIAL_CIRCLE': [(-_INF, 2, 1, "neither"), (2, _INF, 2, "left")],
    'YEARS_LAST_PHONE_CHANGE': [(-_INF, 2, 1), (2, 5, 2), (5, _INF, 3)],
    'AMT_REQ_CREDIT_BUREAU_QRT': [(-_INF, 2, 1), (2, _INF, 2)],
    'AMT_REQ_CREDIT_BUREAU_YEAR': [(-_INF, 2, 1), (2, 5, 2), (5, _INF, 3)],
}

for column, intervals in numeric_bins.items():
    _bin_numeric(test_df_fc, column, intervals)

# ORGANIZATION_TYPE: collapse the raw categories into sector codes.
# Categories not listed here keep their original string.
organization_sectors = [
    (['Kindergarten', 'School' , 'University'], 1),
    (["Industry: type 1", "Industry: type 11", "Industry: type 12", "Industry: type 3", "Industry: type 4", "Industry: type 5", "Industry: type 7", "Industry: type 9"], 2),
    (["Business Entity Type 1","Business Entity Type 2","Business Entity Type 3"], 3),
    (["Trade: type 2","Trade: type 3","Trade: type 6","Trade: type 7"], 4),
    (["Transport: type 1","Transport: type 2","Transport: type 3", "Transport: type 4"], 5),
    (["Government", "Housing", "Military", "Police", "Postal", 'Security Ministries', "Medicine"], 6),
    (['Advertising', 'Security', "Agriculture", "Electricity", "Hotel", "Mobile", "Restaurant", "Self-employed", "Telecom"], 7),
    (["Other", "Services", "Bank", "Construction", "Emergency", "Legal Services"], 8),
    (["XNA"], 9),
]
raw_organization = test_df_fc['ORGANIZATION_TYPE'].copy()
for members, code in organization_sectors:
    test_df_fc.loc[raw_organization.isin(members), 'ORGANIZATION_TYPE'] = code

In [None]:
# Translate the integer bin codes written by the binning cell into display labels.
# Features that share a binning scheme share one label dict.
years_labels = { 1 :'<10', 2 : '10–20',3:"20–30",4:">30"}
obs_social_circle_labels = { 1 :'<=5', 2 : '6-9',3: ">=10"}
def_social_circle_labels = { 1 :'0-2', 2 : '>=2'}

bin_code_labels = {
    'CNT_CHILDREN' : { 10 :'0-2', 20 : '3-5', 30: '5-10'},
    'AMT_INCOME_TOTAL' : { 1 :'<100K', 2 : '100K-150K',3:"150K-200K",4:"200K-250K", 5:">250K" },
    'AMT_CREDIT' : { 1 :'<250k', 2 : '250K-500K',3:"500K-750K" ,4:"750K-1M", 5:">1M"},
    'AMT_ANNUITY' : { 1 :'<10K', 2 : '10K-25K',3:"25K-50K",4:">50K"},
    'AMT_GOODS_PRICE' : { 1 :'<100K', 2 : '100K-500K',3:"500K-1M" ,4:">1M" },
    'REGION_POPULATION_RELATIVE' : { 1 :'<0.01 ', 2 : '0.01-0.02',3:"0.02-0.03" ,4:">0.03" },
    'AGE' : { 1 :'Young Adults =<25', 2 : 'Adults =26-40',3:"Middle-Aged Adults = 41 - 60",4:"Older Adults =60+"},
    'YEARS_EMPLOYED' : years_labels,
    'YEARS_REGISTRATION' : years_labels,
    'YEARS_ID_PUBLISH' : { 1 :'<5', 2 : '5-10',3:">10"},
    'CNT_FAM_MEMBERS' : { 1 :'<=2', 2 : '3-5',3:">=5"},
    'HOUR_APPR_PROCESS_START' : { 1 :'Midnight to 9 am (Before Office Hours) : 0-9', 2 : '9am to 1 pm (Office Hours: Morning): 9-13',3:"1pm to 5 pm(Office Hours:Afternoon): 13-17",4:'5pm - Midnight (After Office Hours): 17-24'},
    'ORGANIZATION_TYPE' : { 1 :'Education', 2 : 'Industry',3:"Business Entity",4:"Trade", 5: "Transport", 6:"Public Sector", 7: "Private Sector",8:"Others",9:"XNA"},
    'OBS_30_CNT_SOCIAL_CIRCLE' : obs_social_circle_labels,
    'DEF_30_CNT_SOCIAL_CIRCLE' : def_social_circle_labels,
    'OBS_60_CNT_SOCIAL_CIRCLE' : obs_social_circle_labels,
    'DEF_60_CNT_SOCIAL_CIRCLE' : def_social_circle_labels,
    'YEARS_LAST_PHONE_CHANGE' : { 1 :'<=2', 2 : '2-5',3:">5"},
    'AMT_REQ_CREDIT_BUREAU_QRT' : { 1 :'<= 2', 2 : '>2'},
    'AMT_REQ_CREDIT_BUREAU_YEAR' : { 1 :'<= 2', 2 : '3-5',3: ">5"},
}

test_df_fc.replace(bin_code_labels, inplace=True)

In [None]:
# Distinct NAME_TYPE_SUITE values, checked before merging the "Other_*" categories below.
test_df_fc["NAME_TYPE_SUITE"].unique()

In [None]:
# Merge the two "Other_*" NAME_TYPE_SUITE categories into a single 'Others' value.
other_suite_labels = dict.fromkeys(["Other_A", "Other_B"], 'Others')
test_df_fc.replace({'NAME_TYPE_SUITE' : other_suite_labels}, inplace=True)

In [None]:
# Print every feature name followed by its distinct values after binning.
# Any raw number still showing up in a binned column was not caught by a bin.
for col, column_values in test_df_fc.items():
    print(col)
    print(column_values.unique())

#### Do any feature values have fewer than 10 instances?

In [None]:
# Missing feature values become the literal string "NA" so they form their own group.
test_df_fc.fillna("NA", inplace = True)

# Attach the model decision and encode it numerically (Accepted -> 1, Rejected -> 0).
# The column is added after the fillna above, so it is not affected by it.
test_df_fc["Predictions"] = test_df["Predicted_Result"]
test_df_fc.replace({'Predictions' : { 'Accepted' : 1, 'Rejected': 0}},inplace=True)
test_df_fc.head()

In [None]:
# Display names for the features (the "Display name" column is used in the next cell).
feature_Descriptions_Display = pd.read_csv(folder+'data/processed_data/Feature_Descriptions_Display.csv', delimiter=',')
feature_Descriptions_Display.head()

In [None]:
# Replace the feature column names with their display names.
# The names are assigned purely by position.
# NOTE(review): this assumes the CSV rows are in exactly the same order as the columns
# of test_df_fc — confirm; a mismatch would mislabel columns without any error.
feature_list = feature_Descriptions_Display["Display name"].tolist()
feature_list.append("Predictions")

test_df_fc.columns = feature_list
test_df_fc
# Earlier, value-based variant of the renaming (disabled):
# test_df_fc = test_df_fc.replace(feature_Descriptions_Display["Column"].tolist(),feature_Descriptions_Display["Display name"].tolist())

In [None]:
# Selection rate for every observed value combination of the first k features
# (k = 2 .. number of features), keeping only combinations that occur in more than
# MIN_GROUP_SIZE applications.
#
# Changes from the per-group MetricFrame loop:
#   * "Predictions" is no longer used as a grouping column. It used to be appended in
#     the last iteration, which splits every combination by outcome and makes its rate
#     trivially 0 or 100.
#   * Counts and rates come from one groupby aggregation per k, instead of one
#     MetricFrame, one display() and one concat per group.
#   * The percentage is scaled before rounding (100*round(x, 2) leaks float noise such
#     as 28.999999999999996).
PREDICTION_COLUMN = "Predictions"
MIN_GROUP_SIZE = 10
FC_COLUMNS = ["Combinations", "Values", "Counts", "Selection_Rate"]

column_list = test_df_fc.columns.tolist()
feature_columns = [col for col in column_list if col != PREDICTION_COLUMN]

# True where the prediction is 1 (Accepted); the mean of this flag within a group is
# that group's selection rate.
is_selected = test_df_fc[PREDICTION_COLUMN].eq(1)

fc_records = []
for n_features in range(2, len(feature_columns) + 1):
    col_use = feature_columns[:n_features]
    group_stats = is_selected.groupby([test_df_fc[col] for col in col_use]).agg(["size", "mean"])
    group_stats = group_stats[group_stats["size"] > MIN_GROUP_SIZE]

    # With two or more grouping columns the index is a MultiIndex, so each key is a
    # tuple holding one value per feature in col_use.
    for values, group_size, rate in zip(group_stats.index, group_stats["size"], group_stats["mean"]):
        fc_records.append({
            "Combinations": tuple(col_use),
            "Values": values,
            "Counts": int(group_size),
            "Selection_Rate": round(100*float(rate), 0),
        })

# Explicit columns keep the frame usable (e.g. sortable) even when no group qualifies.
fc_df = pd.DataFrame(fc_records, columns=FC_COLUMNS)

In [None]:
# Highest selection rate first; reset_index() keeps the previous row labels as a column.
fc_df = fc_df.sort_values(by=['Selection_Rate'], ascending=False).reset_index()
fc_df

In [None]:
# fc_df.to_csv(folder+"data/processed_data/Feature Combinations_Converted.csv")

In [None]:
# fc_df.to_csv(folder+"data/processed_data/Feature Combinations.csv")

### Feature Combinations

In [None]:
# How many applications there are per CNT_CHILDREN value.
test_df["CNT_CHILDREN"].value_counts()

In [None]:
# Feature column names (everything from column position 4 onwards).
test_df.iloc[:,4:].columns

In [None]:
# dp_feature_bins_df.loc["CNT_CHILDREN"]

In [None]:
# print(len(test_df[(test_df["NAME_CONTRACT_TYPE"] == "Fixed") & (test_df["CODE_GENDER"] == "Female") & (test_df["Predicted_Result"] == "Rejected")]))
# print(len(test_df[(test_df["NAME_CONTRACT_TYPE"] == "Fixed") & (test_df["CODE_GENDER"] == "Female") & (test_df["Predicted_Result"] == "Accepted")]))

In [None]:
from fairlearn.metrics import count

# The test_df columns at positions 4, 5 and 6 are the sensitive features for
# both metric frames below.
sensitive_cols = test_df.iloc[:, 4:7]

# Selection rate for every combination of the three columns. The model
# predictions are passed as both y_true and y_pred.
sr_check = MetricFrame(
    metrics=selection_rate,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=sensitive_cols,
)
display(sr_check.by_group)

# Number of applications behind each of those combinations.
counts = MetricFrame(
    metrics=count,
    y_true=predictions_test,
    y_pred=predictions_test,
    sensitive_features=sensitive_cols,
)
display(counts.by_group)

In [None]:
# display(dp_feature_bins_df.loc["NAME_CONTRACT_TYPE"])
# display(dp_feature_bins_df.loc["CODE_GENDER"])

### Comparison of Different Applications

In [None]:
# Show test_df_final, the table the application-distance cell below reads.
test_df_final

In [None]:
# Pairwise similarity between applications.
#
# applicant_ft: one row of feature values per application (from test_df_final).
# application_ids: the SK_ID_CURR of each application (from test_df); used only
# to label the rows and columns of the result.
# NOTE(review): assumes test_df_final and test_df list the applications in the
# same row order - confirm.

# L2 changed to L1

applicant_ft = np.array(test_df_final)

#A list of all application ids
application_ids = np.array(test_df["SK_ID_CURR"])

n_applications = len(application_ids)
assert applicant_ft.shape[0] == n_applications, (
    "test_df_final and test_df must contain the same number of applications"
)

# Despite the "l2" in the name (kept so the variable name stays stable), this
# matrix holds L1 distances: entry [i, j] is the L1 distance between the
# feature rows of application i and application j.
each_applicant_l2 = np.zeros((n_applications, n_applications))

for i in range(n_applications):
    # Row i of applicant_ft is application i. Looking the row up with
    # searchsorted on the ids only returns i when the ids are sorted ascending;
    # for unsorted ids it would silently pick a different application.
    each_applicant_l2[i] = np.linalg.norm(applicant_ft - applicant_ft[i], 1, axis=1)  # L_1

#Due to huge distances, we need to normalise our data to bring the distance between 0 to 1. We do this by formula : -
# # normalized_df=(df-df.min())/(df.max()-df.min())

distance_min = each_applicant_l2.min()
distance_range = each_applicant_l2.max() - distance_min
if distance_range > 0:
    normalized_each_applicant_l2 = (each_applicant_l2 - distance_min) / distance_range
else:
    # All distances are equal: avoid 0/0 and treat every pair as distance 0.
    normalized_each_applicant_l2 = np.zeros_like(each_applicant_l2)

# To convert distance measure to similarity measure, we use the formula :-
# Similarity = 1 / (difference + 1)
# Expressed in percent, so values lie between 50 (farthest pair) and 100 (identical).

similarity_matrix = 100 * (1 / (normalized_each_applicant_l2 + 1))

similarity_df = pd.DataFrame(data = similarity_matrix, columns = application_ids, index=application_ids)
similarity_df = similarity_df.round(2)
similarity_df


# similarity_df.to_csv(folder+"data/processed_data/Comparison_Different_Applications.csv")

### Comparison of Different Features

In [None]:
# Map the "_LE"-suffixed columns of test_df_final (presumably the
# label-encoded versions - confirm) back to the plain feature names.
le_column_names = {
    'CODE_GENDER_LE': 'CODE_GENDER',
    'NAME_CONTRACT_TYPE_LE': 'NAME_CONTRACT_TYPE',
    'FLAG_OWN_CAR_LE': "FLAG_OWN_CAR",
    'FLAG_OWN_REALTY_LE': 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE_LE': "NAME_TYPE_SUITE",
    'NAME_INCOME_TYPE_LE': "NAME_INCOME_TYPE",
    'NAME_EDUCATION_TYPE_LE': "NAME_EDUCATION_TYPE",
    'NAME_FAMILY_STATUS_LE': "NAME_FAMILY_STATUS",
    'NAME_HOUSING_TYPE_LE': "NAME_HOUSING_TYPE",
    'OCCUPATION_TYPE_LE': "OCCUPATION_TYPE",
    'WEEKDAY_APPR_PROCESS_START_LE': "WEEKDAY_APPR_PROCESS_START",
    'ORGANIZATION_TYPE_LE': "ORGANIZATION_TYPE",
}

# rename() returns a new frame, so test_df_final itself is left untouched.
# reindex() then selects and orders the columns like test_df's columns from
# position 4 onwards; a label missing after the rename becomes an all-NaN column.
test_df_features = (
    test_df_final
    .rename(columns=le_column_names)
    .reindex(columns=test_df.columns[4:])
)

# test_df_features.isnull().any()

In [None]:
# Show the renamed and re-ordered feature table.
test_df_features

In [None]:
# Per-feature similarity between every pair of applications.
# feature_similarity_tensor[i, j, k] = 100 / (|feature k of app i - feature k of app j| + 1),
# rounded to two decimals.
# NOTE(review): assumes test_df_features and test_df list the applications in
# the same row order - confirm.
applicant_ft = np.array(test_df_features)
application_ids = np.array(test_df["SK_ID_CURR"])

# Take the dimensions from the data instead of hard-coding them.
n_applications, n_features = applicant_ft.shape
assert n_applications == len(application_ids), (
    "test_df_features and test_df must contain the same number of applications"
)

feature_similarity_tensor = np.zeros((n_applications, n_applications, n_features))

for i in range(n_applications):
    # Row i of applicant_ft is application i. Looking the row up with
    # searchsorted on the ids only returns i when the ids are sorted ascending;
    # for unsorted ids it would silently pick a different application.
    ft_distances = np.abs(applicant_ft - applicant_ft[i])

    ft_similarity = 1/ (ft_distances + 1)

#     ft_similarity[ft_similarity == 0.5] = 0

    feature_similarity_tensor[i] = ft_similarity

# Scale to percent and round in place: the tensor is large, so this avoids
# allocating two more full-size copies.
feature_similarity_tensor *= 100
np.round(feature_similarity_tensor, 2, out=feature_similarity_tensor)

In [None]:
# feature_similarity_matrix

In [None]:
# pip install xarray

In [None]:
import xarray as xr

# Wrap the similarity tensor in a labelled array: both application axes are
# labelled with the application ids, the feature axis with the column names of
# test_df_features.
feature_similarities = xr.DataArray(
    data=feature_similarity_tensor,
    dims=['Application_1', 'Application_2', 'Features'],
    coords={
        'Application_1': application_ids,
        'Application_2': application_ids,
        'Features': test_df_features.columns,
    },
    name='Feature_similarities',
)

# Flatten to a table and move the innermost index level into the columns.
feature_similarities_long = feature_similarities.to_dataframe()
feature_similarities_df = feature_similarities_long.unstack()

In [None]:
# Show the per-feature similarity table built in the previous cell.
feature_similarities_df

In [None]:
# feature_similarities_df.to_csv(folder+"data/processed_data/Feature Similarities.csv")

In [None]:
# Same labelled array as before, but the feature axis is labelled with the
# "Display name" column of feature_Descriptions_Display instead of the raw
# column names. This overwrites feature_similarities and feature_similarities_df.
feature_similarities = xr.DataArray(
    data=feature_similarity_tensor,
    dims=['Application_1', 'Application_2', 'Features'],
    coords={
        'Application_1': application_ids,
        'Application_2': application_ids,
        'Features': feature_Descriptions_Display["Display name"].tolist(),
    },
    name='Feature_similarities',
)

# Flatten to a table and move the innermost index level into the columns.
feature_similarities_long = feature_similarities.to_dataframe()
feature_similarities_df = feature_similarities_long.unstack()

In [None]:
# Write the display-name version of the feature similarity table to disk.
feature_similarities_csv = folder + "data/processed_data/Feature Similarities_Converted.csv"
feature_similarities_df.to_csv(feature_similarities_csv)

### Feature Descriptions to Display

In [None]:
from docx import Document

# Load the first table of the Word document into a DataFrame. The table's
# first row supplies the column names; the remaining rows are the data.
document = Document("Feature Descriptions Display 15112022.docx")
table = document.tables[0]

data = []
for row in table.rows:
    data.append([cell.text for cell in row.cells])

header_row = data[0]
body_rows = data[1:]
feature_description = pd.DataFrame(body_rows, columns=header_row)

In [None]:
# Show the feature description table read from the Word document.
feature_description

In [None]:
# Write the feature description table to disk.
feature_description_csv = folder + "data/processed_data/Feature_Descriptions_Display.csv"
feature_description.to_csv(feature_description_csv)

In [None]:
# test_df.to_csv(folder+"data/processed_data/Model_Predictions.csv")