In [None]:
# !pip install aif360
# !pip install fairlearn
# !pip install 'aif360[LawSchoolGPA]'
# !pip install BlackBoxAuditing

### Loading and preparing the dataset

In [None]:
# First, read-in the data and check for null values
import numpy as np
import pandas as pd
import aif360
from aif360.algorithms.preprocessing import DisparateImpactRemover
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Turn off pandas' chained-assignment (SettingWithCopy) warning; its default is 'warn'.
pd.options.mode.chained_assignment = None  # default='warn', silencing Setting With Copy warning
# NOTE(review): absolute, Colab-style path; the file must exist at /content for this cell to run.
df = pd.read_csv('/content/credit_risk.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,Y
977,LP002975,Male,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,Y
978,LP002980,Male,No,0,Graduate,No,3250,1993.0,126.0,360.0,,Semiurban,Y
979,LP002986,Male,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,N


In [None]:
# Show dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            981 non-null    object 
 1   Gender             957 non-null    object 
 2   Married            978 non-null    object 
 3   Dependents         956 non-null    object 
 4   Education          981 non-null    object 
 5   Self_Employed      926 non-null    object 
 6   ApplicantIncome    981 non-null    int64  
 7   CoapplicantIncome  981 non-null    float64
 8   LoanAmount         954 non-null    float64
 9   Loan_Amount_Term   961 non-null    float64
 10  Credit_History     902 non-null    float64
 11  Property_Area      981 non-null    object 
 12  Loan_Status        981 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 99.8+ KB


In [None]:
# Keep only rows where every column has a value (dropna's defaults: rows, 'any').
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 769 entries, 1 to 980
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            769 non-null    object 
 1   Gender             769 non-null    object 
 2   Married            769 non-null    object 
 3   Dependents         769 non-null    object 
 4   Education          769 non-null    object 
 5   Self_Employed      769 non-null    object 
 6   ApplicantIncome    769 non-null    int64  
 7   CoapplicantIncome  769 non-null    float64
 8   LoanAmount         769 non-null    float64
 9   Loan_Amount_Term   769 non-null    float64
 10  Credit_History     769 non-null    float64
 11  Property_Area      769 non-null    object 
 12  Loan_Status        769 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 84.1+ KB


I then want to check to see the breakdown of values for the outcome variable, `Loan_Status`.

In [None]:
# Count how many rows fall into each Loan_Status class to see the class balance.
target_counts = df['Loan_Status'].value_counts()
target_counts

Y    561
N    208
Name: Loan_Status, dtype: int64

In [None]:
# Loan_ID is only a row identifier, so remove it from the frame.
df = df.drop(columns=['Loan_ID'])

### Encode categorical variables

In [None]:
# Encode the two binary string columns as integers:
#   Gender:      Male -> 1, Female -> 0
#   Loan_Status: Y    -> 1, N      -> 0
# map() + astype(int) gives the columns a real integer dtype (row-wise .loc
# assignment leaves them as dtype `object`). Any value outside the mapping
# becomes NaN, so astype(int) raises instead of letting an unencoded string
# slip through to the model.
df = df.assign(
    Gender=df['Gender'].map({'Male': 1, 'Female': 0}).astype(int),
    Loan_Status=df['Loan_Status'].map({'Y': 1, 'N': 0}).astype(int),
)
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,1,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,1,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,1,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
5,1,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...
975,1,Yes,1,Graduate,No,2269,2167.0,99.0,360.0,1.0,Semiurban,1
976,1,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,1
977,1,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,1
979,1,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,0


In [None]:
# Target vector: the encoded Loan_Status column (1 = approved, 0 = not approved).
y = df['Loan_Status']
y

1      0
2      1
3      1
4      1
5      1
      ..
975    1
976    1
977    1
979    0
980    1
Name: Loan_Status, Length: 769, dtype: object

In [None]:
# One-hot encode the remaining categorical columns. Each listed column is
# replaced by indicator columns named "<column>_<value>", appended after the
# untouched columns in the order given here.
categoricalFeatures = ['Property_Area', 'Married', 'Dependents', 'Education', 'Self_Employed']
df = pd.get_dummies(df, columns=categoricalFeatures)
df

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes
1,1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,1,0,1,0,0,1,0,1,0
2,1,3000,0.0,66.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,1,0,0,1
3,1,2583,2358.0,120.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,0,1,1,0
4,1,6000,0.0,141.0,360.0,1.0,1,0,0,1,1,0,1,0,0,0,1,0,1,0
5,1,5417,4196.0,267.0,360.0,1.0,1,0,0,1,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,1,2269,2167.0,99.0,360.0,1.0,1,0,1,0,0,1,0,1,0,0,1,0,1,0
976,1,4009,1777.0,113.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,0,1
977,1,4158,709.0,115.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,1,0,1,0
979,1,5000,2393.0,158.0,360.0,1.0,0,1,0,0,0,1,1,0,0,0,1,0,1,0


### Separate dataset by x and y

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the label.
x = df.drop(columns=['Loan_Status'])
# Snapshot of the fully encoded frame (label included) for the de-biasing sections.
encoded_df = df.copy()

### Create Test and Train splits

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): data_std is not referenced again in the visible cells, and the split
# below uses the unscaled `x`, so the model is trained on raw feature values.
# If scaling is intended, fit the scaler on the training split only to avoid leakage.
data_std = scaler.fit_transform(x)
# We will follow an 80-20 split pattern for our training and test data, respectively
# (random_state = 0 fixes the shuffle, so the split is the same on every run).
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state = 0)

### Calculating actual disparate impact on testing values from original dataset
Disparate impact is defined as the rate of favorable outcomes for the unprivileged group divided by the rate of favorable outcomes for the privileged group: $DI = P(\text{favorable} \mid \text{unprivileged}) / P(\text{favorable} \mid \text{privileged})$.
The acceptable range is 0.8 to 1.25: a value below 0.8 indicates that the privileged group is favored, and a value above 1.25 indicates that the unprivileged group is favored.

In [None]:
# Copy of the test features with the true label attached (aligned on the row index).
actual_test = x_test.assign(Loan_Status_Actual=y_test)
actual_test.shape

(154, 20)

In [None]:
# Split the labelled test rows by the protected attribute and count each group.
# Gender == 1 (male) is the privileged group; Gender == 0 (female) is the unprivileged group.
male_df = actual_test.loc[actual_test['Gender'] == 1]
female_df = actual_test.loc[actual_test['Gender'] == 0]
num_of_priviliged = len(male_df)
num_of_unpriviliged = len(female_df)

In [None]:
# Favorable-outcome rate of the unprivileged group, based on the true labels.
unpriviliged_outcomes = int((female_df['Loan_Status_Actual'] == 1).sum())
unpriviliged_ratio = unpriviliged_outcomes / num_of_unpriviliged
unpriviliged_ratio

0.6

In [None]:
# Favorable-outcome rate of the privileged group, based on the true labels.
priviliged_outcomes = int((male_df['Loan_Status_Actual'] == 1).sum())
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7226890756302521

In [None]:
# Disparate impact of the ground-truth labels on the test split:
# (favorable rate of the unprivileged group) / (favorable rate of the privileged group).
# `disparate_ratio` starts a running list of disparate-impact values.
disparate_ratio = []
disparate_impact = unpriviliged_ratio / priviliged_ratio
# Both rates were computed from Loan_Status_Actual, so the printout is labelled
# "Actual" rather than "Predicted".
print("Disparate Impact, Sex vs. Actual Loan Status: " + str(disparate_impact))
disparate_ratio.append(disparate_impact)

Disparate Impact, Sex vs. Predicted Loan Status: 0.8302325581395349


### Training a model on the original dataset

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with class weights balanced by
# class frequency, solved with liblinear (a good fit for small datasets like this one).
model = LogisticRegression(class_weight='balanced', solver='liblinear')


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# y_train already holds 0/1 values (encoded earlier); fit_transform here returns
# them as a plain integer NumPy array.
y_train_encoded = label_encoder.fit_transform(y_train)


In [None]:
# Display the encoded training labels.
y_train_encoded


array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,

In [None]:
# From here on y_train is the integer array produced by the LabelEncoder.
y_train = y_train_encoded

In [None]:
# Aliases for the training data (no copy is made).
# NOTE(review): neither alias is referenced in the visible cells — confirm they are still needed.
x_train_final = x_train
y_train_final = y_train

In [None]:
# Fit the baseline model on the (unscaled) training features.
model.fit(x_train, y_train)

### Evaluating performance

In [None]:
# Put the true labels and the baseline predictions side by side and peek at the first rows.
# Both series get a fresh 0..n-1 index so they line up row by row.
y_test = y_test.reset_index(drop=True)
y_pred = pd.Series(model.predict(x_test))
z = pd.DataFrame({'True': y_test, 'Prediction': y_pred})
z.head()

Unnamed: 0,True,Prediction
0,1,1
1,1,1
2,0,0
3,0,0
4,0,1


In [None]:
# Convert predictions and ground truth to plain integer arrays for sklearn.metrics.
# The values are already 0/1, so a direct cast is enough. Re-fitting the
# LabelEncoder here (fit_transform) would renumber classes from whatever values
# happen to be present: an all-1 prediction vector, for example, would come back as all 0.
y_pred_encoded = np.asarray(y_pred).astype(int)
y_pred = y_pred_encoded
y_test_encoded = np.asarray(y_test).astype(int)
y_test = y_test_encoded

In [None]:
# Running histories of each metric, one entry per trained model.
# The generator yields four separate list objects.
Accuracy, Recall, Precision, Dir = ([] for _ in range(4))

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
# Report accuracy, precision and recall of the baseline model and record each
# value in its history list.
for label, scorer, history in (
    ("Accuracy:", metrics.accuracy_score, Accuracy),
    ("Precision:", metrics.precision_score, Precision),
    ("Recall:", metrics.recall_score, Recall),
):
    score = scorer(y_test, y_pred)
    print(label, score)
    history.append(score)

Accuracy: 0.8116883116883117
Precision: 0.875
Recall: 0.8504672897196262


### Calculating disparate impact on predicted values by model trained on original dataset

In [None]:
# Attach the baseline predictions to the test features for the fairness metrics.
# Predict from the training columns only: after the first run x_test also carries
# 'Loan_Status_Predicted', which the model was not trained on, so predicting from
# the whole frame would fail when this cell is re-run.
y_pred = model.predict(x_test[x_train.columns])
x_test['Loan_Status_Predicted'] = y_pred
# original_output is the same object as x_test (no copy).
original_output = x_test
original_output

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Loan_Status_Predicted
840,1,2553,1768.0,102.0,360.0,1.0,0,0,1,0,1,1,0,0,0,1,0,1,0,1
159,1,4583,5625.0,255.0,360.0,1.0,0,1,0,0,1,1,0,0,0,1,0,1,0,1
148,0,10000,1666.0,225.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,0,0
17,0,3510,0.0,76.0,360.0,0.0,0,0,1,1,0,1,0,0,0,1,0,1,0,0
808,1,10000,2690.0,412.0,360.0,1.0,0,1,0,0,1,0,1,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,1,8072,240.0,253.0,360.0,1.0,0,0,1,0,1,0,1,0,0,1,0,1,0,1
471,1,2653,1500.0,113.0,180.0,0.0,1,0,0,0,1,0,1,0,0,0,1,1,0,0
291,1,4400,0.0,127.0,360.0,0.0,0,1,0,0,1,0,0,1,0,1,0,1,0,0
797,0,4000,3917.0,173.0,360.0,1.0,1,0,0,0,1,0,1,0,0,1,0,1,0,0


In [None]:

# Split the baseline predictions by the protected attribute and count each group.
# Privileged group: males (Gender == 1). Unprivileged group: females (Gender == 0).
male_df = original_output.loc[original_output['Gender'] == 1]
female_df = original_output.loc[original_output['Gender'] == 0]
num_of_priviliged = len(male_df)
num_of_unpriviliged = len(female_df)

In [None]:
# Share of unprivileged applicants the baseline model predicts as approved.
unpriviliged_outcomes = int((female_df['Loan_Status_Predicted'] == 1).sum())
unpriviliged_ratio = unpriviliged_outcomes / num_of_unpriviliged
unpriviliged_ratio

0.4857142857142857

In [None]:
# Share of privileged applicants the baseline model predicts as approved.
priviliged_outcomes = int((male_df['Loan_Status_Predicted'] == 1).sum())
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7310924369747899

In [None]:
# Disparate impact of the baseline predictions: unprivileged rate over privileged rate.
disparate_impact = unpriviliged_ratio / priviliged_ratio
print(f"Disparate Impact, Sex vs. Predicted Loan Status: {disparate_impact}")
Dir.append(disparate_impact)

Disparate Impact, Sex vs. Predicted Loan Status: 0.664367816091954


### Applying the Disparate Impact Remover to the dataset

In [None]:
# The de-biasing steps start from encoded_df, the copy of the one-hot-encoded
# frame (label included) taken before the baseline split.
encoded_df

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes
1,1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,1,0,1,0,0,1,0,1,0
2,1,3000,0.0,66.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,1,0,0,1
3,1,2583,2358.0,120.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,0,1,1,0
4,1,6000,0.0,141.0,360.0,1.0,1,0,0,1,1,0,1,0,0,0,1,0,1,0
5,1,5417,4196.0,267.0,360.0,1.0,1,0,0,1,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,1,2269,2167.0,99.0,360.0,1.0,1,0,1,0,0,1,0,1,0,0,1,0,1,0
976,1,4009,1777.0,113.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,0,1
977,1,4158,709.0,115.0,360.0,1.0,1,0,0,1,0,1,1,0,0,0,1,0,1,0
979,1,5000,2393.0,158.0,360.0,1.0,0,1,0,0,0,1,1,0,0,0,1,0,1,0


## **AIF360 with Repair level 1.0**

In [None]:
import aif360
from aif360.algorithms.preprocessing import DisparateImpactRemover
# Wrap the encoded frame in an AIF360 dataset: Loan_Status is the label
# (1 = favorable, 0 = unfavorable) and Gender is the protected attribute.
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=encoded_df,
    label_names=['Loan_Status'],
    protected_attribute_names=['Gender'])
# repair_level = 1.0 requests the strongest repair.
di = DisparateImpactRemover(repair_level = 1.0)
# NOTE(review): the remover runs on the whole dataset before the train/test split, so
# despite its name `dataset_transf_train` also contains the rows later used for testing.
dataset_transf_train = di.fit_transform(binaryLabelDataset)
# Element 0 of convert_to_dataframe()'s result is the DataFrame.
transformed = dataset_transf_train.convert_to_dataframe()[0]
transformed

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Loan_Status
1,1.0,3958.0,1483.0,108.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,2600.0,0.0,59.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,1.0,2241.0,2333.0,102.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,1.0,4723.0,0.0,115.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,1.0,4402.0,3683.0,189.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,1.0,2101.0,2183.0,79.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
976,1.0,3719.0,1762.0,94.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
977,1.0,3762.0,717.0,95.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
979,1.0,4230.0,2333.0,130.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


# Train a model using the dataset that underwent the pre-processing

In [None]:
# Features and label of the repaired dataset. Note that `y` is rebound here
# to the repaired frame's Loan_Status column.
x_trans = transformed.drop(['Loan_Status'], axis = 1)
y = transformed['Loan_Status']
# Liblinear is a solver that is effective for relatively smaller datasets.
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scaler = StandardScaler()
# NOTE(review): data_std is not used afterwards; the split below takes the unscaled x_trans.
data_std = scaler.fit_transform(x_trans)
# Splitting into test and training
# We will follow an 80-20 split pattern for our training and test data
# (same test_size and random_state as the baseline split).
x_trans_train,x_trans_test,y_trans_train,y_trans_test = train_test_split(x_trans, y, test_size=0.2, random_state = 0)

In [None]:
# Check the size and dtype of the repaired training labels.
y_trans_train.info()

<class 'pandas.core.series.Series'>
Index: 615 entries, 615 to 867
Series name: Loan_Status
Non-Null Count  Dtype  
--------------  -----  
615 non-null    float64
dtypes: float64(1)
memory usage: 9.6+ KB


In [None]:
# Fit the logistic regression on the repaired training split.
model.fit(x_trans_train, y_trans_train)

### Evaluating performance

In [None]:
# Compare the first few true labels with the predictions of the model trained
# on repaired data. Both series are re-indexed from 0 so they line up.
y_trans_test = y_trans_test.reset_index(drop=True)
y_trans_pred = pd.Series(model.predict(x_trans_test))
z = pd.DataFrame({'True': y_trans_test, 'Prediction': y_trans_pred})
z.head()
# Again, it predicts 4/5 correctly in this sample

Unnamed: 0,True,Prediction
0,1.0,1.0
1,1.0,1.0
2,0.0,0.0
3,0.0,0.0
4,0.0,1.0


In [None]:
# Score the repaired-data model against the labels of its own test split
# (y_trans_test), not against `y_test` left over from the baseline section.
# Each metric is computed once, printed, then recorded in its history list.
trans_accuracy = metrics.accuracy_score(y_trans_test, y_trans_pred)
trans_precision = metrics.precision_score(y_trans_test, y_trans_pred)
trans_recall = metrics.recall_score(y_trans_test, y_trans_pred)
print("Accuracy:", trans_accuracy)
print("Precision:", trans_precision)
print("Recall:", trans_recall)
Accuracy.append(trans_accuracy)
Precision.append(trans_precision)
Recall.append(trans_recall)

Accuracy: 0.8246753246753247
Precision: 0.8846153846153846
Recall: 0.8598130841121495


### Calculating disparate impact on predicted values by model trained on transformed dataset

In [None]:
# Attach the predictions to the repaired test features for the fairness metrics.
# Predict from the training columns only: after the first run x_trans_test also
# carries 'Loan_Status_Predicted', so predicting from the whole frame would fail
# when this cell is re-run.
y_trans_pred = model.predict(x_trans_test[x_trans_train.columns])
x_trans_test['Loan_Status_Predicted'] = y_trans_pred
# transformed_output is the same object as x_trans_test (no copy).
transformed_output = x_trans_test
transformed_output

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Loan_Status_Predicted
840,1.0,2226.0,1742.0,81.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
159,1.0,3958.0,5105.0,185.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
148,0.0,9504.0,1646.0,225.0,300.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
17,0.0,3510.0,0.0,76.0,300.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
808,1.0,10000.0,2541.0,300.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,1.0,7600.0,0.0,182.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
471,1.0,2330.0,1483.0,94.0,180.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
291,1.0,3846.0,0.0,105.0,360.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
797,0.0,4000.0,3917.0,173.0,300.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


Disparate impact is defined as the rate of favorable outcomes for the unprivileged group divided by the rate of favorable outcomes for the privileged group. The acceptable range is 0.8 to 1.25: a value below 0.8 indicates that the privileged group is favored, and a value above 1.25 indicates that the unprivileged group is favored.

In [None]:
# Split the repaired-model predictions by the protected attribute and count each group.
# Privileged group: males (Gender == 1). Unprivileged group: females (Gender == 0).
male_df = transformed_output.loc[transformed_output['Gender'] == 1]
female_df = transformed_output.loc[transformed_output['Gender'] == 0]
num_of_priviliged = len(male_df)
num_of_unpriviliged = len(female_df)

In [None]:
# Predicted-approval rate of the unprivileged group.
unpriviliged_outcomes = int((female_df['Loan_Status_Predicted'] == 1).sum())
unpriviliged_ratio = unpriviliged_outcomes / num_of_unpriviliged
unpriviliged_ratio

0.5142857142857142

In [None]:
# Predicted-approval rate of the privileged group.
priviliged_outcomes = int((male_df['Loan_Status_Predicted'] == 1).sum())
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7226890756302521

In [None]:
# Disparate impact of the repaired-data model: unprivileged rate over privileged rate.
disparate_impact = unpriviliged_ratio / priviliged_ratio
print(f"Disparate Impact, Sex vs. Predicted Loan Status: {disparate_impact}")
Dir.append(disparate_impact)

Disparate Impact, Sex vs. Predicted Loan Status: 0.7116279069767442


## **AIF360 with Repair level 0.5**

In [None]:
import aif360
from aif360.algorithms.preprocessing import DisparateImpactRemover
# Same pipeline as the repair-level-1.0 section, this time with a partial repair (0.5).
# Loan_Status is the label (1 = favorable, 0 = unfavorable); Gender is the protected attribute.
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=encoded_df,
    label_names=['Loan_Status'],
    protected_attribute_names=['Gender'])
di = DisparateImpactRemover(repair_level = 0.5)
# NOTE(review): the repair runs on the full dataset, before the train/test split.
dataset_transf_train = di.fit_transform(binaryLabelDataset)
transformed = dataset_transf_train.convert_to_dataframe()[0]

x_trans = transformed.drop(['Loan_Status'], axis = 1)
y = transformed['Loan_Status']
# Liblinear is a solver that is effective for relatively smaller datasets.
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scaler = StandardScaler()
# NOTE(review): data_std is not used afterwards; the model is fit on the unscaled x_trans.
data_std = scaler.fit_transform(x_trans)
# Splitting into test and training
# We will follow an 80-20 split pattern for our training and test data
x_trans_train,x_trans_test,y_trans_train,y_trans_test = train_test_split(x_trans, y, test_size=0.2, random_state = 0)

model.fit(x_trans_train, y_trans_train)

# True labels and predictions side by side, both indexed from 0.
y_trans_pred = pd.Series(model.predict(x_trans_test))
y_trans_test = y_trans_test.reset_index(drop=True)
z = pd.concat([y_trans_test, y_trans_pred], axis=1)
z.columns = ['True', 'Prediction']

# Score against this split's own labels (y_trans_test) rather than the baseline
# section's `y_test`; each metric is computed once, printed, then recorded.
trans_accuracy = metrics.accuracy_score(y_trans_test, y_trans_pred)
trans_precision = metrics.precision_score(y_trans_test, y_trans_pred)
trans_recall = metrics.recall_score(y_trans_test, y_trans_pred)
print("Accuracy:", trans_accuracy)
print("Precision:", trans_precision)
print("Recall:", trans_recall)

Accuracy.append(trans_accuracy)
Precision.append(trans_precision)
Recall.append(trans_recall)

Accuracy: 0.8311688311688312
Precision: 0.8857142857142857
Recall: 0.8691588785046729


In [None]:
# Attach the predictions to the repaired test features. Predict from the training
# columns only, so the cell still works when re-run after 'Loan_Status_Predicted'
# has been added to x_trans_test.
y_trans_pred = model.predict(x_trans_test[x_trans_train.columns])
x_trans_test['Loan_Status_Predicted'] = y_trans_pred
transformed_output = x_trans_test

# Privileged group: males (Gender == 1). Unprivileged group: females (Gender == 0).
male_df = transformed_output[transformed_output['Gender'] == 1]
num_of_priviliged = male_df.shape[0]
female_df = transformed_output[transformed_output['Gender'] == 0]
num_of_unpriviliged = female_df.shape[0]

# Predicted-approval rate of the unprivileged group.
unpriviliged_outcomes = female_df[female_df['Loan_Status_Predicted'] == 1].shape[0]
unpriviliged_ratio = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio

0.5142857142857142

In [None]:
# Predicted-approval rate of the privileged group.
priviliged_outcomes = int((male_df['Loan_Status_Predicted'] == 1).sum())
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7310924369747899

In [None]:
# Calculating disparate impact (unprivileged rate over privileged rate)
disparate_impact = unpriviliged_ratio / priviliged_ratio
print("Disparate Impact, Sex vs. Predicted Loan Status: " + str(disparate_impact))
Dir.append(disparate_impact)
# NOTE(review): the baseline and repair-level-1.0 sections append only to Dir, not to
# disparate_ratio, so the two lists are not parallel — confirm which one is meant to be used.
disparate_ratio.append(disparate_impact)

Disparate Impact, Sex vs. Predicted Loan Status: 0.7034482758620689


# **Fairlearn Algorithm**

In [None]:
from fairlearn.preprocessing import CorrelationRemover
import pandas as pd
# Work on a copy of the encoded data with a fresh 0..n-1 index, so it lines up
# positionally with the array returned by the remover. reset_index(drop=True)
# adapts to the actual row count instead of hard-coding 769.
data = encoded_df.reset_index(drop=True)
# Gender is the sensitive column; the other four are the features to decorrelate from it.
selected_columns = ['Gender', 'Married_No', 'ApplicantIncome', 'Self_Employed_Yes',  'LoanAmount']

X = data[selected_columns].copy()

# Fit the remover and transform the same frame. The result is a NumPy array
# holding the four non-sensitive columns, in the order listed above.
cr = CorrelationRemover(sensitive_feature_ids=['Gender'])
cr.fit(X)
X_transform = cr.transform(X)
X_transform

array([[ 7.35595679e-02,  4.46648420e+03, -5.57458904e-03,
         1.24772761e+02],
       [ 7.35595679e-02,  2.88348420e+03,  9.94425411e-01,
         6.27727610e+01],
       [ 7.35595679e-02,  2.46648420e+03, -5.57458904e-03,
         1.16772761e+02],
       ...,
       [ 7.35595679e-02,  4.04148420e+03, -5.57458904e-03,
         1.11772761e+02],
       [ 7.35595679e-02,  4.88348420e+03, -5.57458904e-03,
         1.54772761e+02],
       [ 1.07355957e+00,  9.08348420e+03,  9.94425411e-01,
         9.47727610e+01]])

In [None]:
# Wrap the remover's array output in a DataFrame; its columns get integer labels.
X_transform_df = pd.DataFrame(X_transform)

# Set the original Gender column aside for the fairness evaluation, then drop
# every column that went through the remover from the working frame.
gender_save = data['Gender']
data_new_indexed = data.drop(columns=selected_columns)
data_new_indexed


Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No
0,1508.0,360.0,1.0,0,1,0,0,1,0,1,0,0,1,0,1
1,0.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,0
2,2358.0,360.0,1.0,1,0,0,1,1,1,0,0,0,0,1,1
3,0.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1
4,4196.0,360.0,1.0,1,0,0,1,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2167.0,360.0,1.0,1,0,1,0,1,0,1,0,0,1,0,1
765,1777.0,360.0,1.0,1,0,0,1,1,0,0,0,1,0,1,0
766,709.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,1
767,2393.0,360.0,1.0,0,1,0,0,1,1,0,0,0,1,0,1


In [None]:
# Display the saved Gender column (1 = male, 0 = female).
gender_save

0      1
1      1
2      1
3      1
4      1
      ..
764    1
765    1
766    1
767    1
768    1
Name: Gender, Length: 769, dtype: object

In [None]:
# Put the untouched columns and the remover's output side by side.
# Both frames are indexed 0..768, so the rows line up; the transformed columns
# still carry the integer labels 0-3 at this point.

transformed_noname = pd.concat([data_new_indexed, X_transform_df], axis = 1)
transformed_noname

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,0,1,2,3
0,1508.0,360.0,1.0,0,1,0,0,1,0,1,0,0,1,0,1,0.07356,4466.484195,-0.005575,124.772761
1,0.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,0,0.07356,2883.484195,0.994425,62.772761
2,2358.0,360.0,1.0,1,0,0,1,1,1,0,0,0,0,1,1,0.07356,2466.484195,-0.005575,116.772761
3,0.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,1.07356,5883.484195,-0.005575,137.772761
4,4196.0,360.0,1.0,1,0,0,1,1,0,0,1,0,1,0,0,0.07356,5300.484195,0.994425,263.772761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2167.0,360.0,1.0,1,0,1,0,1,0,1,0,0,1,0,1,0.07356,2152.484195,-0.005575,95.772761
765,1777.0,360.0,1.0,1,0,0,1,1,0,0,0,1,0,1,0,0.07356,3892.484195,0.994425,109.772761
766,709.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,1,0.07356,4041.484195,-0.005575,111.772761
767,2393.0,360.0,1.0,0,1,0,0,1,1,0,0,0,1,0,1,0.07356,4883.484195,-0.005575,154.772761


In [None]:
# Give the remover's output columns their original names back (same order as
# the non-sensitive entries of selected_columns).
transformed = transformed_noname.rename(columns={
    0: 'Married_No',
    1: 'ApplicantIncome',
    2: 'Self_Employed_Yes',
    3: 'LoanAmount',
})

# Features are everything except the label; Gender is no longer among them.
x_trans = transformed.drop(columns=['Loan_Status'])
y = transformed['Loan_Status']
transformed


Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount
0,1508.0,360.0,1.0,0,1,0,0,1,0,1,0,0,1,0,1,0.07356,4466.484195,-0.005575,124.772761
1,0.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,0,0.07356,2883.484195,0.994425,62.772761
2,2358.0,360.0,1.0,1,0,0,1,1,1,0,0,0,0,1,1,0.07356,2466.484195,-0.005575,116.772761
3,0.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,1.07356,5883.484195,-0.005575,137.772761
4,4196.0,360.0,1.0,1,0,0,1,1,0,0,1,0,1,0,0,0.07356,5300.484195,0.994425,263.772761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2167.0,360.0,1.0,1,0,1,0,1,0,1,0,0,1,0,1,0.07356,2152.484195,-0.005575,95.772761
765,1777.0,360.0,1.0,1,0,0,1,1,0,0,0,1,0,1,0,0.07356,3892.484195,0.994425,109.772761
766,709.0,360.0,1.0,1,0,0,1,1,1,0,0,0,1,0,1,0.07356,4041.484195,-0.005575,111.772761
767,2393.0,360.0,1.0,0,1,0,0,1,1,0,0,0,1,0,1,0.07356,4883.484195,-0.005575,154.772761


In [None]:

# Liblinear is a solver that is effective for relatively smaller datasets.
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scaler = StandardScaler()
# NOTE(review): data_std is not used afterwards; the model is fit on the unscaled x_trans.
data_std = scaler.fit_transform(x_trans)

# The split into training and test data (80-20) happens in the next cell.
# x_trans no longer contains the Gender column at this point.


In [None]:
# Cast the labels to float, confirm the dtype, then make the 80-20 split
# with the same seed as the earlier sections.
y = y.astype(float)
y.info()
x_trans_train, x_trans_test, y_trans_train, y_trans_test = train_test_split(
    x_trans, y, random_state=0, test_size=0.2
)

<class 'pandas.core.series.Series'>
RangeIndex: 769 entries, 0 to 768
Series name: Loan_Status
Non-Null Count  Dtype  
--------------  -----  
769 non-null    float64
dtypes: float64(1)
memory usage: 6.1 KB


In [None]:
# Fit the logistic regression on the correlation-removed training split.
model.fit(x_trans_train, y_trans_train)

In [None]:
# Compare the first few true labels with the predictions of the model trained
# on correlation-removed data. Both series are re-indexed from 0 so they line up.
y_trans_test = y_trans_test.reset_index(drop=True)
y_trans_pred = pd.Series(model.predict(x_trans_test))
z = pd.DataFrame({'True': y_trans_test, 'Prediction': y_trans_pred})
z.head()
# Again, it predicts 4/5 correctly in this sample

Unnamed: 0,True,Prediction
0,1.0,1.0
1,1.0,1.0
2,0.0,0.0
3,0.0,0.0
4,0.0,1.0


In [None]:
# Score the Fairlearn-preprocessed model against the labels of its own test
# split (y_trans_test), not against `y_test` left over from the baseline section.
# Each metric is computed once, printed, then recorded in its history list.
trans_accuracy = metrics.accuracy_score(y_trans_test, y_trans_pred)
trans_precision = metrics.precision_score(y_trans_test, y_trans_pred)
trans_recall = metrics.recall_score(y_trans_test, y_trans_pred)
print("Accuracy:", trans_accuracy)
print("Precision:", trans_precision)
print("Recall:", trans_recall)
Accuracy.append(trans_accuracy)
Precision.append(trans_precision)
Recall.append(trans_recall)

Accuracy: 0.8246753246753247
Precision: 0.8773584905660378
Recall: 0.8691588785046729


In [None]:
# Keep references to the labels and predictions for the (commented-out) ROC plot.
# NOTE(review): y_test is the baseline section's label array, not y_trans_test, and
# y_test1 / y_pred1 used by that plot are not defined in the visible cells.
y_test2 = y_test
y_pred2 = y_trans_pred


In [None]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc

# # Compute false positive rate (fpr), true positive rate (tpr), and thresholds
# fpr, tpr, thresholds = roc_curve(y_test1, y_pred1)

# # Compute the area under the curve (AUC)
# roc_auc = auc(fpr, tpr)

# # Compute false positive rate (fpr), true positive rate (tpr), and thresholds
# fpr2, tpr2, thresholds2 = roc_curve(y_test2, y_pred2)

# # Compute the area under the curve (AUC)
# roc_auc2 = auc(fpr2, tpr2)

# # Plot the ROC curve
# plt.plot(fpr2, tpr2, color='green', label=f'AUC for Fairlearn = {roc_auc2:.2f}')
# plt.plot(fpr, tpr, color='blue', label=f'AUC = {roc_auc:.2f}')
# plt.plot([0, 1], [0, 1], color='red', linestyle='--')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc='lower right')

# plt.show()

In [None]:
# Attach the predictions to the test features for the fairness metrics.
# Predict from the training columns only, so the cell still works when re-run
# after 'Loan_Status_Predicted' has been added to x_trans_test.
y_trans_pred = model.predict(x_trans_test[x_trans_train.columns])
x_trans_test['Loan_Status_Predicted'] = y_trans_pred
# transformed_output is the same object as x_trans_test (no copy).
transformed_output = x_trans_test
transformed_output

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount,Loan_Status_Predicted
662,1768.0,360.0,1.0,0,0,1,1,1,0,0,0,1,0,1,0.07356,2436.484195,-0.005575,98.772761,1.0
122,5625.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0.07356,4466.484195,-0.005575,251.772761,1.0
113,1666.0,360.0,1.0,1,0,0,0,1,0,0,0,1,0,1,0.68344,10501.419739,0.023990,238.888256,0.0
14,0.0,360.0,0.0,0,0,1,0,1,0,0,0,1,0,1,0.68344,4011.419739,0.023990,89.888256,0.0
634,2690.0,360.0,1.0,0,1,0,1,0,1,0,0,1,0,1,0.07356,9883.484195,-0.005575,408.772761,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,240.0,360.0,1.0,0,0,1,1,0,1,0,0,1,0,1,0.07356,7955.484195,-0.005575,249.772761,1.0
367,1500.0,180.0,0.0,1,0,0,1,0,1,0,0,0,1,1,0.07356,2536.484195,-0.005575,109.772761,0.0
231,0.0,360.0,0.0,0,1,0,1,0,0,1,0,1,0,1,0.07356,4283.484195,-0.005575,123.772761,0.0
627,3917.0,360.0,1.0,1,0,0,1,0,1,0,0,1,0,1,-0.31656,4501.419739,0.023990,186.888256,0.0


In [None]:
# Re-attach the protected attribute by index.
# A left join keeps exactly the rows already present in transformed_output
# (the test split); an outer concat would also add an all-NaN row for every
# index that exists only in gender_save and upcast the integer columns to float.
transformed_output = transformed_output.join(gender_save)
transformed_output

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount,Loan_Status_Predicted,Gender
662,1768.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,2436.484195,-0.005575,98.772761,1.0,1
122,5625.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,4466.484195,-0.005575,251.772761,1.0,1
113,1666.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,10501.419739,0.023990,238.888256,0.0,0
14,0.0,360.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,4011.419739,0.023990,89.888256,0.0,0
634,2690.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.07356,9883.484195,-0.005575,408.772761,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,,,,,,,,,,,,,,,,,,,,1
765,,,,,,,,,,,,,,,,,,,,1
766,,,,,,,,,,,,,,,,,,,,1
767,,,,,,,,,,,,,,,,,,,,1


In [None]:
# Keep only rows with a complete set of values (features, prediction and Gender).
# dropna() returns a new frame, so the result has to be assigned to take effect.
transformed_output = transformed_output.dropna(how='any', axis=0)
transformed_output

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount,Loan_Status_Predicted,Gender
662,1768.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,2436.484195,-0.005575,98.772761,1.0,1
122,5625.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,4466.484195,-0.005575,251.772761,1.0,1
113,1666.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,10501.419739,0.023990,238.888256,0.0,0
14,0.0,360.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,4011.419739,0.023990,89.888256,0.0,0
634,2690.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.07356,9883.484195,-0.005575,408.772761,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,240.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.07356,7955.484195,-0.005575,249.772761,1.0,1
367,1500.0,180.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.07356,2536.484195,-0.005575,109.772761,0.0,1
231,0.0,360.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.07356,4283.484195,-0.005575,123.772761,0.0,1
627,3917.0,360.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.31656,4501.419739,0.023990,186.888256,0.0,0


In [None]:
# Partition the scored test rows by the protected attribute:
# Gender == 1 is the privileged group, Gender == 0 the unprivileged group.
female_df = transformed_output.loc[transformed_output['Gender'] == 0]
male_df = transformed_output.loc[transformed_output['Gender'] == 1]
num_of_unpriviliged = len(female_df)
num_of_priviliged = len(male_df)

In [None]:
# Fraction of the unprivileged group whose predicted loan status is 1
unpriviliged_outcomes = len(female_df.loc[female_df['Loan_Status_Predicted'] == 1])
unpriviliged_ratio = unpriviliged_outcomes / num_of_unpriviliged
unpriviliged_ratio

0.5714285714285714

In [None]:
# Fraction of the privileged group whose predicted loan status is 1
priviliged_outcomes = len(male_df.loc[male_df['Loan_Status_Predicted'] == 1])
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7226890756302521

In [None]:
# Disparate impact = unprivileged positive rate / privileged positive rate.
# The value is also recorded in Dir for the final summary table.
disparate_impact = unpriviliged_ratio / priviliged_ratio
Dir.append(disparate_impact)
print(f"Disparate Impact, Sex vs. Predicted Loan Status: {disparate_impact}")


Disparate Impact, Sex vs. Predicted Loan Status: 0.7906976744186046


# **AIF360 + Fairlearn**


In [None]:
# Wrap the encoded frame in an AIF360 dataset: Loan_Status is the label
# (1 = favorable, 0 = unfavorable) and Gender is the protected attribute.
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=encoded_df,
    label_names=['Loan_Status'],
    protected_attribute_names=['Gender'])

# Apply the disparate-impact repair at full strength.
di = DisparateImpactRemover(repair_level = 1.0)
dataset_transf_train = di.fit_transform(binaryLabelDataset)

# Back to pandas, with a fresh 0..n-1 integer index. reset_index(drop=True)
# adapts to the actual number of rows instead of relying on a hard-coded
# row count, which would raise as soon as the dataset size changes.
transformed = dataset_transf_train.convert_to_dataframe()[0]
transformed = transformed.reset_index(drop=True)
data = transformed
data

Unnamed: 0,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Loan_Status
0,1.0,3958.0,1483.0,108.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,2600.0,0.0,59.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,1.0,2241.0,2333.0,102.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,1.0,4723.0,0.0,115.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,4402.0,3683.0,189.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,1.0,2101.0,2183.0,79.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
765,1.0,3719.0,1762.0,94.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
766,1.0,3762.0,717.0,95.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
767,1.0,4230.0,2333.0,130.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
# Columns handed to the CorrelationRemover: the sensitive feature itself plus
# the features to be decorrelated from it.
selected_columns = ['Gender', 'Married_No', 'ApplicantIncome', 'Self_Employed_Yes',  'LoanAmount']

X = data[selected_columns].copy()

# NOTE(review): the remover is fitted on the full frame, i.e. before the
# train/test split performed later in this section -- confirm this is intended.
cr = CorrelationRemover(sensitive_feature_ids=['Gender'])
cr.fit(X)

# The transformed array no longer contains the sensitive column, so it has one
# column fewer than X (see the displayed output).
X_transform = cr.transform(X)
X_transform

array([[ 7.35595679e-02,  3.98337882e+03, -5.57458904e-03,
         1.09968855e+02],
       [ 7.35595679e-02,  2.62537882e+03,  9.94425411e-01,
         6.09688552e+01],
       [ 7.35595679e-02,  2.26637882e+03, -5.57458904e-03,
         1.03968855e+02],
       ...,
       [ 7.35595679e-02,  3.78737882e+03, -5.57458904e-03,
         9.69688552e+01],
       [ 7.35595679e-02,  4.25537882e+03, -5.57458904e-03,
         1.31968855e+02],
       [ 1.07355957e+00,  8.64937882e+03,  9.94425411e-01,
         8.09688552e+01]])

In [None]:
# Wrap the decorrelated array in a DataFrame (columns get positional labels).
X_transform_df = pd.DataFrame(data=X_transform)

# Keep the protected attribute aside for the fairness evaluation, then remove
# it together with the columns that are replaced by their decorrelated versions.
gender_save = data.loc[:, 'Gender']
data_new_indexed = data.drop(
    columns=[
        'Gender',
        'Married_No',
        'ApplicantIncome',
        'Self_Employed_Yes',
        'LoanAmount',
    ]
)
data_new_indexed


Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Loan_Status
0,1483.0,360.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2333.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,0.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,3683.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2183.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
765,1762.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
766,717.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
767,2333.0,360.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Put the untouched columns and the decorrelated columns side by side,
# aligned on the row index.
transformed_noname = pd.concat(
    objs=[data_new_indexed, X_transform_df],
    axis='columns',
)
transformed_noname

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Loan_Status,0,1,2,3
0,1483.0,360.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.07356,3983.378816,-0.005575,109.968855
1,0.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.07356,2625.378816,0.994425,60.968855
2,2333.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.07356,2266.378816,-0.005575,103.968855
3,0.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.07356,4748.378816,-0.005575,116.968855
4,3683.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.07356,4427.378816,0.994425,190.968855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2183.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.07356,2126.378816,-0.005575,80.968855
765,1762.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.07356,3744.378816,0.994425,95.968855
766,717.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.07356,3787.378816,-0.005575,96.968855
767,2333.0,360.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.07356,4255.378816,-0.005575,131.968855


In [None]:
# Give the positionally-labelled decorrelated columns (0..3) their names back.
transformed = transformed_noname.rename(
    columns=dict(enumerate(
        ['Married_No', 'ApplicantIncome', 'Self_Employed_Yes', 'LoanAmount']
    ))
)

# Separate the features from the target column.
x_trans = transformed.drop(columns=['Loan_Status'])
y = transformed['Loan_Status']
transformed


Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Loan_Status,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount
0,1483.0,360.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.07356,3983.378816,-0.005575,109.968855
1,0.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.07356,2625.378816,0.994425,60.968855
2,2333.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.07356,2266.378816,-0.005575,103.968855
3,0.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.07356,4748.378816,-0.005575,116.968855
4,3683.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.07356,4427.378816,0.994425,190.968855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,2183.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.07356,2126.378816,-0.005575,80.968855
765,1762.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.07356,3744.378816,0.994425,95.968855
766,717.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.07356,3787.378816,-0.005575,96.968855
767,2333.0,360.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.07356,4255.378816,-0.005575,131.968855


In [None]:
# Liblinear is a solver that is effective for relatively smaller datasets.
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scaler = StandardScaler()
# NOTE(review): data_std is not referenced again in this section -- the model
# below is fitted on the unscaled x_trans_train. Confirm whether the scaled
# data was meant to be used (and, if so, fit the scaler on the training split only).
data_std = scaler.fit_transform(x_trans)

# Splitting into test and training (done in the next cell)
# We will follow an 80-20 split pattern for our training and test data


In [None]:
# Make sure the target is float64 and show a summary of it.
y = y.astype(np.float64)
y.info()

# 80/20 train/test split with a fixed seed for reproducibility.
x_trans_train, x_trans_test, y_trans_train, y_trans_test = train_test_split(
    x_trans,
    y,
    test_size=0.2,
    random_state=0,
)

<class 'pandas.core.series.Series'>
RangeIndex: 769 entries, 0 to 768
Series name: Loan_Status
Non-Null Count  Dtype  
--------------  -----  
769 non-null    float64
dtypes: float64(1)
memory usage: 6.1 KB


In [None]:
# Fit the logistic regression on the training split.
model.fit(X=x_trans_train, y=y_trans_train)

In [None]:
# Quick sanity check: true labels next to predictions for the first few test rows.
y_trans_pred = pd.Series(model.predict(x_trans_test))
# Drop the original row labels so both series share a 0..n-1 index and line up.
y_trans_test = y_trans_test.reset_index(drop=True)
z = pd.DataFrame({'True': y_trans_test, 'Prediction': y_trans_pred})
# Again, it predicts 4/5 correctly in this sample
z.head()

Unnamed: 0,True,Prediction
0,1.0,1.0
1,1.0,1.0
2,0.0,0.0
3,0.0,0.0
4,0.0,1.0


In [None]:
# Score this section's model on this section's own held-out split.
# The values stored for the summary table must come from y_trans_test /
# y_trans_pred; appending metrics computed from `y_pred` would record another
# section's predictions under this method's name.
trans_accuracy = metrics.accuracy_score(y_trans_test, y_trans_pred)
trans_precision = metrics.precision_score(y_trans_test, y_trans_pred)
trans_recall = metrics.recall_score(y_trans_test, y_trans_pred)

print("Accuracy:", trans_accuracy)
print("Precision:", trans_precision)
print("Recall:", trans_recall)

# Record exactly the values that were printed.
Accuracy.append(trans_accuracy)
Precision.append(trans_precision)
Recall.append(trans_recall)

Accuracy: 0.8311688311688312
Precision: 0.8857142857142857
Recall: 0.8691588785046729


In [None]:
# Aliases used by the (currently commented-out) ROC comparison plot.
y_test2, y_pred2 = y_test, y_trans_pred


In [None]:
# Predict on the held-out features and attach the predictions to a *new* frame.
# Using .assign() leaves x_trans_test untouched, so this cell can be re-run:
# adding the column in place would hand model.predict() an extra, unseen
# feature column on the second run.
y_trans_pred = model.predict(x_trans_test)
transformed_output = x_trans_test.assign(Loan_Status_Predicted=y_trans_pred)
transformed_output

Unnamed: 0,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Married_No,ApplicantIncome,Self_Employed_Yes,LoanAmount,Loan_Status_Predicted
662,1742.0,360.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,2251.378816,-0.005575,82.968855,1.0
122,5105.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.07356,3983.378816,-0.005575,186.968855,1.0
113,1646.0,300.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,9394.783579,0.023990,216.527133,0.0
14,0.0,300.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.68344,3400.783579,0.023990,67.527133,0.0
634,2541.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.07356,10025.378816,-0.005575,301.968855,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,0.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.07356,7625.378816,-0.005575,183.968855,1.0
367,1483.0,180.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.07356,2355.378816,-0.005575,95.968855,0.0
231,0.0,360.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.07356,3871.378816,-0.005575,106.968855,0.0
627,3917.0,300.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.31656,3890.783579,0.023990,164.527133,1.0


In [None]:
# Re-attach the protected attribute by index. A left join keeps exactly the
# test-split rows; an outer concat would also add an all-NaN row for every
# index that exists only in gender_save and upcast integer columns to float.
transformed_output = transformed_output.join(gender_save)

In [None]:
# Keep only rows with a complete set of values (features, prediction and Gender).
# dropna() returns a new frame, so the result has to be assigned to take effect.
transformed_output = transformed_output.dropna(how='any', axis=0)

In [None]:
# Partition the scored test rows by the protected attribute:
# Gender == 1 is the privileged group, Gender == 0 the unprivileged group.
female_df = transformed_output.loc[transformed_output['Gender'] == 0]
male_df = transformed_output.loc[transformed_output['Gender'] == 1]
num_of_unpriviliged = len(female_df)
num_of_priviliged = len(male_df)

In [None]:
# Fraction of the unprivileged group whose predicted loan status is 1
unpriviliged_outcomes = len(female_df.loc[female_df['Loan_Status_Predicted'] == 1])
unpriviliged_ratio = unpriviliged_outcomes / num_of_unpriviliged
unpriviliged_ratio

0.5714285714285714

In [None]:
# Fraction of the privileged group whose predicted loan status is 1
priviliged_outcomes = len(male_df.loc[male_df['Loan_Status_Predicted'] == 1])
priviliged_ratio = priviliged_outcomes / num_of_priviliged
priviliged_ratio

0.7142857142857143

In [None]:
# Disparate impact = unprivileged positive rate / privileged positive rate.
# The value is also recorded in Dir for the final summary table.
disparate_impact = unpriviliged_ratio / priviliged_ratio
Dir.append(disparate_impact)
print(f"Disparate Impact, Sex vs. Predicted Loan Status: {disparate_impact}")


Disparate Impact, Sex vs. Predicted Loan Status: 0.7999999999999999


# **Final Results**

In [None]:
from prettytable import PrettyTable

# One label per experiment, in the order in which the experiments appended
# their scores to Accuracy / Precision / Recall / Dir.
columns = ['Without pre-processing', 'AIF360 with repair level 1.0', 'AIF360 with repair level 0.5', 'Fairlearn', 'AIF360 + FairLearn']

table = PrettyTable()
table.field_names = ["Method", "Accuracy", "Precision", "Recall", "Disparate Impact"]

# The loop variable is called `row` on purpose: naming it `data` would
# overwrite the `data` DataFrame used by earlier cells and break re-running them.
# Note that zip() stops at the shortest list, so a missing score silently
# drops the corresponding row.
for row in zip(columns, Accuracy, Precision, Recall, Dir):
    table.add_row(row)
print(Accuracy)
print(table)
# NOTE(review): this reference value is hard-coded rather than computed here --
# confirm it still matches the current data.
print("The disparate impact ratio on actual test data: 0.8302325581395349")

[0.8116883116883117, 0.8246753246753247, 0.8311688311688312, 0.8246753246753247, 0.8311688311688312]
+------------------------------+--------------------+--------------------+--------------------+--------------------+
|            Method            |      Accuracy      |     Precision      |       Recall       |  Disparity Impact  |
+------------------------------+--------------------+--------------------+--------------------+--------------------+
|    Without pre-processing    | 0.8116883116883117 |       0.875        | 0.8504672897196262 | 0.664367816091954  |
| AIF360 with repair level 1.0 | 0.8246753246753247 | 0.8846153846153846 | 0.8598130841121495 | 0.7116279069767442 |
| AIF360 with repair level 0.5 | 0.8311688311688312 | 0.8857142857142857 | 0.8691588785046729 | 0.7034482758620689 |
|          Fairlearn           | 0.8246753246753247 | 0.8773584905660378 | 0.8691588785046729 | 0.7906976744186046 |
|      AIF360 + FairLearn      | 0.8311688311688312 | 0.8857142857142857 | 0.869