In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Names for the 21 fields of each record, in file order; the last one is the target.
cols = [
    'status',
    'duration',
    'credit_hist',
    'purpose',
    'credit_amt',
    'savings',
    'employment',
    'installment_rate',
    'personal_status',
    'debtors',
    'residencesince',
    'property',
    'age',
    'install_plans',
    'housing',
    'existing_credits',
    'job',
    'maintenance_paying_people',
    'telephone',
    'foreign_worker',
    'result',
]



In [3]:
# The file is space-delimited and is read without a header row, so the
# column names are supplied from `cols`.
df = pd.read_csv(
    'german.data',
    sep=" ",
    header=None,
    names=cols,
    index_col=False,
)

In [4]:
# Show the first five rows of the raw frame to check that the column names line up with the data.
df.head()

Unnamed: 0,status,duration,credit_hist,purpose,credit_amt,savings,employment,installment_rate,personal_status,debtors,...,property,age,install_plans,housing,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [5]:
def _bin_by_upper_edges(values, edges):
    """Return a copy of ``values`` where each entry is replaced by its bin index.

    Bin 0 is ``values <= edges[0]``, bin i is ``edges[i-1] < values <= edges[i]``
    and the last bin is ``values > edges[-1]``. All masks are computed from the
    untouched input, so the result does not depend on the order of assignment.
    Entries that match no mask (e.g. NaN) are left as they are.
    """
    binned = values.copy()
    lower = None
    for label, upper in enumerate(edges):
        if lower is None:
            in_bin = values <= upper
        else:
            in_bin = (values > lower) & (values <= upper)
        binned[in_bin] = label
        lower = upper
    binned[values > lower] = len(edges)
    return binned


def preprocess_german(df, encode_categoricals=False):
    """Encode the raw German-credit columns of ``df`` as small integers.

    ``df`` is modified in place (later cells rely on this) and, by default,
    the same object is returned.

    * The 'A..' codes of status, credit_hist, savings, employment, debtors,
      property, install_plans, job, telephone and foreign_worker are replaced
      by integers.
    * A new 'gender' column is derived from 'personal_status'
      (A91/A93/A94 -> 1, A92/A95 -> 0); 'personal_status' itself is kept.
    * 'credit_amt' is binned at 2000 / 5000 and 'duration' at 12 / 24 / 36.
    * 'age' becomes 1 for age >= 45 and 0 otherwise.
    * 'purpose' and 'housing' keep their raw codes in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding the columns listed above.
    encode_categoricals : bool, default False
        The original code called ``pd.get_dummies`` on 'purpose' and 'housing'
        but discarded the result. With True, that one-hot encoded frame
        (``drop_first=True``) is returned as a new object; ``df`` still only
        receives the in-place encodings. With False the behaviour is the same
        as before.

    Raises
    ------
    ValueError
        If a coded column contains a value missing from its mapping. This is
        also what happens when the function is run twice on the same frame.
        Codes are checked before anything is written, so ``df`` is left
        unchanged in that case.
    """
    # (source column, target column, code -> integer)
    code_maps = [
        ('status', 'status', {'A11': 0, 'A12': 1, 'A13': 2, 'A14': 3}),
        ('credit_hist', 'credit_hist', {'A34': 0, 'A33': 1, 'A32': 2, 'A31': 3, 'A30': 4}),
        ('savings', 'savings', {'A61': 0, 'A62': 1, 'A63': 2, 'A64': 3, 'A65': 4}),
        ('employment', 'employment', {'A71': 0, 'A72': 1, 'A73': 2, 'A74': 3, 'A75': 4}),
        ('personal_status', 'gender', {'A91': 1, 'A92': 0, 'A93': 1, 'A94': 1, 'A95': 0}),
        ('debtors', 'debtors', {'A101': 0, 'A102': 1, 'A103': 2}),
        ('property', 'property', {'A121': 3, 'A122': 2, 'A123': 1, 'A124': 0}),
        ('install_plans', 'install_plans', {'A141': 1, 'A142': 1, 'A143': 0}),
        ('job', 'job', {'A171': 0, 'A172': 1, 'A173': 2, 'A174': 3}),
        ('telephone', 'telephone', {'A191': 0, 'A192': 1}),
        ('foreign_worker', 'foreign_worker', {'A201': 1, 'A202': 0}),
    ]

    # Build every new column first so that a bad code cannot leave df half-encoded.
    new_columns = {}
    for source, target, codes in code_maps:
        mapped = df[source].map(codes)
        unmapped = mapped.isna()
        if unmapped.any():
            unexpected = sorted(set(df.loc[unmapped, source].astype(str)))
            raise ValueError(
                "Column '%s' contains values with no known encoding: %s "
                "(has this frame already been preprocessed?)" % (source, unexpected)
            )
        new_columns[target] = mapped.astype(int)

    new_columns['credit_amt'] = _bin_by_upper_edges(df['credit_amt'], [2000, 5000])
    new_columns['duration'] = _bin_by_upper_edges(df['duration'], [12, 24, 36])
    new_columns['age'] = (df['age'] >= 45).astype('int64')  # 1 if old, 0 if young

    # Existing columns keep their position; 'gender' is appended at the end.
    for name, values in new_columns.items():
        df[name] = values

    if encode_categoricals:
        return pd.get_dummies(df, columns=['purpose', 'housing'], drop_first=True)
    return df



In [6]:
# preprocess_german writes its encodings into df itself and returns that same frame,
# so X is another name for df rather than a separate copy. The trailing ';' suppresses the cell output.
X = preprocess_german(df);

In [7]:
# Re-inspect df: it was modified in place, so the integer encodings and the new 'gender' column
# show up here, while 'purpose', 'housing' and 'personal_status' still hold their raw codes.
df.head()

Unnamed: 0,status,duration,credit_hist,purpose,credit_amt,savings,employment,installment_rate,personal_status,debtors,...,age,install_plans,housing,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result,gender
0,0,0,0,A43,0,4,4,4,A93,0,...,1,0,A152,2,2,1,1,1,1,1
1,1,3,2,A43,2,0,2,2,A92,0,...,0,0,A152,1,2,1,0,1,2,0
2,3,0,0,A46,1,0,3,2,A93,0,...,1,0,A152,1,1,2,0,1,1,1
3,0,3,2,A42,2,0,3,2,A93,2,...,1,0,A153,1,2,2,0,1,1,1
4,0,1,1,A40,1,0,2,3,A93,0,...,1,0,A153,2,2,2,0,1,2,1


In [8]:
# 'purpose' and 'housing' still hold raw string codes, so they are left out of the model.
df = df.drop(columns=["purpose", "housing"])

In [9]:
# 'personal_status' has been summarised into the 'gender' column; drop the raw code.
df = df.drop(columns=["personal_status"])

In [10]:
# List the columns that remain after the drops.
df.columns

Index(['status', 'duration', 'credit_hist', 'credit_amt', 'savings',
       'employment', 'installment_rate', 'debtors', 'residencesince',
       'property', 'age', 'install_plans', 'existing_credits', 'job',
       'maintenance_paying_people', 'telephone', 'foreign_worker', 'result',
       'gender'],
      dtype='object')

In [11]:
# Model inputs: every remaining column except the target 'result'.
feature_columns = [
    'status', 'duration', 'credit_hist', 'credit_amt', 'savings',
    'employment', 'installment_rate', 'debtors', 'residencesince',
    'property', 'age', 'install_plans', 'existing_credits', 'job',
    'maintenance_paying_people', 'telephone', 'foreign_worker',
    'gender',
]
x = df.loc[:, feature_columns]

In [12]:
# Target kept as a one-column DataFrame (not a Series).
y = df.loc[:, ["result"]]

In [13]:
# groups_foreign_worker =  df.groupby(df['foreign_worker'])
# group_foreign_worker_0_df = groups_foreign_worker.get_group(0)

# rows_to_repeat1 = group_foreign_worker_0_df.iloc[[1]]
# rows_to_repeat2 = group_foreign_worker_0_df.iloc[[2]]
# rows_to_repeat3 = group_foreign_worker_0_df.iloc[[3]]
# rows_to_repeat4 = group_foreign_worker_0_df.iloc[[4]]
# rows_to_repeat5 = group_foreign_worker_0_df.iloc[[5]]

# df = pd.concat([rows_to_repeat5, rows_to_repeat1, rows_to_repeat2, rows_to_repeat3, rows_to_repeat4, df], ignore_index=True)


In [14]:
# groups_foreign_worker =  df.groupby(df['foreign_worker'])
# group_foreign_worker_0_df = groups_foreign_worker.get_group(0)

# rows_to_repeat1 = group_foreign_worker_0_df.iloc[[6]]
# rows_to_repeat2 = group_foreign_worker_0_df.iloc[[7]]
# rows_to_repeat3 = group_foreign_worker_0_df.iloc[[8]]
# rows_to_repeat4 = group_foreign_worker_0_df.iloc[[9]]
# rows_to_repeat5 = group_foreign_worker_0_df.iloc[[10]]

# df = pd.concat([rows_to_repeat5, rows_to_repeat1, rows_to_repeat2, rows_to_repeat3, rows_to_repeat4, df], ignore_index=True)


In [15]:
# groups_foreign_worker =  df.groupby(df['foreign_worker'])
# group_foreign_worker_0_df = groups_foreign_worker.get_group(0)

# rows_to_repeat1 = group_foreign_worker_0_df.iloc[[11]]
# rows_to_repeat2 = group_foreign_worker_0_df.iloc[[12]]
# rows_to_repeat3 = group_foreign_worker_0_df.iloc[[13]]
# rows_to_repeat4 = group_foreign_worker_0_df.iloc[[14]]
# rows_to_repeat5 = group_foreign_worker_0_df.iloc[[15]]

# df = pd.concat([rows_to_repeat5, rows_to_repeat1, rows_to_repeat2, rows_to_repeat3, rows_to_repeat4, df], ignore_index=True)


In [16]:
# groups_foreign_worker =  df.groupby(df['foreign_worker'])
# group_foreign_worker_0_df = groups_foreign_worker.get_group(0)

# rows_to_repeat1 = group_foreign_worker_0_df.iloc[[1]]
# rows_to_repeat2 = group_foreign_worker_0_df.iloc[[2]]
# rows_to_repeat3 = group_foreign_worker_0_df.iloc[[3]]
# rows_to_repeat4 = group_foreign_worker_0_df.iloc[[4]]
# rows_to_repeat5 = group_foreign_worker_0_df.iloc[[5]]

# df = pd.concat([rows_to_repeat5, rows_to_repeat1, rows_to_repeat2, rows_to_repeat3, rows_to_repeat4, df], ignore_index=True)

In [17]:
# new_rows1 = pd.DataFrame({
#     'status': [3],
#     'duration': [0],
#     'credit_hist': [0],
#     'credit_amt': [1],
#     'savings': [4],
#     'employment': [2],
#     'installment_rate': [2],
#     'debtors': [0],
#     'residencesince': [1],
#     'property': [1],
#     'age': [0],
#     'install_plans': [0],
#     'existing_credits': [2],
#     'job': [2],
#     'maintenance_paying_people': [1],
#     'telephone': [0],
#     'foreign_worker': [0],
#     'gender': [1]
# })

# new_rows2 = pd.DataFrame({
#     'status': [0],
#     'duration': [0],
#     'credit_hist': [0],
#     'credit_amt': [0],
#     'savings': [1],
#     'employment': [4],
#     'installment_rate': [3],
#     'debtors': [2],
#     'residencesince': [4],
#     'property': [3],
#     'age': [1],
#     'install_plans': [0],
#     'existing_credits': [2],
#     'job': [2],
#     'maintenance_paying_people': [2],
#     'telephone': [0],
#     'foreign_worker': [0],
#     'gender': [1]
# })

# new_rows3 = pd.DataFrame({
#     'status': [1],
#     'duration': [0],
#     'credit_hist': [0],
#     'credit_amt': [1],
#     'savings': [3],
#     'employment': [1],
#     'installment_rate': [2],
#     'debtors': [1],
#     'residencesince': [4],
#     'property': [2],
#     'age': [1],
#     'install_plans': [0],
#     'existing_credits': [2],
#     'job': [2],
#     'maintenance_paying_people': [1],
#     'telephone': [0],
#     'foreign_worker': [0],
#     'gender': [1]
# })


# new_rows4 = pd.DataFrame({
#     'status': [0],
#     'duration': [2],
#     'credit_hist': [1],
#     'credit_amt': [2],
#     'savings': [4],
#     'employment': [2],
#     'installment_rate': [1],
#     'debtors': [0],
#     'residencesince': [4],
#     'property': [3],
#     'age': [0],
#     'install_plans': [0],
#     'existing_credits': [1],
#     'job': [2],
#     'maintenance_paying_people': [1],
#     'telephone': [1],
#     'foreign_worker': [0],
#     'gender': [0]
# })


# new_rows5 = pd.DataFrame({
#     'status': [3],
#     'duration': [0],
#     'credit_hist': [0],
#     'credit_amt': [1],
#     'savings': [4],
#     'employment': [2],
#     'installment_rate': [2],
#     'debtors': [0],
#     'residencesince': [1],
#     'property': [1],
#     'age': [0],
#     'install_plans': [0],
#     'existing_credits': [2],
#     'job': [2],
#     'maintenance_paying_people': [1],
#     'telephone': [0],
#     'foreign_worker': [0],
#     'gender': [1]
# })


# new_rows6 = pd.DataFrame({
#     'status': [0],
#     'duration': [0],
#     'credit_hist': [0],
#     'credit_amt': [0],
#     'savings': [1],
#     'employment': [4],
#     'installment_rate': [3],
#     'debtors': [2],
#     'residencesince': [4],
#     'property': [3],
#     'age': [1],
#     'install_plans': [0],
#     'existing_credits': [2],
#     'job': [2],
#     'maintenance_paying_people': [2],
#     'telephone': [0],
#     'foreign_worker': [0],
#     'gender': [1]
# })

# # concatenate the original dataframe with the new rows
# df = pd.concat([df, new_rows1, new_rows2, new_rows3, new_rows4, new_rows5, new_rows6], ignore_index=True)


In [18]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=15,
)

In [19]:
# #Add data points to x_train and y_train

# groups_foreign_worker =  df.groupby(['foreign_worker', 'gender'])
# group_foreign_worker_0_df = groups_foreign_worker.get_group((0,0))

# y_foreign0 = group_foreign_worker_0_df[['result']]

# x_foreign0 = group_foreign_worker_0_df.drop(["result"], axis=1)

In [20]:
# Add data points to x_train and y_train: first collect the candidate rows.

# Rows of the full frame with foreign_worker == 0 and telephone == 1.
groups_foreign_worker = df.groupby(by=['foreign_worker', 'telephone'])
group_foreign_worker0_telephone1_df = groups_foreign_worker.get_group((0, 1))

# Split that group into features and target.
x_foreign0 = group_foreign_worker0_telephone1_df.drop(columns=["result"])
y_foreign0 = group_foreign_worker0_telephone1_df.loc[:, ['result']]

In [21]:
# Number of rows in the (foreign_worker == 0, telephone == 1) group.
len(group_foreign_worker0_telephone1_df)

5

In [22]:
# Add data points to x_train and y_train.
#
# Oversample the (foreign_worker == 0, telephone == 1) group by prepending copies
# of its rows to the training data. The positions index into the group; positions
# 1 and 2 appear twice, so those rows are added twice.
repeat_positions = [1, 2, 3, 4, 0, 1, 2]
extra_x = x_foreign0.iloc[repeat_positions]
extra_y = y_foreign0.iloc[repeat_positions]

# The group was taken from the full frame, so some of its rows may belong to the
# held-out split. Copying those into the training data would leak test rows into
# training, so only rows whose index label is not in x_test are duplicated.
# (x_test keeps the original row labels of df, which makes this check safe.)
not_held_out = ~extra_x.index.isin(x_test.index)
extra_x = extra_x.loc[not_held_out]
extra_y = extra_y.loc[not_held_out]

# NOTE: this cell is not idempotent -- running it again prepends the rows a second
# time. Re-run the train/test split cell before re-running this one.
x_train = pd.concat([extra_x, x_train], ignore_index=True)
y_train = pd.concat([extra_y, y_train], ignore_index=True)


In [23]:
#Add data points to x_train and y_train

# x6 = x_foreign0.iloc[[6]]
# x7 = x_foreign0.iloc[[7]]
# x8 = x_foreign0.iloc[[8]]
# x9 = x_foreign0.iloc[[9]]
# x10 = x_foreign0.iloc[[10]]

# y6 = y_foreign0.iloc[[6]]
# y7 = y_foreign0.iloc[[7]]
# y8 = y_foreign0.iloc[[8]]
# y9 = y_foreign0.iloc[[9]]
# y10 = y_foreign0.iloc[[10]]

# x16 = x_foreign0.iloc[[16]]
# x17 = x_foreign0.iloc[[17]]
# x18 = x_foreign0.iloc[[18]]
# x19 = x_foreign0.iloc[[19]]
# x20 = x_foreign0.iloc[[20]]

# y16 = y_foreign0.iloc[[16]]
# y17 = y_foreign0.iloc[[17]]
# y18 = y_foreign0.iloc[[18]]
# y19 = y_foreign0.iloc[[19]]
# y20 = y_foreign0.iloc[[20]]


# x_train = pd.concat([x6, x7, x8, x9, x10, x16, x17, x18, x19, x20, x_train], ignore_index=True)
# y_train = pd.concat([y6, y7, y8, y9, y10, y16, y17, y18, y19, y20, y_train], ignore_index=True)


In [24]:
# Check the training feature columns after the extra rows were prepended.
x_train.columns

Index(['status', 'duration', 'credit_hist', 'credit_amt', 'savings',
       'employment', 'installment_rate', 'debtors', 'residencesince',
       'property', 'age', 'install_plans', 'existing_credits', 'job',
       'maintenance_paying_people', 'telephone', 'foreign_worker', 'gender'],
      dtype='object')

In [25]:
# Look at df again; the split and the rows added to x_train / y_train do not reassign df.
df.head()


Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result,gender
0,0,0,0,0,4,4,4,0,4,3,1,0,2,2,1,1,1,1,1
1,1,3,2,2,0,2,2,0,2,3,0,0,1,2,1,0,1,2,0
2,3,0,0,1,0,3,2,0,3,3,1,0,1,1,2,0,1,1,1
3,0,3,2,2,0,3,2,2,4,2,1,0,1,2,2,0,1,1,1
4,0,1,1,1,0,2,3,0,4,0,1,0,2,2,2,0,1,2,1


In [26]:
# Row count of the full frame.
len(df)

1000

In [27]:
# Logistic-regression classifier created with scikit-learn's default settings (not fitted yet).
logistic_model = LogisticRegression()

In [28]:
# NOTE(review): exact duplicate of the previous cell -- it only replaces the unfitted
# estimator with another identical one, so this cell can be removed.
logistic_model = LogisticRegression()

In [29]:
# Peek at the target column; the first rows show the labels 1 and 2.
y.head()

Unnamed: 0,result
0,1
1,2
2,1
3,1
4,2


In [30]:
# Display the training features; the rows prepended by the oversampling cell come first.
x_train

Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,gender
0,3,0,2,0,0,2,1,0,4,2,0,0,1,2,1,1,0,1
1,0,1,2,2,0,4,1,0,4,2,1,0,1,3,1,1,0,1
2,3,0,2,0,0,2,2,0,2,3,0,0,1,2,1,1,0,1
3,1,3,4,2,0,2,1,0,2,2,0,1,1,3,1,1,0,0
4,0,1,2,2,4,1,1,0,2,2,0,0,1,2,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,1,3,2,2,0,2,2,0,2,1,0,0,1,2,1,1,1,1
703,3,2,1,2,4,1,4,0,4,1,0,0,2,2,1,1,1,1
704,3,1,1,1,0,1,1,0,2,2,1,0,2,1,1,0,1,0
705,2,1,0,0,4,2,3,0,4,0,0,0,2,2,1,1,1,1


In [31]:
# Fit on the training split; the single 'result' column is passed as a flat 1-D array.
logistic_model.fit(x_train, y_train["result"].to_numpy())


In [32]:
# Hard class predictions for the held-out rows.
y_pred = logistic_model.predict(x_test)

In [33]:
# Per-class probabilities for the held-out rows: one row per sample, one column per class.
y_pred_prob = logistic_model.predict_proba(x_test)

In [34]:
# Show the probability matrix (each row sums to 1).
y_pred_prob

array([[0.57668634, 0.42331366],
       [0.18772917, 0.81227083],
       [0.95364757, 0.04635243],
       [0.81934288, 0.18065712],
       [0.72783286, 0.27216714],
       [0.93335974, 0.06664026],
       [0.79989719, 0.20010281],
       [0.13504948, 0.86495052],
       [0.33146883, 0.66853117],
       [0.31547263, 0.68452737],
       [0.76390631, 0.23609369],
       [0.86627537, 0.13372463],
       [0.24471837, 0.75528163],
       [0.53903862, 0.46096138],
       [0.5732234 , 0.4267766 ],
       [0.88024094, 0.11975906],
       [0.88622602, 0.11377398],
       [0.58459435, 0.41540565],
       [0.82993607, 0.17006393],
       [0.95856005, 0.04143995],
       [0.29685584, 0.70314416],
       [0.86490901, 0.13509099],
       [0.50951285, 0.49048715],
       [0.59252263, 0.40747737],
       [0.89096156, 0.10903844],
       [0.92445902, 0.07554098],
       [0.82190707, 0.17809293],
       [0.88206116, 0.11793884],
       [0.94228278, 0.05771722],
       [0.71784154, 0.28215846],
       [0.

In [35]:
# Share of held-out rows whose predicted label equals the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7566666666666667

In [36]:
# Inspect the first held-out row; the Series name in the output is that row's index label.
x_test.iloc[0]

status                       0
duration                     1
credit_hist                  0
credit_amt                   0
savings                      0
employment                   4
installment_rate             4
debtors                      0
residencesince               3
property                     1
age                          0
install_plans                0
existing_credits             2
job                          2
maintenance_paying_people    1
telephone                    1
foreign_worker               1
gender                       1
Name: 825, dtype: int64

In [37]:
# Split the full frame by the binary age flag (1 = age >= 45, 0 = younger).
groups_age = df.groupby(by=df['age'])

In [38]:
# Rows with age == 1 (applicants aged 45 or older), treated as the protected group here.
protected_group_age_df = groups_age.get_group(1)

In [39]:
protected_group_age_df.iloc[1]
# Sanity check: the row shown has age == 1, as expected for the protected group.

status                       3
duration                     0
credit_hist                  0
credit_amt                   1
savings                      0
employment                   3
installment_rate             2
debtors                      0
residencesince               3
property                     3
age                          1
install_plans                0
existing_credits             1
job                          1
maintenance_paying_people    2
telephone                    0
foreign_worker               1
result                       1
gender                       1
Name: 2, dtype: int64

In [40]:
# Rows with age == 0 (applicants younger than 45), treated as the non-protected group here.
non_protected_group_age_df = groups_age.get_group(0)

In [41]:
non_protected_group_age_df.iloc[1]
# Sanity check: the row shown has age == 0, as expected for the non-protected group.

status                       3
duration                     2
credit_hist                  2
credit_amt                   2
savings                      4
employment                   2
installment_rate             2
debtors                      0
residencesince               4
property                     0
age                          0
install_plans                0
existing_credits             1
job                          1
maintenance_paying_people    2
telephone                    1
foreign_worker               1
result                       1
gender                       1
Name: 5, dtype: int64

In [42]:
# NOTE(review): repeats the earlier y_pred cell. The model is not refitted in between,
# so this recomputes the same predictions and the cell can be removed.
y_pred = logistic_model.predict(x_test)

In [43]:
# Show the predicted labels for the held-out rows.
y_pred

array([1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1,
       2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2,
       1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], d

In [44]:
# There are various fairness metrics defined in the 'Fairness Definitions Explained' paper based on the predicted probabilities and actual outcome

# 1. Test fairness -> P(Y=1|S=s, G=m) = P(Y=1|S=s, G=f), which can be used for age(protected and non-protected classes)
# 2. Well-calibration -> P(Y=1|S=s, G=m) = P(Y=1|S=s, G=f) = s, which can be used for age(protected and non-protected classes)
# 3. Balance for positive class -> E(S|Y=1,G=m) = E(S|Y=1,G=f), which can be used for age(protected and non-protected classes)
# 4. Balance for negative class -> E(S|Y=0,G=m) = E(S|Y=0,G=f), which can be used for age(protected and non-protected classes)

In [45]:
# Now, we need to find the P(ŷ = 1 | y = 1, G = 0) and P(ŷ = 1 | y = 1, G = 1) as G is the sensitive attribute here

In [46]:
# Find the average y_pred_proba where the actual outcome Y = 1 for the divided groups 

In [47]:
# First rows of the non-protected (age == 0) group, still including the 'result' column.
non_protected_group_age_df.head()

Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result,gender
1,1,3,2,2,0,2,2,0,2,3,0,0,1,2,1,0,1,2,0
5,3,2,2,2,4,2,2,0,4,0,0,0,1,1,2,1,1,1,1
7,1,2,2,2,0,2,2,0,2,1,0,0,1,3,1,1,1,1,1
9,1,2,0,2,0,0,4,0,2,1,0,0,2,3,1,0,1,2,1
10,1,0,2,0,0,1,3,0,1,1,0,0,1,2,1,0,1,2,0


In [48]:
# Keep only the non-protected applicants whose actual outcome is result == 1.
non_protected_by_result = non_protected_group_age_df.groupby(by=non_protected_group_age_df['result'])
non_protected_group_age_credresult_1_df = non_protected_by_result.get_group(1)

In [49]:
# First rows of the non-protected group restricted to result == 1.
non_protected_group_age_credresult_1_df.head()

Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result,gender
5,3,2,2,2,4,2,2,0,4,0,0,0,1,1,2,1,1,1,1
7,1,2,2,2,0,2,2,0,2,1,0,0,1,3,1,1,1,1,1
12,1,0,2,0,0,2,1,0,1,1,0,0,1,2,1,1,1,1,0
14,0,1,2,0,0,2,2,0,4,1,0,0,1,2,1,0,1,1,0
17,0,2,4,2,4,1,2,0,3,1,0,1,3,2,1,0,1,1,1


In [50]:
# The model was fitted on the 18 feature columns only, so the target column
# has to be removed before these rows are passed to it.
non_protected_group_age_credresult_1_df = non_protected_group_age_credresult_1_df.drop(columns=["result"])

In [51]:
# Predict hard class labels (not probabilities) for these rows. Every row here has an
# actual result of 1 and age == 0, so the share of predicted 1s is P(ŷ = 1 | y = 1, G = 0).
y_pred_non_protected_group_age_credresult_1 = logistic_model.predict(non_protected_group_age_credresult_1_df)

In [52]:
# Display the feature rows that were just scored (the 'result' column is gone).
non_protected_group_age_credresult_1_df

Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,gender
5,3,2,2,2,4,2,2,0,4,0,0,0,1,1,2,1,1,1
7,1,2,2,2,0,2,2,0,2,1,0,0,1,3,1,1,1,1
12,1,0,2,0,0,2,1,0,1,1,0,0,1,2,1,1,1,0
14,0,1,2,0,0,2,2,0,4,1,0,0,1,2,1,0,1,0
17,0,2,4,2,4,1,2,0,3,1,0,1,3,2,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0,2,2,1,0,0,4,0,3,2,0,0,1,3,1,1,1,1
995,3,0,2,0,0,3,3,0,4,3,0,0,1,1,1,0,1,0
996,0,2,2,1,0,2,4,0,4,2,0,0,1,3,1,1,1,1
997,3,0,2,0,0,4,4,0,4,1,0,0,1,2,1,0,1,1


In [53]:
# Show the predicted labels for the non-protected, result == 1 rows.
y_pred_non_protected_group_age_credresult_1

array([1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [54]:
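# Share of predicted label 1, i.e. P(ŷ = 1 | y = 1, G = 0), came out to be 0.879781 for the non-protected group (a rate over hard predictions, not an average predicted probability)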

In [55]:
# First rows of the protected (age == 1) group, still including the 'result' column.
protected_group_age_df.head()

Unnamed: 0,status,duration,credit_hist,credit_amt,savings,employment,installment_rate,debtors,residencesince,property,age,install_plans,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result,gender
0,0,0,0,0,4,4,4,0,4,3,1,0,2,2,1,1,1,1,1
2,3,0,0,1,0,3,2,0,3,3,1,0,1,1,2,0,1,1,1
3,0,3,2,2,0,3,2,2,4,2,1,0,1,2,2,0,1,1,1
4,0,1,1,1,0,2,3,0,4,0,1,0,2,2,2,0,1,2,1
6,3,1,2,1,2,4,3,0,4,2,1,0,1,2,1,0,1,1,1


In [56]:
# Keep only the protected applicants whose actual outcome is result == 1.
protected_by_result = protected_group_age_df.groupby(by=protected_group_age_df['result'])
protected_group_age_credresult_1_df = protected_by_result.get_group(1)

In [57]:
# The model was fitted on the 18 feature columns only, so the target column
# has to be removed before these rows are passed to it.
protected_group_age_credresult_1_df = protected_group_age_credresult_1_df.drop(columns=["result"])

In [58]:
# Predict hard class labels (not probabilities) for these rows. Every row here has an
# actual result of 1 and age == 1, so the share of predicted 1s is P(ŷ = 1 | y = 1, G = 1).
y_pred_protected_group_age_credresult_1 = logistic_model.predict(protected_group_age_credresult_1_df)

In [59]:
# Show the predicted labels for the protected, result == 1 rows.
y_pred_protected_group_age_credresult_1

array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [60]:
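# Share of predicted label 1, i.e. P(ŷ = 1 | y = 1, G = 1), came out to be 0.92053 for the protected group (139 of the 151 predictions above; a rate over hard predictions, not an average predicted probability)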

In [61]:
# Compute the class probabilities for the same rows so that they can be averaged below.
# NOTE: this overwrites the hard-label predictions stored under the same name earlier.
y_pred_protected_group_age_credresult_1 = logistic_model.predict_proba(protected_group_age_credresult_1_df)

In [62]:
# Show the probability matrix for the protected, result == 1 rows.
y_pred_protected_group_age_credresult_1

array([[0.87145086, 0.12854914],
       [0.95753651, 0.04246349],
       [0.38348983, 0.61651017],
       [0.91755495, 0.08244505],
       [0.96777551, 0.03222449],
       [0.9589204 , 0.0410796 ],
       [0.89815473, 0.10184527],
       [0.78659616, 0.21340384],
       [0.97925811, 0.02074189],
       [0.93823975, 0.06176025],
       [0.81319224, 0.18680776],
       [0.98259003, 0.01740997],
       [0.50623729, 0.49376271],
       [0.58965585, 0.41034415],
       [0.81280799, 0.18719201],
       [0.87793041, 0.12206959],
       [0.61637216, 0.38362784],
       [0.64899125, 0.35100875],
       [0.94667193, 0.05332807],
       [0.73270868, 0.26729132],
       [0.86627537, 0.13372463],
       [0.97002591, 0.02997409],
       [0.949423  , 0.050577  ],
       [0.8769073 , 0.1230927 ],
       [0.64748399, 0.35251601],
       [0.92983736, 0.07016264],
       [0.94464376, 0.05535624],
       [0.87261735, 0.12738265],
       [0.41367916, 0.58632084],
       [0.54551592, 0.45448408],
       [0.

In [63]:
# Class order used by the model; the predict_proba columns follow this order,
# so column 0 is the probability of class 1 and column 1 that of class 2.
logistic_model.classes_

array([1, 2], dtype=int64)

In [64]:
# Mean predicted probability of class 1 (column 0 of predict_proba) over the
# protected applicants whose actual result is 1.
y_pred_protected_group_age_credresult_1[:, 0].mean()

0.7990745747156986

In [65]:
# Class probabilities for the non-protected, result == 1 rows.
# NOTE: this overwrites the hard-label predictions stored under the same name earlier.
y_pred_non_protected_group_age_credresult_1 = logistic_model.predict_proba(non_protected_group_age_credresult_1_df)

In [66]:
# Mean predicted probability of class 1 (column 0 of predict_proba) over the
# non-protected applicants whose actual result is 1.
y_pred_non_protected_group_age_credresult_1[:, 0].mean()

0.7584885338793296