In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Column labels supplied as `names=` when the data file is read; `result` is the last field.
cols = [
    'status',
    'duration',
    'credit_hist',
    'purpose',
    'credit_amt',
    'savings',
    'employment',
    'installment_rate',
    'personal_status',
    'debtors',
    'residencesince',
    'property',
    'age',
    'install_plans',
    'housing',
    'existing_credits',
    'job',
    'maintenance_paying_people',
    'telephone',
    'foreign_worker',
    'result',
]

In [3]:
# Read the space-separated data file, labelling its fields with `cols` and
# keeping the default integer row index (index_col=False).
df = pd.read_csv('german.data', sep=" ", names=cols, index_col=False)

In [4]:
df.head()

Unnamed: 0,status,duration,credit_hist,purpose,credit_amt,savings,employment,installment_rate,personal_status,debtors,...,property,age,install_plans,housing,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [5]:
def preprocess_german(df, encode_nominal=False):
    """Encode the German-credit columns of ``df`` as small integers, in place.

    What is changed on ``df``:

    * ``status``, ``credit_hist``, ``savings``, ``employment``, ``debtors``,
      ``property``, ``install_plans``, ``job``, ``telephone`` and
      ``foreign_worker`` are replaced by the integer codes in the lookup
      tables below.
    * ``gender`` is added as a new column derived from ``personal_status``
      (A91/A93/A94 -> 1, A92/A95 -> 0); ``personal_status`` itself is kept.
    * ``credit_amt`` is binned: <= 2000 -> 0, (2000, 5000] -> 1, > 5000 -> 2.
    * ``duration`` is binned: <= 12 -> 0, (12, 24] -> 1, (24, 36] -> 2, > 36 -> 3.
    * ``age`` becomes 1 when >= 45 and 0 otherwise.

    ``purpose`` and ``housing`` are left untouched on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed above. It is mutated in place.
    encode_nominal : bool, default False
        When True, return a *new* frame in which ``purpose`` and ``housing``
        are one-hot encoded (first level dropped). ``df`` itself still keeps
        the original two columns. The default returns ``df`` unchanged in
        this respect, which is what the original function did because it
        discarded the ``pd.get_dummies`` result.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself (default) or the one-hot-encoded copy.

    Raises
    ------
    ValueError
        If any coded column holds a value that has no entry in its lookup
        table (this includes missing values and a frame that was already
        preprocessed). The check runs before anything is modified, so a
        failing call leaves ``df`` untouched.
    """
    # (target column, source column, code -> integer)
    code_maps = [
        ('status', 'status', {'A11': 0, 'A12': 1, 'A13': 2, 'A14': 3}),
        ('credit_hist', 'credit_hist', {'A34': 0, 'A33': 1, 'A32': 2, 'A31': 3, 'A30': 4}),
        ('savings', 'savings', {'A61': 0, 'A62': 1, 'A63': 2, 'A64': 3, 'A65': 4}),
        ('employment', 'employment', {'A71': 0, 'A72': 1, 'A73': 2, 'A74': 3, 'A75': 4}),
        ('gender', 'personal_status', {'A91': 1, 'A92': 0, 'A93': 1, 'A94': 1, 'A95': 0}),
        ('debtors', 'debtors', {'A101': 0, 'A102': 1, 'A103': 2}),
        ('property', 'property', {'A121': 3, 'A122': 2, 'A123': 1, 'A124': 0}),
        ('install_plans', 'install_plans', {'A141': 1, 'A142': 1, 'A143': 0}),
        ('job', 'job', {'A171': 0, 'A172': 1, 'A173': 2, 'A174': 3}),
        ('telephone', 'telephone', {'A191': 0, 'A192': 1}),
        ('foreign_worker', 'foreign_worker', {'A201': 1, 'A202': 0}),
    ]

    # Validate every coded column up front so a bad value cannot leave the
    # frame half-encoded, and so the error names the offending column.
    for _, source, mapping in code_maps:
        unknown = df.loc[~df[source].isin(list(mapping)), source]
        if not unknown.empty:
            raise ValueError(
                "column '%s' contains values with no mapping: %s "
                "(was the frame already preprocessed?)"
                % (source, unknown.unique().tolist())
            )

    # Bin against a snapshot of the raw values so the result does not depend
    # on the order of the assignments below.
    amount = df['credit_amt'].copy()
    df.loc[amount <= 2000, 'credit_amt'] = 0
    df.loc[(amount > 2000) & (amount <= 5000), 'credit_amt'] = 1
    df.loc[amount > 5000, 'credit_amt'] = 2

    months = df['duration'].copy()
    df.loc[months <= 12, 'duration'] = 0
    df.loc[(months > 12) & (months <= 24), 'duration'] = 1
    df.loc[(months > 24) & (months <= 36), 'duration'] = 2
    df.loc[months > 36, 'duration'] = 3

    df['age'] = (df['age'] >= 45).astype(int)  # 1 if old, 0 if young

    for target, source, mapping in code_maps:
        df[target] = df[source].map(mapping).astype(int)

    if encode_nominal:
        return pd.get_dummies(df, columns=['purpose', 'housing'], drop_first=True)
    return df



In [6]:
# preprocess_german encodes `df` in place and hands the same frame back,
# so `X` is just another name for `df` here.
X = preprocess_german(df)

In [7]:
# Discard the two nominal columns that were not integer-encoded.
df = df.drop(columns=["purpose", "housing"])

In [8]:
# `personal_status` has already been reduced to the `gender` column.
df = df.drop(columns=["personal_status"])

In [9]:
# # Create an empty array for the results
# pp_array = np.array([])

# # Push values to the array
# pp_array = np.append(my_array, 10)

In [10]:
df.columns

Index(['status', 'duration', 'credit_hist', 'credit_amt', 'savings',
       'employment', 'installment_rate', 'debtors', 'residencesince',
       'property', 'age', 'install_plans', 'existing_credits', 'job',
       'maintenance_paying_people', 'telephone', 'foreign_worker', 'result',
       'gender'],
      dtype='object')

In [11]:
# Feature matrix: every encoded column except the `result` label.
x = df.loc[:, [
    'status', 'duration', 'credit_hist', 'credit_amt', 'savings',
    'employment', 'installment_rate', 'debtors', 'residencesince',
    'property', 'age', 'install_plans', 'existing_credits', 'job',
    'maintenance_paying_people', 'telephone', 'foreign_worker',
    'gender',
]]

In [12]:
# Label kept as a one-column DataFrame (not a Series).
y = df.loc[:, ["result"]]

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=15,
)

In [14]:
# def get_random_data()

In [15]:
def predictive_parity(x_train, y_train, data=None, group_col='age', positive_label=1):
    """Mean predicted positive-class probability among true positives, per group.

    A logistic regression is fitted on ``(x_train, y_train)``. For the rows of
    ``data`` whose ``result`` equals ``positive_label``, the model's predicted
    probability of ``positive_label`` is averaged separately for the protected
    group (``group_col == 1``) and the non-protected group (``group_col == 0``).

    Background, from the 'Fairness Definitions Explained' paper (S is the
    predicted score, Y the actual outcome, G the sensitive attribute):

    1. Test fairness              -> P(Y=1|S=s, G=m) = P(Y=1|S=s, G=f)
    2. Well-calibration           -> P(Y=1|S=s, G=m) = P(Y=1|S=s, G=f) = s
    3. Balance for positive class -> E(S|Y=1, G=m) = E(S|Y=1, G=f)
    4. Balance for negative class -> E(S|Y=0, G=m) = E(S|Y=0, G=f)

    The two numbers returned here are the two sides of definition 3 (an
    average score given Y=1), despite the function's name.

    Compared with the previous implementation this version no longer
    predicts on ``x_test`` or computes an accuracy that was thrown away, does
    not run ``predict`` on the groups only to overwrite the result, and finds
    the positive-class probability column through ``classes_`` instead of
    assuming it is column 0. Returned values are unchanged for the defaults.

    NOTE(review): with the default ``data`` the averages are taken over the
    whole notebook frame ``df``, which appears to include the training rows -
    confirm that this is intended.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; its columns define which columns of ``data`` are
        fed to the model.
    y_train : pandas.DataFrame
        Training labels (flattened with ``.values.ravel()`` before fitting).
    data : pandas.DataFrame, optional
        Frame to evaluate on; must contain the feature columns, ``group_col``
        and ``result``. Defaults to the notebook-level ``df``.
    group_col : str, default 'age'
        Binary sensitive attribute (1 = protected, 0 = non-protected).
    positive_label : default 1
        Value of ``result`` treated as the positive outcome.

    Returns
    -------
    tuple
        ``(protected_pp, non_protected_pp)``.

    Raises
    ------
    ValueError
        If ``positive_label`` is not among the classes seen during fitting.
    KeyError
        If a group has no row with ``result == positive_label``.
    """
    if data is None:
        data = df  # notebook-level preprocessed frame

    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(x_train, y_train.values.ravel())

    # predict_proba columns follow logistic_model.classes_; look the positive
    # class up explicitly rather than relying on its position.
    class_labels = list(logistic_model.classes_)
    if positive_label not in class_labels:
        raise ValueError(
            "positive_label %r was not seen during fitting (classes: %r)"
            % (positive_label, class_labels)
        )
    positive_col = class_labels.index(positive_label)

    feature_cols = list(x_train.columns)
    positives = data[data['result'] == positive_label]

    def mean_positive_score(group_value):
        # Average P(positive_label) over the true positives of one group.
        rows = positives.loc[positives[group_col] == group_value, feature_cols]
        if rows.empty:
            raise KeyError(
                "no rows with %s == %r and result == %r"
                % (group_col, group_value, positive_label)
            )
        return logistic_model.predict_proba(rows)[:, positive_col].mean()

    protected_pp = mean_positive_score(1)
    non_protected_pp = mean_positive_score(0)

    return protected_pp, non_protected_pp


In [16]:
#Random data acquisition - Gaussian distribution

def data_acquisition_guassian(x_train, y_train, random_state=None):
    """Augment the training set with Gaussian-sampled rows and report the group scores.

    For ``num_points`` in 100, 200, ..., 1000:

    1. Draw ``num_points`` synthetic rows. Each feature is sampled
       independently from a normal distribution using the mean and standard
       deviation of that column in the notebook-level ``x``, clipped to the
       column's observed [min, max] and rounded to an integer.
    2. Label the synthetic rows with a 3-nearest-neighbour classifier fitted
       on the original ``(x_train, y_train)``.
    3. Append them to the original training data, call ``predictive_parity``
       on the result, and print both group scores and their difference.

    Fixes relative to the previous version:

    * ``predictive_parity`` returns two values; unpacking three raised
      ``ValueError`` on the first iteration.
    * The training set used to grow cumulatively across iterations, so the
      printed ``num_points`` did not match the number of rows actually added.
      Each iteration now augments the original training data only, matching
      ``data_acquisition_frm_dataset``.
    * Synthetic values were clipped at 0 only, so they could fall outside
      the values present in the data; they are now clipped to the observed
      range of each column.
    * The classifier is fitted once, outside the loop.

    (The misspelling in the function name is kept so existing calls still work.)

    Parameters
    ----------
    x_train : pandas.DataFrame
        Original training features. Not modified.
    y_train : pandas.DataFrame
        Original training labels. Not modified.
    random_state : int, optional
        Seed for the Gaussian draws. ``None`` (default) uses NumPy's global
        random state, as before.

    Returns
    -------
    None
        Results are printed.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # The labelling model only ever sees the original training data.
    k = 3  # The number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train.values.ravel())

    for num_points in range(100, 1001, 100):
        random_data = pd.DataFrame()

        for column in x.columns:
            values = rng.normal(x[column].mean(), x[column].std(), num_points)
            # Keep the rounded samples inside the range the column really takes.
            values = np.clip(values, x[column].min(), x[column].max())
            random_data[column] = np.round(values).astype(int)

        # random_data is x_rand; its labels come from the kNN model.
        y_rand = pd.DataFrame(knn.predict(random_data), columns=y_train.columns)

        x_augmented = pd.concat([x_train, random_data], ignore_index=True)
        y_augmented = pd.concat([y_train, y_rand], ignore_index=True)

        pp_protected_gp, pp_nonprotectedgp_gp = predictive_parity(x_augmented, y_augmented)
        print('PP for ', num_points ,' datapoints acquired is: ', pp_protected_gp, pp_nonprotectedgp_gp, pp_protected_gp - pp_nonprotectedgp_gp )
        


In [41]:
#Random data acquisition from the dataset

def data_acquisition_frm_dataset(x_train, y_train):
    """Augment the training set with resampled ``gender == 0`` rows and print the group scores.

    A baseline line is printed for the unmodified training data. Then, for
    ``num_points`` in 100, 200, ..., 1000, that many rows are drawn with
    replacement from the ``gender == 0`` rows of the notebook-level ``df``,
    appended to the original ``x_train`` / ``y_train``, and the two values
    returned by ``predictive_parity`` are printed.

    The draws use NumPy's global random state, so the output varies between
    runs unless that state is seeded beforehand.

    NOTE(review): rows are drawn from the whole of ``df``, which presumably
    also contains the held-out test rows - confirm this is intended.
    NOTE(review): acquisition filters on ``gender`` while ``predictive_parity``
    groups by ``age`` - confirm this is the intended experiment.
    """
    feature_names = ['status', 'duration', 'credit_hist', 'credit_amt', 'savings',
                     'employment', 'installment_rate', 'debtors', 'residencesince',
                     'property', 'age', 'install_plans', 'existing_credits', 'job',
                     'maintenance_paying_people', 'telephone', 'foreign_worker',
                     'gender']

    baseline_protected, baseline_non_protected = predictive_parity(x_train, y_train)
    print('PP for    0  datapoints acquired is: ', baseline_protected, baseline_non_protected)

    # The candidate pool does not change between iterations.
    candidates = df[df['gender'] == 0]

    for num_points in range(100, 1001, 100):
        picked_positions = np.random.choice(len(candidates), size=num_points, replace=True)
        sampled = candidates.iloc[picked_positions]

        augmented_x = pd.concat([x_train, sampled[feature_names]])
        augmented_y = pd.concat([y_train, sampled[['result']]])

        score_protected, score_non_protected = predictive_parity(augmented_x, augmented_y)
        print('PP for ', num_points ,' datapoints acquired is: ', score_protected, score_non_protected)



In [42]:
data_acquisition_frm_dataset(x_train, y_train)

PP for    0  datapoints acquired is:  0.8061534338247007 0.7585539060046168
PP for  100  datapoints acquired is:  0.8177016846140222 0.7520159429539339
PP for  200  datapoints acquired is:  0.8126402813435551 0.7467108963863104
PP for  300  datapoints acquired is:  0.8189115935308166 0.7589706167882023
PP for  400  datapoints acquired is:  0.8111985157300712 0.7535585991866746
PP for  500  datapoints acquired is:  0.8448753425149694 0.7428950178206577
PP for  600  datapoints acquired is:  0.8328854393700754 0.7466418667633722
PP for  700  datapoints acquired is:  0.8325899476632885 0.7540120955562323
PP for  800  datapoints acquired is:  0.8396698847981866 0.7517220428015805
PP for  900  datapoints acquired is:  0.8797027319957509 0.7397703227444231
PP for  1000  datapoints acquired is:  0.8556347899166442 0.7514910990562952
