In [29]:
# this code is largely taken from Ashudeep Singh and found here: https://github.com/ashudeep/Fair-PGRank/blob/master/GermanCredit/German%20Credit%20Data%20Preprocessing.ipynb

In [1]:
! wget https://storage.googleapis.com/kaggle-forum-message-attachments/237294/7771/german_credit_data.csv

--2020-09-30 19:53:27--  https://storage.googleapis.com/kaggle-forum-message-attachments/237294/7771/german_credit_data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.4.80, 172.217.4.48, 172.217.9.80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.4.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53393 (52K) [text/csv]
Saving to: ‘german_credit_data.csv.1’


2020-09-30 19:53:27 (1.12 MB/s) - ‘german_credit_data.csv.1’ saved [53393/53393]



In [2]:
import pandas as pd
import numpy as np
# Read the downloaded CSV, using its first column as the row index.
csv_path = "german_credit_data.csv"
df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Replace every missing entry with the literal string "NA".
df = df.fillna("NA")

In [4]:
# Display the table after the missing values have been filled.
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [5]:
# Binary age indicator: 1 when Age >= 25, otherwise 0.
# Computed as a vectorized comparison instead of a Python-level row-wise apply.
df['age_binary'] = (df['Age'] >= 25).astype(int)

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
# The dense-output keyword of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the current keyword first and fall back for older releases.
try:
    one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(sparse=False)

# Output columns follow the order given here: the three standardized numeric
# columns first, then one group of one-hot columns per categorical column.
# Later cells index the encoded matrix by position (column 4, and the last four
# columns), so the order of these lists must not change.
preprocess = make_column_transformer(
    (StandardScaler(), ['Age', 'Credit amount', 'Duration']),
    (one_hot, ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk', 'age_binary'])
)

In [7]:
# Fit the scaler and the one-hot encoder on the whole table and encode it in one step.
mat = preprocess.fit_transform(df)
# Rows are not shuffled (the np.random.shuffle(mat) call is disabled), so `mat`
# keeps the row order of `df`.

In [8]:
# Inspect the first encoded row.
mat[0]

array([ 2.76645648, -0.74513141, -1.23647786,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ])

In [9]:
# The last four columns of `mat` are the one-hot encodings of Risk (2 columns)
# followed by age_binary (2 columns); everything before them is the feature matrix.
n_trailing = 4
X = mat[:, :-n_trailing]
Y = mat[:, -3]    # second Risk column: 1.0 where Risk == 'good'
age = mat[:, -1]  # second age_binary column: 1.0 where Age >= 25 (not used by the cells shown here)

In [10]:
# Inspect the first row of the feature matrix.
X[0]

array([ 2.76645648, -0.74513141, -1.23647786,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ])

In [11]:
# Number of rows and number of feature columns of the feature matrix.
numX, num_feats = X.shape

In [12]:
datasize = 500 # number of queries in training
query_size = 10 # number of items per query
split_on_doc = 0.8 # fraction of rows (in table order) that form the training pool; the remaining rows form the test pool
testsize = 100 # number of queries in testing
ratio_relevant = .4 # total sampling probability placed on relevant (Y == 1) rows, i.e. about 4 of the 10 items per query in expectation
# Unnormalized per-row weights: ratio_relevant where Y == 1, 1 - ratio_relevant where Y == 0.
# The sampling cell derives its probabilities from get_weights rather than from this array.
ratios_col = Y * ratio_relevant + (1-Y)*(1-ratio_relevant)

In [13]:
def get_weights(Y, ratio_relevant):
    """Return per-item sampling probabilities that favour a target relevance ratio.

    The relevant items (``Y == 1``) share a total probability of
    ``ratio_relevant`` equally among themselves, and the remaining items share
    ``1 - ratio_relevant`` equally, so the returned weights sum to 1.

    Parameters
    ----------
    Y : array-like of 0/1 labels
        Relevance label of every item in the pool (1 = relevant).
    ratio_relevant : float
        Probability that a single draw lands on a relevant item.

    Returns
    -------
    numpy.ndarray
        One probability per item, in the same order as ``Y``.

    Raises
    ------
    ValueError
        If the pool has no relevant or no non-relevant items. The per-item
        share would then be a division by zero, and the weights would
        silently become NaN/inf.
    """
    # Accept lists/tuples as well as arrays; `Y == 1` on a plain list is a
    # scalar False and would miscount the relevant items.
    Y = np.asarray(Y, dtype=float)
    num_relevant = int(np.sum(Y == 1))
    num_not_relevant = len(Y) - num_relevant
    if num_relevant == 0 or num_not_relevant == 0:
        raise ValueError(
            "get_weights needs at least one relevant and one non-relevant item "
            "(got {} relevant, {} non-relevant)".format(num_relevant, num_not_relevant)
        )
    y = (1-ratio_relevant) / num_not_relevant
    x = ratio_relevant / num_relevant
    ratios_col = Y * x + (1-Y)* y
    return ratios_col
    

In [14]:
# Generate candidate sets (queries) of `query_size` items each, for train and test.
SEED = 0  # fixed seed so a fresh Restart & Run All regenerates the same queries
rng = np.random.RandomState(SEED)

# Integer row index separating the training pool [0, split_idx) from the test
# pool [split_idx, numX). Computing it once avoids sizing the test pool from the
# float expression numX*(1-split_on_doc), which is not exactly representable.
split_idx = int(numX*split_on_doc)


def _sample_queries(X_pool, Y_pool, num_queries, query_size, ratio_relevant, rng, group_col=4):
    """Sample `num_queries` candidate sets of `query_size` rows from one pool.

    Rows are drawn with replacement using the probabilities returned by
    get_weights(Y_pool, ratio_relevant). A candidate set is redrawn until it
    contains at least one item with a non-zero label.

    Parameters
    ----------
    X_pool : 2-D array of features, one row per item in the pool.
    Y_pool : 1-D array of labels aligned with `X_pool`.
    num_queries : number of candidate sets to generate.
    query_size : number of items per candidate set.
    ratio_relevant : forwarded to get_weights.
    rng : numpy.random.RandomState used for all draws.
    group_col : column of `X_pool` copied out as the group identity.

    Returns
    -------
    (features, labels, groups) : arrays with num_queries*query_size rows, where
        candidate set q occupies rows [q*query_size, (q+1)*query_size).
    """
    probs = get_weights(Y_pool, ratio_relevant)
    pool_indices = np.arange(len(Y_pool))
    total_rows = num_queries*query_size
    out_X = np.zeros((total_rows, X_pool.shape[1]))
    out_Y = np.zeros(total_rows)
    out_groups = np.zeros(total_rows)
    for q in range(num_queries):
        picked = rng.choice(pool_indices, size=query_size, p=probs)
        # Reject candidate sets that contain no relevant item at all.
        while np.sum(Y_pool[picked]) == 0:
            picked = rng.choice(pool_indices, size=query_size, p=probs)
        rows = slice(q*query_size, (q+1)*query_size)
        out_X[rows, :] = X_pool[picked, :]
        out_Y[rows] = Y_pool[picked]
        out_groups[rows] = X_pool[picked, group_col]
    return out_X, out_Y, out_groups


print("Sampling between 0 and {} for train".format(numX*split_on_doc))
data_X, data_Y, group_identities_train = _sample_queries(
    X[:split_idx], Y[:split_idx], datasize, query_size, ratio_relevant, rng)

print("Sampling between {} and {} for test".format(numX*split_on_doc, numX))
test_X, test_Y, group_identities_test = _sample_queries(
    X[split_idx:], Y[split_idx:], testsize, query_size, ratio_relevant, rng)

Sampling between 0 and 800.0 for train
Sampling between 800.0 and 1000 for test


In [15]:
import pickle as pkl
# Persist each split as an (X, Y, group_identities) tuple. The `with` blocks make
# sure each file handle is flushed and closed once the dump finishes.
with open("german_train_rank.pkl", "wb") as train_file:
    pkl.dump((data_X, data_Y, group_identities_train), train_file)
with open("german_test_rank.pkl", "wb") as test_file:
    pkl.dump((test_X, test_Y, group_identities_test), test_file)
