In [1]:
import pandas

In [2]:
features = ['checking account balance', 'duration', 'credit history',
            'purpose', 'amount', 'savings', 'employment', 'installment',
            'marital status', 'other debtors', 'residence time',
            'property', 'age', 'other installments', 'housing', 'credits',
            'job', 'persons', 'phone', 'foreign']
target = 'repaid'

Data taken from https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29

In [3]:
df = pandas.read_csv('../../data/credit/german.data', sep=' ',
                     names=features+[target])

In [4]:
numerical_features = ['duration', 'age', 'residence time', 'installment', 'amount', 'duration', 'persons', 'credits']
quantitative_features = list(filter(lambda x: x not in numerical_features, features))

In [5]:
X = pandas.get_dummies(df, columns=quantitative_features, drop_first=True)

In [6]:
encoded_features = list(filter(lambda x: x != target, X.columns))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer

In [8]:
cv_accuracy = cross_val_score(LogisticRegression(),
                X[encoded_features],
                X[target],
                scoring=make_scorer(accuracy_score),
                cv=10)

In [9]:
cv_accuracy.mean(), cv_accuracy.std()

(0.752, 0.03994996871087637)

# Add noise by flipping random indicators

In [10]:
flip_fraction = 0.1 # flip 10%

In [11]:
import numpy as np

In [12]:
X_noise = X.copy()

In [13]:
# Since flip = 1 / flip
n_data = len(X_noise)
for t in range(n_data):
    for c in X_noise.columns:
        # We can use the same random response mechanism for all binary features
        if any(c.startswith(i) for i in quantitative_features):
            w = np.random.choice([0, 1], p=[1 - flip_fraction, flip_fraction])
            X_noise.loc[t,c] = (X_noise.loc[t,c] + w) % 2

        # For numerical features, it is different. The scaling factor should depend on k, \epsilon, and the sensitivity of that particular attribite. In this case, it's simply the range of the attribute.
        if any(c.startswith(i) for i in numerical_features):
            # calculate the range of the attribute and add the laplace noise to the original data
            w = np.random.laplace(1) # change to the right value
            X_noise.loc[t,c] = (X_noise.loc[t,c] + w) # leave unchanged


In [14]:
cv_accuracy = cross_val_score(LogisticRegression(),
                X_noise[encoded_features],
                X_noise[target],
                scoring=make_scorer(accuracy_score),
                cv=10)

In [15]:
cv_accuracy.mean(), cv_accuracy.std()

(0.6980000000000001, 0.04791659420284376)

In [16]:
X_noise - X

Unnamed: 0,duration,amount,installment,residence time,age,credits,persons,repaid,checking account balance_A12,checking account balance_A13,...,property_A124,other installments_A142,other installments_A143,housing_A152,housing_A153,job_A172,job_A173,job_A174,phone_A192,foreign_A202
0,-5.987864,-1168.826513,-2.611102,-3.292324,-66.325194,-1.032485,-0.792463,0,0,0,...,0,0,0,0,0,0,-1,0,0,0
1,-46.250862,-5949.128057,-1.936348,-1.452852,-21.889490,0.038192,-0.002809,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,-10.115781,-2094.751122,-1.152621,-2.708286,-47.780450,-0.997497,-0.559346,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-40.359702,-7880.957241,-0.332878,-3.836391,-43.423129,-0.953919,-0.591091,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,-22.786421,-4868.899596,-2.887151,-2.542922,-51.239128,-1.800699,-1.696215,0,0,0,...,0,0,0,0,0,1,0,0,0,1
5,-35.858361,-9054.580566,-0.830632,-2.567531,-34.481050,-0.868492,-1.066869,0,1,1,...,0,0,0,1,0,0,1,0,0,0
6,-22.613133,-2834.383727,-2.872239,-3.577621,-52.492245,-0.064168,-0.853873,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,-35.997423,-6947.841215,-0.103973,-1.127422,-33.407573,-0.525331,-0.968635,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,-11.267226,-3058.324845,-1.924522,-3.396780,-60.480821,-0.965792,-0.714219,0,0,0,...,0,0,0,-1,0,0,0,0,0,0
9,-28.684551,-5233.522136,-2.221752,-0.992691,-26.971714,-1.287064,-0.737412,0,0,0,...,0,0,0,-1,0,0,0,0,0,0
