## Analysis planning

The analysis performed by delentture will be the same as performed in the first part of the project.

That being said, the columns starting with "past_" will be the key information to be passed, as well as the age

In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv("infringement_dataset_v2.csv")
df.shape

(307511, 46)

## Noise adding function

The noise added follows the pratical class on differential privacy and is based on a laplace distribution. The percentage error measures the deviation from the original data.

In [4]:
def add_laplace_noise(s, sensitivity, epsilon):
  return s + np.random.laplace(loc=0, scale=sensitivity/epsilon)

def percentage_error(orig, est):
  return (abs(np.divide(np.subtract(est, orig), orig))) * 100

Small test of the noise adding

In [27]:
epsilon = 0.1 # common values 0.01, 0.1. 0.2, ln(2), ln(3)

orig_count = len(df.query('age > 40'))
dp_count = add_laplace_noise(len(df.query('age > 40')), 1, epsilon)
print(orig_count, dp_count, percentage_error(orig_count, dp_count))

171588 171602.80113099792 0.008625970929155445


The values are close by a factor of less than 100, which is small comparing to the value size. The error shown is, by itself, very low, so we can trust that the analysis should be good enough.

### Original analysis

In [6]:
original_count = []
original_count.append(len(df.query('age<30')))
original_count.append(len(df.query('age<40')) - original_count[-1])
original_count.append(len(df.query('age<50')) - original_count[-1])
original_count.append(len(df.query('age<60')) - original_count[-1])
original_count.append(len(df.query('age>=60')))
print(original_count)

[45000, 82299, 121543, 150373, 35595]


In [7]:
sel1 = df[df["infringed"] == 1]
sel2 = df[df["infringed"] == 0]

original_past = []
original_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
original_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
original_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
original_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
original_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
original_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
original_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
original_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(original_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]


### With differential privacy

Being aware that subtracting the previous dp.count brings more inaccuracy for the final result, but this is done this way beacause the utilitary of the dataset does not have access to the variable original_count. In that way, we expect that the error rises as the analysis progresses.

For a sum of queries, the sensitivity is 1, since adding 1 count will reproduce a change in 1 of the output.

In [25]:
dp_count = []
dp_count.append(add_laplace_noise((len(df.query('age<30'))), sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<40')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<50')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<60')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age>=60')), sens, epsilon))
print(original_count, dp_count)
print(percentage_error(original_count, dp_count))

[45000, 82299, 121543, 150373, 35595] [45000.202276793265, 82298.82170121225, 121542.88445627906, 150373.13790705358, 35596.686714695956]
[4.49503985e-04 2.16647575e-04 9.50640686e-05 9.17099836e-05
 4.73862817e-03]


Even with the provision above and being subjected to a bigger deviation, the percentual error does not rise above 0.02% which is a very small value.

In [24]:
dp_past = []
dp_past.append([add_laplace_noise(sel2["past_avg_amount_annuity"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amount_annuity"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_application"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_application"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_credit"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_credit"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_approved"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_approved"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_canceled"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_canceled"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_refused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_refused"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_total"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_total"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_unused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_unused"].mean(), sens, epsilon)])

print(original_past)
print(dp_past)
print(percentage_error(original_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14636.153097272088, 13363.863051456967], [155000.78065799942, 142816.8345939904], [170867.27535738779, 161128.31633468904], [3.153203552065636, 2.8321720750725365], [1.1126680995275973, 1.5819851801010612], [0.8364598071743952, 1.2895023846977298], [4.192050683589169, 7.22662707091109], [-1.3449347196054027, -0.6807168492152992]]
[[1.18855823e-03 2.39779499e-03]
 [2.00091593e-03 2.96062600e-04]
 [3.14790266e-04 9.01907109e-04]
 [2.89645891e+00 4.31484512e-01]
 [3.76779415e+01 2.81419819e+01]
 [5.14719510e+00 2.91940519e+01]
 [1.32555810e+01 4.08294561e+01]
 [1.82019288e+03 9.63847433e+02]]


Alterando epsilon

In [28]:
epsilon = np.log(3)

dp_past = []
dp_past.append([add_laplace_noise(sel2["past_avg_amount_annuity"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amount_annuity"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_application"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_application"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_credit"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_credit"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_approved"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_approved"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_canceled"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_canceled"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_refused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_refused"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_total"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_total"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_unused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_unused"].mean(), sens, epsilon)])

print(original_past)
print(dp_past)
print(percentage_error(original_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14636.44461010259, 13365.259524286352], [155006.21050275993, 142817.0096811231], [170866.58271282903, 161130.4578591386], [3.501098710545337, 3.166148718938698], [0.4257369612563883, 1.9489664308625292], [1.736900013394139, 1.1619251725752715], [4.064218008230653, 4.1861208833537304], [-0.12983790413726, 3.1211171534811215]]
[[8.03149320e-04 8.05157387e-03]
 [1.50212198e-03 1.73467379e-04]
 [7.20158923e-04 4.27161042e-04]
 [1.42490974e+01 1.22746103e+01]
 [4.73206894e+01 5.78677374e+01]
 [9.69608542e+01 1.64122090e+01]
 [1.59007712e+01 1.84226443e+01]
 [2.66064743e+02 3.86077906e+03]]


### Verify analysis with the module Py-DP

This module is directed to differential privacy methods. The analysis is done with simple sums and means of the algorithm.

In [30]:
from pydp.algorithms.laplacian import BoundedSum

epsilon = 0.1
dp_sum = BoundedSum(epsilon=epsilon, dtype ='int') 
# dtype int returns the pydp sum as an integer, as it is a sum of integers

pydp_count = []
pydp_count.append(dp_sum.quick_result(df['age']<30))
pydp_count.append(dp_sum.quick_result(df['age']<40)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<50)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<60)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']>=60))

print(original_count)
print(pydp_count)
print(percentage_error(pydp_count, dp_count))

[45000, 82299, 121543, 150373, 35595]
[44941, 82243, 121543, 150383, 35514]
[281.84019299 108.65338221  41.18690598  14.11050526 383.19761539]


In [29]:
from pydp.algorithms.laplacian import BoundedMean

dp_mean = BoundedMean(epsilon=epsilon, dtype='float')

pydp_past = []
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amount_annuity"].array), dp_mean.quick_result(sel1["past_avg_amount_annuity"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_application"].array), dp_mean.quick_result(sel1["past_avg_amt_application"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_credit"].array), dp_mean.quick_result(sel1["past_avg_amt_credit"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_approved"].array), dp_mean.quick_result(sel1["past_loans_approved"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_canceled"].array), dp_mean.quick_result(sel1["past_loans_canceled"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_refused"].array), dp_mean.quick_result(sel1["past_loans_refused"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_total"].array), dp_mean.quick_result(sel1["past_loans_total"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_unused"].array), dp_mean.quick_result(sel1["past_loans_unused"].array)])

print(original_past)
print(pydp_past)
print(percentage_error(pydp_past, dp_past))

KeyboardInterrupt: 