## Analysis planning

The analysis performed by delentture will be the same as performed in the first part of the project.

That being said, the columns starting with "past_" will be the key information to be passed, as well as the age

In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv("infringement_dataset_v2.csv")
df.shape

(307511, 46)

## Noise adding function

The noise added follows the pratical class on differential privacy and is based on a laplace distribution. The percentage error measures the deviation from the original data.

In [4]:
def add_laplace_noise(s, sensitivity, epsilon):
  return s + np.random.laplace(loc=0, scale=sensitivity/epsilon)

def percentage_error(orig, est):
  return (abs(np.divide(np.subtract(est, orig), orig))) * 100

Small test of the noise adding

In [5]:
epsilon = 0.1 # common values 0.01, 0.01. 0.2, ln(2), ln(3)
sens = 1

orig_count = len(df.query('age > 40'))
dp_count = add_laplace_noise(len(df.query('age > 40')), 1, 0.01)
print(orig_count, dp_count, percentage_error(orig_count, dp_count))

171588 171594.55867181954 0.00382233712120947


The values are close by a factor of less than 100, which is small comparing to the value size. The error shown is, by itself, very low, so we can trust that the analysis should be good enough.

### Original analysis

In [6]:
original_count = []
original_count.append(len(df.query('age<30')))
original_count.append(len(df.query('age<40')) - original_count[-1])
original_count.append(len(df.query('age<50')) - original_count[-1])
original_count.append(len(df.query('age<60')) - original_count[-1])
original_count.append(len(df.query('age>=60')))
print(original_count)

[45000, 82299, 121543, 150373, 35595]


In [7]:
sel1 = df[df["infringed"] == 1]
sel2 = df[df["infringed"] == 0]

original_past = []
original_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
original_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
original_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
original_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
original_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
original_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
original_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
original_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(original_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]


### With differential privacy

Being aware that subtracting the previous dp.count brings more inaccuracy for the final result, but this is done this way beacause the utilitary of the dataset does not have access to the variable original_count. In that way, we expect that the error rises as the analysis progresses.

In [8]:
dp_count = []
dp_count.append(add_laplace_noise((len(df.query('age<30'))), sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<40')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<50')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<60')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age>=60')), sens, epsilon))
print(original_count, dp_count)
print(percentage_error(original_count, dp_count))

[45000, 82299, 121543, 150373, 35595] [44995.639963095986, 82296.55778743245, 121547.13030557478, 150367.1250456824, 35599.645014716334]
[0.00968897 0.00296749 0.00339823 0.00390692 0.01304963]


Even with the provision above and being subjected to a bigger deviation, the percentual error does not rise above 0.02% which is a very small value.

In [9]:
# TODO sensibility o mean

dp_past = []
dp_past.append([add_laplace_noise(sel2["past_avg_amount_annuity"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amount_annuity"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_application"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_application"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_credit"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_credit"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_approved"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_approved"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_canceled"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_canceled"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_refused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_refused"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_total"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_total"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_unused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_unused"].mean(), sens, epsilon)])

print(original_past)
print(dp_past)
print(percentage_error(original_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14635.412841066736, 13379.257386312338], [155007.5440166046, 142840.58320515518], [170856.7350417231, 161139.3153799877], [-0.6213123645852221, -2.1114557211415073], [-4.154206911174094, 0.328306426185914], [-10.739074475851062, 4.35809126247645], [12.039084809563015, 7.350390133489392], [17.1241373054472, 15.123151419331716]]
[[6.24622196e-03 1.12793192e-01]
 [2.36243195e-03 1.63326079e-02]
 [6.48348609e-03 5.92429597e-03]
 [1.20274886e+02 1.74874205e+02]
 [6.14028088e+02 7.34069341e+01]
 [1.31778874e+03 3.36633135e+02]
 [1.49119940e+02 4.32412984e+01]
 [2.18020437e+04 1.90916735e+04]]


### Verify analysis with the module Py-DP

This module is directed to differential privacy methods. The analysis is done with simple sums and means of the algorithm.

In [10]:
#para validação dos dados
from pydp.algorithms.laplacian import BoundedSum

dp_sum = BoundedSum(epsilon=epsilon, dtype ='float')

pydp_count = []
pydp_count.append(dp_sum.quick_result(df['age']<30))
pydp_count.append(dp_sum.quick_result(df['age']<40)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<50)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<60)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']>=60))

print(original_count)
print(pydp_count)
print(percentage_error(pydp_count, dp_count))

[45000, 82299, 121543, 150373, 35595]
[44998.05197494081, 82254.26332256728, 121529.50074160926, 150337.5448459551, 35538.98778456694]
[0.00536026 0.05141918 0.01450641 0.01967586 0.17067799]


In [11]:
from pydp.algorithms.laplacian import BoundedMean

dp_mean = BoundedMean(epsilon=epsilon, dtype='float')

pydp_past = []
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amount_annuity"].array), dp_mean.quick_result(sel1["past_avg_amount_annuity"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_application"].array), dp_mean.quick_result(sel1["past_avg_amt_application"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_credit"].array), dp_mean.quick_result(sel1["past_avg_amt_credit"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_approved"].array), dp_mean.quick_result(sel1["past_loans_approved"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_canceled"].array), dp_mean.quick_result(sel1["past_loans_canceled"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_refused"].array), dp_mean.quick_result(sel1["past_loans_refused"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_total"].array), dp_mean.quick_result(sel1["past_loans_total"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_unused"].array), dp_mean.quick_result(sel1["past_loans_unused"].array)])

print(original_past)
print(pydp_past)
print(percentage_error(pydp_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14585.907062720522, 13103.258718163306], [153843.79970645142, 135867.05032334567], [169799.30601850856, 156019.7241168861], [3.07562986344956, 2.792908314925179], [0.8026075695138655, 1.136467556474102], [0.8798850704570542, 0.8634312244568434], [4.823521784192973, 4.971546566493949], [0.07559248995133128, 0.07128045269113259]]
[[3.39408294e-01 2.10633610e+00]
 [7.56445377e-01 5.13261520e+00]
 [6.22752264e-01 3.28137439e+00]
 [1.20201142e+02 1.75600610e+02]
 [6.17588803e+02 7.11116763e+01]
 [1.32050877e+03 4.04740984e+02]
 [1.49591177e+02 4.78491660e+01]
 [2.25532256e+04 2.11164077e+04]]
