## Analysis planning

The analysis performed by delentture will be the same as performed in the first part of the project.

That being said, the columns starting with "past_" will be the key information to be passed, as well as the age

In [11]:
import pandas as pd
import numpy as np


In [12]:
df = pd.read_csv("infringement_dataset_v2.csv")
df.shape

(307511, 46)

## Noise adding function

The noise added follows the pratical class on differential privacy and is based on a laplace distribution. The percentage error measures the deviation from the original data.

In [27]:
def add_laplace_noise(s, sensitivity, epsilon):
  return s + np.random.laplace(loc=0, scale=sensitivity/epsilon)

def percentage_error(orig, est):
  return (abs(np.divide(np.subtract(est, orig), orig))) * 100

Small test of the noise adding

In [14]:
epsilon = 0.1 # common values 0.01, 0.01. 0.2, ln(2), ln(3)
sens = 1

orig_count = len(df.query('age > 40'))
dp_count = add_laplace_noise(len(df.query('age > 40')), 1, 0.01)
print(orig_count, dp_count, percentage_error(orig_count, dp_count))

171588 171419.00073232315 0.09849130922724503


The values are close by a factor of less than 100, which is small comparing to the value size. The error shown is, by itself, very low, so we can trust that the analysis should be good enough.

### Original analysis

In [15]:
original_count = []
original_count.append(len(df.query('age<30')))
original_count.append(len(df.query('age<40')) - original_count[-1])
original_count.append(len(df.query('age<50')) - original_count[-1])
original_count.append(len(df.query('age<60')) - original_count[-1])
original_count.append(len(df.query('age>=60')))
print(original_count)

[45000, 82299, 121543, 150373, 35595]


In [16]:
sel1 = df[df["infringed"] == 1]
sel2 = df[df["infringed"] == 0]

original_past = []
original_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
original_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
original_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
original_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
original_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
original_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
original_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
original_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(original_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]


### With differential privacy

Being aware that subtracting the previous dp.count brings more inaccuracy for the final result, but this is done this way beacause the utilitary of the dataset does not have access to the variable original_count

In [17]:
dp_count = []
dp_count.append(add_laplace_noise((len(df.query('age<30'))), sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<40')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<50')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<60')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age>=60')), sens, epsilon))
print(original_count, dp_count)

[45000, 82299, 121543, 150373, 35595] [45014.71547930083, 82234.06709420051, 121622.19477728399, 150272.24507805012, 35598.71107113981]


In [28]:
dp_past = []
dp_past.append([add_laplace_noise(sel2["past_avg_amount_annuity"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amount_annuity"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_application"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_application"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_credit"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_credit"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_approved"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_approved"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_canceled"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_canceled"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_refused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_refused"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_total"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_total"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_unused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_unused"].mean(), sens, epsilon)])

print(original_past)
print(dp_past)
print(percentage_error(original_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14643.21356691083, 13360.728906094606], [155025.01420329072, 142863.65661797664], [170866.43346334886, 161138.14763295447], [11.948008879429809, -16.77423561920931], [15.315537344100687, -1.0672229595294158], [-1.0582343409952284, 11.028460230111367], [5.6984801974722075, 4.606515781309634], [-22.367903314522945, 2.6509124299569673]]
[[4.70507959e-02 2.58496233e-02]
 [1.36332378e-02 3.24885076e-02]
 [8.07506842e-04 5.19957140e-03]
 [2.89891672e+02 6.94830166e+02]
 [1.79509491e+03 1.86445857e+02]
 [2.20001576e+02 1.00493124e+03]
 [1.79163591e+01 1.02301661e+01]
 [2.87089038e+04 3.26407700e+03]]


### Verify analysis with the module Py-DP

This module is directed to differential privacy methods. The analysis is done with simple sums and means of the algorithm.

In [24]:
#para validação dos dados
from pydp.algorithms.laplacian import BoundedSum

dp_sum = BoundedSum(epsilon=epsilon, dtype ='float')

pydp_count = []
pydp_count.append(dp_sum.quick_result(df['age']<30))
pydp_count.append(dp_sum.quick_result(df['age']<40)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<50)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']<60)-pydp_count[-1])
pydp_count.append(dp_sum.quick_result(df['age']>=60))

print(original_count)
print(pydp_count)
print(percentage_error(pydp_count, dp_count))

[45000, 82299, 121543, 150373, 35595]
[45045.75923481208, 82293.24016077851, 121562.27663632695, 150409.43952712373, 35534.43097549287]
[0.06891604 0.07190514 0.04929008 0.09121399 0.18089524]


As done in the original analysis, we subtract the last value to each calculation to get the

In [25]:
from pydp.algorithms.laplacian import BoundedMean

dp_mean = BoundedMean(epsilon=epsilon, dtype='float')

pydp_past = []
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amount_annuity"].array), dp_mean.quick_result(sel1["past_avg_amount_annuity"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_application"].array), dp_mean.quick_result(sel1["past_avg_amt_application"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_credit"].array), dp_mean.quick_result(sel1["past_avg_amt_credit"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_approved"].array), dp_mean.quick_result(sel1["past_loans_approved"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_canceled"].array), dp_mean.quick_result(sel1["past_loans_canceled"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_refused"].array), dp_mean.quick_result(sel1["past_loans_refused"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_total"].array), dp_mean.quick_result(sel1["past_loans_total"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_unused"].array), dp_mean.quick_result(sel1["past_loans_unused"].array)])

print(original_past)
print(pydp_past)
print(percentage_error(pydp_past, dp_past))

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]
[[14585.571067632885, 13078.686293519997], [154260.9942015208, 141420.4768406931], [169564.80421035353, 153732.27816438593], [3.0683000398298423, 2.813627992051458], [0.8111888535806893, 1.1296026356149245], [0.8825808272349507, 0.8494406484794208], [4.833643513036995, 4.963616932751519], [0.07421131710868267, 0.06673703019725918]]
[[4.31578478e-01 2.14601584e+00]
 [4.76589909e-01 9.94578253e-01]
 [7.68046645e-01 4.81110818e+00]
 [1.01275791e+01 1.56926726e+03]
 [2.47583040e+03 1.01716946e+02]
 [7.33385369e+02 2.19119328e+02]
 [3.25271954e+01 4.39856372e+02]
 [1.77507795e+04 1.93349618e+03]]
