### Analysis planning

The analysis performed by delentture will be the same as performed in the first part of the project.

That being said, the columns starting with "past_" will be the key information to be passed, as well as the age

In [5]:
import pandas as pd
import numpy as np
import pydp as dp
from pydp.algorithms.numerical_mechanisms import NumericalMechanism
from pydp.algorithms.laplacian import BoundedSum, dp_mean.quick_result, Count, Max
# TODO para validacao

In [6]:
df = pd.read_csv("infringement_dataset_v2.csv")
df.shape

(307511, 46)

## Noise adding function

In [7]:
def add_laplace_noise(s, sensitivity, epsilon):
  return s + np.random.laplace(loc=0, scale=sensitivity/epsilon)

def percentage_error(orig, est):
  return ((orig-est) / orig) * 100

Small test of the noise adding

In [8]:
epsilon = 0.1 # common values 0.01, 0.01. 0.2, ln(2), ln(3)
sens = 1

orig_count = len(df.query('age > 40'))
dp_count = add_laplace_noise(len(df.query('age > 40')), 1, 0.01)
print(orig_count, dp_count, percentage_error(orig_count, dp_count))

171588 171532.30386768063 0.03245922344183054


The values are close by a factor of less than 100, which is small comparing to the value size (less than 0.01%)

### Original analysis

In [33]:
original_count = []
original_count.append(len(df.query('age<30')))
original_count.append(len(df.query('age<40')) - original_count[-1])
original_count.append(len(df.query('age<50')) - original_count[-1])
original_count.append(len(df.query('age<60')) - original_count[-1])
original_count.append(len(df.query('age>=60')))
print(original_count)

[45000, 82299, 121543, 150373, 35595]


In [10]:
sel1 = df[df["infringed"] == 1]
sel2 = df[df["infringed"] == 0]

original_past = []
original_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
original_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
original_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
original_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
original_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
original_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
original_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
original_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(original_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]


### With differential privacy

Being aware that subtracting the previous dp.count brings more inaccuracy for the final result, but this is donw this way beacause the utilitary of the dataset does not have access to the variable original_count

In [34]:
dp_count = []
dp_count.append(add_laplace_noise((len(df.query('age<30'))), sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<40')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<50')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age<60')) - dp_count[-1], sens, epsilon))
dp_count.append(add_laplace_noise(len(df.query('age>=60')), sens, epsilon))
print(original_count, dp_count)

[45000, 82299, 121543, 150373, 35595] [44986.77946627695, 82310.06993253824, 121534.28120165406, 150394.17855139478, 35595.201081959516]


In [12]:
dp_past = []
dp_past.append([add_laplace_noise(sel2["past_avg_amount_annuity"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amount_annuity"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_application"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_application"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_avg_amt_credit"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_avg_amt_credit"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_approved"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_approved"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_canceled"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_canceled"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_refused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_refused"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_total"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_total"].mean(), sens, epsilon)])
dp_past.append([add_laplace_noise(sel2["past_loans_unused"].mean(), sens, epsilon), \
    add_laplace_noise(sel1["past_loans_unused"].mean(), sens, epsilon)])

print(dp_past)

[[14639.932267931588, 13358.944777065191], [155009.4837807949, 142803.98892556576], [170881.31482799587, 161126.52743569264], [62.3862473244716, -8.033872659665038], [3.363615497322737, -5.982075324032382], [1.0967655067538924, -2.0661286978985065], [-11.520606430982795, 17.603624756953234], [7.6025624990495535, -0.7635390286498829]]


Pode-se depois fazer uma análise com o BoundedSum do python-dp. Comparar os valores obtidos com o do módulo.

In [29]:
np.sum(np.logical_xor(df['age']<60,df['age']<50))

68074

In [35]:
dp_sum = BoundedSum(epsilon=epsilon, dtype ='float')

pypd_count = []
pypd_count.append(dp_sum.quick_result(df['age']<30))
pypd_count.append(dp_sum.quick_result(df['age']<40)-pypd_count[-1])
pypd_count.append(dp_sum.quick_result(df['age']<50)-pypd_count[-1])
pypd_count.append(dp_sum.quick_result(df['age']<60)-pypd_count[-1])
pypd_count.append(dp_sum.quick_result(df['age']>=60))
print(original_count, pypd_count)



[45000, 82299, 121543, 150373, 35595] [45003.82520068536, 82306.20886479787, 121542.84971525826, 150307.32923771438, 35609.85957974577]


In [49]:
from pydp.algorithms.laplacian import BoundedMean
dp_mean = BoundedMean(epsilon=epsilon, dtype='float')


pydp_past = []
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amount_annuity"].array), dp_mean.quick_result(sel1["past_avg_amount_annuity"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_application"].array), dp_mean.quick_result(sel1["past_avg_amt_application"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_avg_amt_credit"].array), dp_mean.quick_result(sel1["past_avg_amt_credit"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_approved"].array), dp_mean.quick_result(sel1["past_loans_approved"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_canceled"].array), dp_mean.quick_result(sel1["past_loans_canceled"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_refused"].array), dp_mean.quick_result(sel1["past_loans_refused"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_total"].array), dp_mean.quick_result(sel1["past_loans_total"].array)])
pydp_past.append([dp_mean.quick_result(sel2["past_loans_unused"].array), dp_mean.quick_result(sel1["past_loans_unused"].array)])
print(original_past, pydp_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]] [[14576.119069246626, 13042.658809747103], [154359.154972432, 137157.891423072], [169366.3081199173, 152827.50662357296], [3.0665311194849307, 2.7984417739114287], [0.8003196254944829, 1.1379607863438839], [0.8838155607576779, 0.8291510063072798], [4.837904119805994, 4.990923247969073], [0.0751052817944261, 0.06885136800584701]]


In [47]:
print(sel2["past_avg_amount_annuity"].array)
dp_mean.quick_result(sel2["past_avg_amount_annuity"].array)

<PandasArray>
[          56553.99,            5357.25,          23651.175,
          12278.805,        15839.69625, 10051.412142857142,
           27463.41,          18303.195,           7894.155,
           12806.55,
 ...
 29227.344230769228,            6087.42,            6748.83,
        10466.92125,        37444.03875,         17133.9525,
            6605.91,          10074.465,           4770.405,
       20775.391875]
Length: 282686, dtype: float64


14560.298518283875