# CS211: Data Privacy - Final Project
## Haoyuan Pang, Chongqing Gao, Ke Tian

In [6]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime

from scipy import stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases no longer accept the old names, so use the legacy name
# only when this Matplotlib still lists it and otherwise use the renamed one.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one Laplace noise sample of scale `sensitivity / epsilon`.

    The sample is centred at 0 and drawn from NumPy's global random state
    via `np.random.laplace`. No validation of `sensitivity` or `epsilon`
    is performed here.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Return a list holding each item of `vec` plus its own Laplace noise sample.

    Every element gets an independent draw (scale `sensitivity / epsilon`,
    centred at 0) from NumPy's global random state, taken in iteration order.
    The result is always a plain Python list.
    """
    noisy_values = []
    for item in vec:
        # The scale is evaluated per element, so an empty `vec` never divides.
        sample = np.random.laplace(loc=0, scale=sensitivity / epsilon)
        noisy_values.append(item + sample)
    return noisy_values

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one Gaussian noise sample.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`, and the sample is
    drawn (mean 0) from NumPy's global random state via `np.random.normal`.
    No validation of `epsilon` or `delta` is performed here.
    """
    log_term = np.log(1.25 / delta)
    sigma = sensitivity * np.sqrt(2 * log_term) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list holding each item of `vec` plus its own Gaussian noise sample.

    Each element receives an independent mean-0 draw whose standard deviation
    is `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`, taken from
    NumPy's global random state in iteration order. The result is always a
    plain Python list.
    """
    noisy_values = []
    for item in vec:
        # Sigma is evaluated per element, so an empty `vec` never evaluates it.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        noisy_values.append(item + np.random.normal(loc=0, scale=sigma))
    return noisy_values

def gaussian_mech_RDP_vec(vec, sensitivity, alpha, epsilon):
    """Return a list holding each item of `vec` plus its own Gaussian noise sample.

    The standard deviation is `sqrt(sensitivity**2 * alpha / (2 * epsilon))`,
    computed once up front; each element then receives an independent mean-0
    draw from NumPy's global random state, in iteration order.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon)
    sigma = np.sqrt(variance)
    noisy_values = []
    for item in vec:
        noisy_values.append(item + np.random.normal(loc=0, scale=sigma))
    return noisy_values

def pct_error(orig, priv):
    """Return the absolute percentage error of `priv` relative to `orig`.

    Computed as `|orig - priv| / |orig| * 100`. Works element-wise on NumPy
    arrays and pandas Series as well as on scalars.

    The denominator uses `|orig|` so the result is never negative, even when
    the reference value itself is negative. For positive `orig` the result is
    identical to dividing by `orig` directly. A zero `orig` is not
    special-cased: the division follows NumPy/pandas semantics (inf or nan).
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# compas_scores_raw = pd.read_csv('https://raw.github.com/g627444300/cs211-final_project/main/compas-scores-raw.csv')

# NOTE: the frame is named `adult`, but the file fetched here is the
# cox-violent-parsed CSV from the project repository. The helper functions
# defined further down read this module-level frame directly.
adult = pd.read_csv('https://raw.github.com/g627444300/cs211-final_project/main/cox-violent-parsed.csv')

# cox_violent_parsed_filt = pd.read_csv('https://raw.github.com/g627444300/cs211-final_project/main/cox-violent-parsed_filt.csv')

# df = pd.read_csv('https://raw.github.com/g627444300/cs211-final_project/main/propublica_data_for_fairml.csv')

In [7]:
print (type(adult))
adult.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event
0,1.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,0,327,0
1,2.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,334,961,0
2,3.0,michael ryan,michael,ryan,31/12/2014,Male,06/02/1985,31,25 - 45,Caucasian,...,Risk of Violence,2,Low,31/12/2014,30/12/2014,03/01/2015,0,3,457,0
3,4.0,kevon dixon,kevon,dixon,27/01/2013,Male,22/01/1982,34,25 - 45,African-American,...,Risk of Violence,1,Low,27/01/2013,26/01/2013,05/02/2013,0,9,159,1
4,5.0,ed philo,ed,philo,14/04/2013,Male,14/05/1991,24,Less than 25,African-American,...,Risk of Violence,3,Low,14/04/2013,16/06/2013,16/06/2013,4,0,63,0


In [68]:
# print(adult.shape)
# display(adult.columns)

In [8]:
epsilon = 1.0

In [9]:
def dob_sum():
    """Count the distinct `dob` values that occur in exactly one row of `adult`."""
    dob_counts = adult['dob'].value_counts()
    # A date of birth seen exactly once singles out one row of the frame.
    seen_once = dob_counts == 1
    return np.sum(seen_once)
dob_sum()

3240

In [10]:
def dob_hist():
    """Return the number of rows of `adult` per distinct `dob` value."""
    dob_column = adult['dob']
    return dob_column.value_counts()
dob_hist()

21/11/1991    21
10/11/1988    17
28/04/1992    15
13/08/1994    15
27/04/1989    14
              ..
28/07/1962     1
14/01/1963     1
22/01/1966     1
06/10/1964     1
17/09/1980     1
Name: dob, Length: 7485, dtype: int64

In [11]:
def dp_dob_hist(epsilon):
    """Return `dob_hist()` with Laplace noise (sensitivity 1) added to every count.

    Only the counts are perturbed; the index (the set of `dob` values) is
    passed through from `dob_hist()` unchanged.
    """
    def add_noise(count):
        return laplace_mech(count, 1, epsilon)

    true_counts = dob_hist()
    ## author's note: total privacy cost is epsilon, by parallel composition
    return true_counts.apply(add_noise)
dp_dob_hist(epsilon)

21/11/1991    21.302139
10/11/1988    20.833655
28/04/1992    15.478421
13/08/1994    15.203788
27/04/1989    14.011664
                ...    
28/07/1962     1.182656
14/01/1963    -0.173085
22/01/1966     2.482704
06/10/1964     1.429896
17/09/1980    -2.635249
Name: dob, Length: 7485, dtype: float64

In [14]:
# def dp_crosstab_education_sex(epsilon):
#     ct = pd.crosstab(adult['dob'], adult['race'])
#     return ct.applymap(lambda x: laplace_mech(x, 1, epsilon))

# dp_crosstab_education_sex(1.0)

print (pct_error(dob_hist(),dp_dob_hist(epsilon)))   #percentage error

21/11/1991      2.929169
10/11/1988      2.749373
28/04/1992     12.984403
13/08/1994     14.371329
27/04/1989      1.022228
                 ...    
28/07/1962     35.810052
14/01/1963    278.729076
22/01/1966    182.181547
06/10/1964     47.748760
17/09/1980    288.419303
Name: dob, Length: 7485, dtype: float64


# Age + Laplace

In [17]:
def age_hist():
    """Return the number of rows of `adult` per distinct `age` value."""
    age_column = adult['age']
    return age_column.value_counts()
age_hist() 
# age + laplace
def dp_age_hist(epsilon):
    """Return `age_hist()` with Laplace noise (sensitivity 1) added to every count.

    Only the counts are perturbed; the index (the set of `age` values) is
    passed through from `age_hist()` unchanged.
    """
    def add_noise(count):
        return laplace_mech(count, 1, epsilon)

    true_counts = age_hist()
    ## author's note: total privacy cost is epsilon, by parallel composition
    return true_counts.apply(add_noise)
dp_age_hist(epsilon)

22    911.711218
26    911.364596
24    912.592541
21    899.356342
25    891.871366
         ...    
78      1.451481
83      2.612226
96      3.220700
80      0.816031
79      1.624139
Name: age, Length: 65, dtype: float64

In [47]:
# age_clipped + laplace
b=90 # age up to 90

real_sum = adult['age'].sum()
def clip_sum():
    """Return the age histogram with Laplace noise added to every count.

    The result is `age_hist()` with each count passed through
    `laplace_mech(count, 1, epsilon)`, using the module-level `epsilon`.

    NOTE(review): despite its name, this function has never released a
    clipped sum. It used to compute `adult['age'].clip(lower=0, upper=b).sum()`
    and then discard it; that dead computation is removed here. The returned
    noisy histogram is deliberately unchanged, because the notebook compares
    it with `age_hist()` through `pct_error`.

    TODO: if a differentially private clipped sum is what was intended, it
    would look like
    `laplace_mech(adult['age'].clip(lower=0, upper=b).sum(), b, epsilon)`.
    That returns a scalar instead of a Series, so the callers would have to
    change with it.
    """
    # The value passed as sensitivity is 1; the earlier "sensitivity = 0.1"
    # remark did not match the code.
    dp_age_counts = age_hist().apply(lambda x: laplace_mech(x, 1, epsilon))
    return dp_age_counts
clip_sum()

22    915.057285
26    906.636384
24    909.813823
21    898.002081
25    890.672330
         ...    
78      1.262864
83      3.378997
96      2.380283
80      1.125630
79     -0.386954
Name: age, Length: 65, dtype: float64

In [48]:
print (pct_error(age_hist(),clip_sum()))   #percentage error

22      0.031469
26      0.089396
24      0.040686
21      0.136537
25      0.275431
         ...    
78     18.052080
83     83.396201
96      3.060180
80    153.785961
79     44.080214
Name: age, Length: 65, dtype: float64
