In [228]:
# Final Project
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Laplace mechanism for a single value.

    Args:
        v: The true (non-private) answer.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter; the noise scale is ``sensitivity / epsilon``.

    Returns:
        ``v`` plus one draw from a zero-centred Laplace distribution.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Laplace mechanism applied element-wise to a vector of answers.

    Every element receives its own independent Laplace draw with scale
    ``sensitivity / epsilon``.

    Args:
        vec: Iterable of true answers.
        sensitivity: Sensitivity used for the noise scale of every element.
        epsilon: Privacy parameter used for the noise scale of every element.

    Returns:
        A list with one noisy value per element of ``vec``, in the same order.
    """
    noisy_values = []
    for true_value in vec:
        # The scale is evaluated per element, so an empty ``vec`` never touches it.
        draw = np.random.laplace(loc=0, scale=sensitivity / epsilon)
        noisy_values.append(true_value + draw)
    return noisy_values

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Gaussian mechanism for a single value.

    Args:
        v: The true (non-private) answer.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter epsilon.
        delta: Privacy parameter delta.

    Returns:
        ``v`` plus one zero-mean normal draw with standard deviation
        ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Gaussian mechanism applied element-wise to a vector of answers.

    Every element receives its own independent zero-mean normal draw with
    standard deviation ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        vec: Iterable of true answers.
        sensitivity: Sensitivity used for the noise scale of every element.
        epsilon: Privacy parameter epsilon.
        delta: Privacy parameter delta.

    Returns:
        A list with one noisy value per element of ``vec``, in the same order.
    """
    noisy_values = []
    for true_value in vec:
        # Sigma is evaluated per element, so an empty ``vec`` never touches it.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        noisy_values.append(true_value + np.random.normal(loc=0, scale=sigma))
    return noisy_values

def pct_error(orig, priv):
    """Return the percent error of ``priv`` relative to the true value ``orig``.

    The result is ``|orig - priv| / |orig| * 100``. The magnitude of ``orig`` is
    used in the denominator so the error is non-negative even when the true
    value is negative.

    Args:
        orig: True value (scalar or numpy array).
        priv: Noisy / private value (same shape as ``orig``).

    Returns:
        The percent error, element-wise for array inputs. ``orig == 0`` is not
        handled specially: the division by zero is left to the input types.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# SOURCED FROM TEXTBOOK 'VARIANTS OF DIFFERENTIAL PRIVACY' SECTION - adjusted from vector format
def gaussian_mech_RDP(v, sensitivity, alpha, epsilon_bar):
    """Gaussian mechanism for a single value, calibrated by Renyi DP parameters.

    Args:
        v: The true (non-private) answer.
        sensitivity: Sensitivity of the query that produced ``v``.
        alpha: Renyi order.
        epsilon_bar: Renyi privacy parameter.

    Returns:
        ``v`` plus one zero-mean normal draw whose variance is
        ``sensitivity**2 * alpha / (2 * epsilon_bar)``.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon_bar)
    sigma = np.sqrt(variance)
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def convert_RDP_ED(alpha, epsilon_bar, delta):
    """Convert an (alpha, epsilon_bar) Renyi DP guarantee to an epsilon for a given delta.

    Formula (from the textbook): ``epsilon_bar + ln(1 / delta) / (alpha - 1)``.

    Args:
        alpha: Renyi order.
        epsilon_bar: Renyi privacy parameter.
        delta: Target delta of the (epsilon, delta) guarantee.

    Returns:
        The epsilon given by the formula above.
    """
    conversion_cost = np.log(1 / delta) / (alpha - 1)
    return epsilon_bar + conversion_cost

# Load the raw survey responses; the path is relative to the notebook's working directory.
df = pd.read_csv('survey.csv')


In [229]:
def gender_data_cleaning(df):
    """Collapse the free-text ``Gender`` column into "Male", "Female" or "Other".

    The mapping is positional: the i-th label in ``map_to`` is applied to the
    i-th entry of ``df['Gender'].unique()``. It is therefore only valid for the
    exact dataset it was written against. The length assertion catches a
    different number of distinct values, but NOT the same number of values
    appearing in a different order.

    Args:
        df: DataFrame with a ``Gender`` column. It is not modified.

    Returns:
        A copy of ``df`` whose ``Gender`` column holds the mapped labels.

    Raises:
        AssertionError: If the number of distinct ``Gender`` values differs from
            the number of hand-written labels.
    """
    # Data cleaning for columns we need
    unique_genders = df['Gender'].unique()

    m = "Male"
    f = "Female"
    o = "Other"
    map_to = [f, m, m, m, f, m, m, m, f, f, f, m, m, f, f, m, m, o, o, f, f, m, o, o, o, o, o, f, o, o, f, m, m, m, m, f, m, o, f, o, f, m, m, o, m, o, f, m, m]
    assert len(unique_genders) == len(map_to), "Mapping mismatch in gender data cleaning"

    # Zip together the mapping
    gender_mapping = dict(zip(unique_genders, map_to))

    # Change genders on a copy so the caller's frame keeps its raw values.
    cleaned = df.copy()
    cleaned['Gender'] = cleaned['Gender'].map(gender_mapping)

    return cleaned

def age_data_cleaning(df):
    """Keep only the rows whose ``Age`` lies between 18 and 90 (both inclusive).

    Rows outside that range are dropped, as this should be data on the tech
    industry.

    Args:
        df: DataFrame with an ``Age`` column.

    Returns:
        The rows of ``df`` with ``18 <= Age <= 90``; original index labels are kept.
    """
    plausible_age = df['Age'].between(18, 90)
    return df[plausible_age]

def yes_no_to_true_false(df, column):
    """Convert a "Yes"/"No" column to True/False.

    Any value other than exactly "Yes" or "No" (including missing values)
    becomes NaN, because ``Series.map`` yields NaN for keys absent from the
    mapping.

    Args:
        df: Source DataFrame. It is not modified.
        column: Name of the column to convert.

    Returns:
        A copy of ``df`` with ``column`` converted.
    """
    yes_no_mapping = {"Yes" : True, "No" : False}
    # Work on a copy: assigning into the argument would silently rewrite the
    # caller's frame.
    converted = df.copy()
    converted[column] = converted[column].map(yes_no_mapping)
    return converted

def drop_cols(df):
    """Return ``df`` without the ``state`` and ``comments`` columns.

    Args:
        df: DataFrame containing both columns (``drop`` raises if one is missing).

    Returns:
        A new DataFrame with the two columns removed.
    """
    unused_columns = ['state', 'comments']
    return df.drop(columns=unused_columns)

def fill_na(df, col, val):
    """Replace the missing values of one column with ``val``.

    Args:
        df: Source DataFrame. It is not modified.
        col: Name of the column to fill.
        val: Replacement for missing entries.

    Returns:
        A copy of ``df`` in which ``col`` has no missing values left.
    """
    # Work on a copy: assigning into the argument would silently rewrite the
    # caller's frame.
    filled = df.copy()
    filled[col] = filled[col].fillna(val)
    return filled

def maybe_to_yes(df, col):
    """Fold "Maybe" answers into "Yes" for one column.

    "Yes" and "No" are kept as they are. Any other value (including missing
    values) becomes NaN, because ``Series.map`` yields NaN for keys absent
    from the mapping.

    Args:
        df: Source DataFrame. It is not modified.
        col: Name of the column to remap.

    Returns:
        A copy of ``df`` with ``col`` remapped.
    """
    maybe_mapping = {"Yes": "Yes", "Maybe" : "Yes", "No" : "No"}
    # Work on a copy: assigning into the argument would silently rewrite the
    # caller's frame.
    remapped = df.copy()
    remapped[col] = remapped[col].map(maybe_mapping)
    return remapped
    
# Cleaning pipeline: every step receives the frame returned by the previous one.
df = (
    df
    .pipe(gender_data_cleaning)
    .pipe(age_data_cleaning)
    .pipe(drop_cols)
    # Fill na
    .pipe(fill_na, 'work_interfere', 'Never')
    .pipe(fill_na, 'self_employed', 'No')
    # Map maybe for consequence to yes (must run before the yes/no conversion below)
    .pipe(maybe_to_yes, 'mental_health_consequence')
    # Convert yes / no to true / false
    .pipe(yes_no_to_true_false, 'treatment')
    .pipe(yes_no_to_true_false, 'family_history')
    .pipe(yes_no_to_true_false, 'self_employed')
    .pipe(yes_no_to_true_false, 'remote_work')
    .pipe(yes_no_to_true_false, 'tech_company')
    .pipe(yes_no_to_true_false, 'mental_health_consequence')
    .pipe(yes_no_to_true_false, 'obs_consequence')
)


In [230]:
# General thoughts for project outline
# Maybe let's just focus on releasing what percentage of individuals have sought treatment for mental illness. That is just 2 counting queries, so it would be easy?
# Maybe based on family history?

In [231]:
# Create workload: each of these boolean columns is paired with 'treatment'
colsToCompare = [
    'self_employed',
    'family_history',
    'remote_work',
    'tech_company',
    'mental_health_consequence',
    'obs_consequence',
]

# Make query
def association_query(df, col1, col2):
    return len(df[(df[col1] == True) & (df[col2] == True)])

# One (treatment, column) pair per compared column
workload = [('treatment', compared) for compared in colsToCompare]

# True (non-private) answer for every pair in the workload
real_answers = [association_query(df, *pair) for pair in workload]

print(workload)
print(real_answers)


[('treatment', 'self_employed'), ('treatment', 'family_history'), ('treatment', 'remote_work'), ('treatment', 'tech_company'), ('treatment', 'mental_health_consequence'), ('treatment', 'obs_consequence')]
[75, 362, 195, 510, 422, 125]


In [242]:
# Epsilon DP
def workload_laplace(workload, epsilon):
    """Answer every query in the workload with its own Laplace mechanism call.

    The total ``epsilon`` is divided evenly among the queries, and each count is
    released with sensitivity 1. The true counts are computed on the global ``df``.

    Args:
        workload: List of ``(col1, col2)`` column-name pairs.
        epsilon: Total privacy budget for the whole workload.

    Returns:
        A list of noisy counts, one per pair, in workload order.
    """
    # Determine individual i's
    per_query_epsilon = epsilon / len(workload)

    true_counts = []
    for first_col, second_col in workload:
        true_counts.append(association_query(df, first_col, second_col))

    return [laplace_mech(count, sensitivity=1, epsilon=per_query_epsilon)
            for count in true_counts]

workload_laplace(workload, 0.1)
    

[54.29376765181739,
 326.9781693493864,
 190.5037374230667,
 522.5326223400823,
 379.7297031659281,
 67.25580080073284]

In [247]:
# Absolute error of one fresh Laplace (split-budget) release at epsilon = 0.1
errors = [
    abs(true_count - noisy_count)
    for true_count, noisy_count in zip(real_answers, workload_laplace(workload, 0.1))
]
print('Average absolute error:', np.mean(errors))

Average absolute error: 76.38737710196779


In [244]:
# Epsilon DP - vectorized Laplace mechanism (Laplace noise gives pure epsilon-DP; no delta is involved here)
def workload_laplace_vec(workload, epsilon):
    """Answer the whole workload with one vector Laplace mechanism call.

    The full ``epsilon`` is used for every element, and the noise scale is
    calibrated to the L1 sensitivity of the answer vector. The true counts are
    computed on the global ``df``.

    Args:
        workload: List of ``(col1, col2)`` column-name pairs.
        epsilon: Privacy budget for the whole vector.

    Returns:
        A list of noisy counts, one per pair, in workload order.
    """
    true_counts = []
    for first_col, second_col in workload:
        true_counts.append(association_query(df, first_col, second_col))

    # L1 global sensitivity is the sum of the per-query sensitivities.
    # Each counting query has sensitivity 1, so the L1 sensitivity is the number of queries.
    l1_sensitivity = len(true_counts)
    return laplace_mech_vec(true_counts, sensitivity=l1_sensitivity, epsilon=epsilon)

workload_laplace_vec(workload, 0.1)

[133.12888148748098,
 350.7386573682376,
 207.7164229051883,
 487.98906489353146,
 414.5467567460243,
 142.63891845362744]

In [248]:
# Absolute error of one fresh vector-Laplace release at epsilon = 0.1
errors = [
    abs(true_count - noisy_count)
    for true_count, noisy_count in zip(real_answers, workload_laplace_vec(workload, 0.1))
]
print('Average absolute error:', np.mean(errors))

Average absolute error: 43.17463414492229


In [249]:
def workload_gaussian_vec(workload, epsilon, delta):
    """Answer the whole workload with one vector Gaussian mechanism call.

    The noise scale is calibrated to the L2 sensitivity of the answer vector
    and to the given ``epsilon`` and ``delta``. The true counts are computed on
    the global ``df``.

    Args:
        workload: List of ``(col1, col2)`` column-name pairs.
        epsilon: Privacy parameter epsilon for the whole vector.
        delta: Privacy parameter delta for the whole vector.

    Returns:
        A list of noisy counts, one per pair, in workload order.
    """
    true_counts = []
    for first_col, second_col in workload:
        true_counts.append(association_query(df, first_col, second_col))

    # L2 global sensitivity: square the per-query sensitivities, add them up and
    # take the square root. Each counting query has sensitivity 1, so this is
    # sqrt(number of queries).
    l2_sensitivity = np.sqrt(len(true_counts))
    return gaussian_mech_vec(true_counts, sensitivity=l2_sensitivity, epsilon=epsilon, delta=delta)

workload_gaussian_vec(workload, 0.1, 1e-5)

[28.641905527861027,
 494.2590118303943,
 462.6170545448808,
 556.1634949396387,
 384.9666926083772,
 302.2073776311055]

In [256]:
# Absolute error of one fresh vector-Gaussian release at epsilon = 0.1, delta = 1e-5
errors = [
    abs(true_count - noisy_count)
    for true_count, noisy_count in zip(real_answers, workload_gaussian_vec(workload, 0.1, 1e-5))
]
print('Average absolute error:', np.mean(errors))

Average absolute error: 88.96677667979709


In [257]:
# Renyi DP
# SOURCED FROM TEXTBOOK 'VARIANTS OF DIFFERENTIAL PRIVACY' SECTION
def gaussian_mech_RDP_vec(vec, sensitivity, alpha, epsilon_bar):
    """Gaussian mechanism, calibrated by Renyi DP parameters, applied to a vector.

    Every element receives its own independent zero-mean normal draw whose
    variance is ``sensitivity**2 * alpha / (2 * epsilon_bar)``.

    Args:
        vec: Iterable of true answers.
        sensitivity: Sensitivity used for the noise scale.
        alpha: Renyi order.
        epsilon_bar: Renyi privacy parameter.

    Returns:
        A list with one noisy value per element of ``vec``, in the same order.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon_bar)
    sigma = np.sqrt(variance)

    noisy_values = []
    for true_value in vec:
        noisy_values.append(true_value + np.random.normal(loc=0, scale=sigma))
    return noisy_values

def workload_gaussian_vec_RDP(workload, alpha, epsilon_bar):
    """Answer the whole workload with one Renyi-DP-calibrated Gaussian call.

    The noise scale is calibrated to the L2 sensitivity of the answer vector
    and to the given ``alpha`` and ``epsilon_bar``. The true counts are
    computed on the global ``df``.

    Args:
        workload: List of ``(col1, col2)`` column-name pairs.
        alpha: Renyi order.
        epsilon_bar: Renyi privacy parameter for the whole vector.

    Returns:
        A list of noisy counts, one per pair, in workload order.
    """
    true_counts = []
    for first_col, second_col in workload:
        true_counts.append(association_query(df, first_col, second_col))

    # L2 global sensitivity is used here as well: square the per-query
    # sensitivities, add them up and take the square root. Each counting query
    # has sensitivity 1, so this is sqrt(number of queries).
    l2_sensitivity = np.sqrt(len(true_counts))
    return gaussian_mech_RDP_vec(true_counts, sensitivity=l2_sensitivity, alpha=alpha, epsilon_bar=epsilon_bar)

workload_gaussian_vec_RDP(workload, 5, 0.1)


[77.79243621353002,
 378.2934531682524,
 171.38686673951713,
 499.6250885254848,
 413.4182496551203,
 135.91234258716474]

In [260]:
# Absolute error of one fresh RDP-Gaussian release at alpha = 5, epsilon_bar = 0.1.
# NOTE: epsilon_bar is a Renyi parameter, not an epsilon. Converting with
# epsilon_bar + ln(1/delta)/(alpha - 1) (see convert_RDP_ED) at delta = 1e-5 gives
# epsilon of roughly 2.98, so this error is not directly comparable with the
# epsilon = 0.1 runs of the other mechanisms.
errors = [
    abs(true_count - noisy_count)
    for true_count, noisy_count in zip(real_answers, workload_gaussian_vec_RDP(workload, 5, 0.1))
]
print('Average absolute error:', np.mean(errors))

Average absolute error: 10.27736736205218


In [237]:
# Synthetic Data DP