In [189]:
# Final Project
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed by one draw of zero-centred Laplace noise.

    Args:
        v: True query answer to perturb.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter; the noise scale is ``sensitivity / epsilon``.

    Returns:
        ``v`` plus the sampled noise.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed by one draw of zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        v: True query answer to perturb.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter epsilon.
        delta: Privacy parameter delta.

    Returns:
        ``v`` plus the sampled noise.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def pct_error(orig, priv):
    """Return the absolute percent error of ``priv`` relative to ``orig``.

    The magnitude of ``orig`` is used as the denominator so the result is
    non-negative even when the true value is negative.

    Args:
        orig: True (non-private) value; must be non-zero.
        priv: Noisy (private) value being evaluated.

    Returns:
        ``|orig - priv| / |orig| * 100``.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# SOURCED FROM TEXTBOOK 'VARIANTS OF DIFFERENTIAL PRIVACY' SECTION - adjusted from vector format
def gaussian_mech_RDP(v, sensitivity, alpha, epsilon_bar):
    """Return ``v`` plus Gaussian noise sized from the RDP parameters.

    The noise variance is ``sensitivity**2 * alpha / (2 * epsilon_bar)``.

    Args:
        v: True (scalar) query answer to perturb.
        sensitivity: Sensitivity of the query that produced ``v``.
        alpha: Renyi order.
        epsilon_bar: Renyi privacy parameter.

    Returns:
        ``v`` plus one zero-mean Gaussian draw.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon_bar)
    sigma = np.sqrt(variance)
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def convert_RDP_ED(alpha, epsilon_bar, delta):
    """Convert an (alpha, epsilon_bar) Renyi guarantee to an epsilon for a given delta.

    Args:
        alpha: Renyi order; must not equal 1 (it appears as ``alpha - 1`` in a denominator).
        epsilon_bar: Renyi privacy parameter.
        delta: Target delta of the converted guarantee.

    Returns:
        ``epsilon_bar + ln(1 / delta) / (alpha - 1)``.
    """
    # From textbook
    conversion_cost = np.log(1 / delta) / (alpha - 1)
    return epsilon_bar + conversion_cost

# Load the raw survey responses; the path is relative to the notebook's working directory.
df = pd.read_csv('survey.csv')


In [190]:
def gender_data_cleaning(df):
    """Collapse the raw 'Gender' answers into "Male", "Female" or "Other".

    The mapping is positional: ``map_to[i]`` is the label given to the i-th
    value returned by ``df['Gender'].unique()``. It is therefore only valid
    for the raw dataset (and row order) it was written against. The assertion
    catches a change in the number of distinct answers, but it cannot detect
    a reordering.

    Args:
        df: Frame containing a raw 'Gender' column.

    Returns:
        A new DataFrame with 'Gender' replaced by the cleaned labels. The
        input frame is left unmodified.
    """
    # Data cleaning for columns we need
    unique_genders = df['Gender'].unique()

    m = "Male"
    f = "Female"
    o = "Other"
    map_to = [f, m, m, m, f, m, m, m, f, f, f, m, m, f, f, m, m, o, o, f, f, m, o, o, o, o, o, f, o, o, f, m, m, m, m, f, m, o, f, o, f, m, m, o, m, o, f, m, m]
    assert len(unique_genders) == len(map_to), "Mapping mismatch in gender data cleaning"

    # Zip together the mapping
    gender_mapping = dict(zip(unique_genders, map_to))

    # Change genders on a copy so the caller's frame is not mutated
    cleaned = df.copy()
    cleaned['Gender'] = cleaned['Gender'].map(gender_mapping)

    return cleaned

def age_data_cleaning(df):
    """Keep only the rows whose 'Age' lies in the inclusive range 18 to 90.

    Ages outside that range are dropped, as this should be data on the tech
    industry.

    Args:
        df: Frame containing a numeric 'Age' column.

    Returns:
        The filtered rows of ``df``, with the original index labels.
    """
    plausible_age = df['Age'].between(18, 90)
    return df[plausible_age]

def yes_no_to_true_false(df, column):
    """Convert the "Yes"/"No" answers in ``column`` to True/False.

    Any value other than exactly "Yes" or "No" becomes NaN, because it has no
    entry in the mapping.

    Args:
        df: Frame containing ``column``.
        column: Name of the column to convert.

    Returns:
        A new DataFrame with ``column`` converted. The input frame is left
        unmodified.
    """
    yes_no_mapping = {"Yes" : True, "No" : False}
    # Work on a copy so the caller's frame is not mutated.
    converted = df.copy()
    converted[column] = converted[column].map(yes_no_mapping)
    return converted

def drop_cols(df):
    """Return ``df`` without the 'state' and 'comments' columns.

    Args:
        df: Frame containing both columns (a missing one raises ``KeyError``).

    Returns:
        A new DataFrame lacking those two columns.
    """
    unused_columns = ['state', 'comments']
    return df.drop(columns=unused_columns)

def fill_na(df, col, val):
    """Replace missing values in column ``col`` with ``val``.

    Args:
        df: Frame containing ``col``.
        col: Name of the column to fill.
        val: Replacement for missing entries.

    Returns:
        A new DataFrame with ``col`` filled. The input frame is left
        unmodified.
    """
    # Work on a copy so the caller's frame is not mutated.
    filled = df.copy()
    filled[col] = filled[col].fillna(val)
    return filled

def maybe_to_yes(df, col):
    """Fold "Maybe" answers in ``col`` into "Yes", leaving "Yes" and "No" as-is.

    Any value other than "Yes", "Maybe" or "No" becomes NaN, because it has
    no entry in the mapping.

    Args:
        df: Frame containing ``col``.
        col: Name of the column to remap.

    Returns:
        A new DataFrame with ``col`` remapped. The input frame is left
        unmodified.
    """
    maybe_mapping = {"Yes": "Yes", "Maybe" : "Yes", "No" : "No"}
    # Work on a copy so the caller's frame is not mutated.
    remapped = df.copy()
    remapped[col] = remapped[col].map(maybe_mapping)
    return remapped
    
# Run every cleaning step in order as one pipeline. Each .pipe(f, *args) call
# evaluates f(<current frame>, *args) and passes the result to the next step.
df = (
    df
    .pipe(gender_data_cleaning)
    .pipe(age_data_cleaning)
    .pipe(drop_cols)
    # Fill na
    .pipe(fill_na, 'work_interfere', 'Never')
    .pipe(fill_na, 'self_employed', 'No')
    # Map maybe for consequence to yes (must run before the yes/no conversion)
    .pipe(maybe_to_yes, 'mental_health_consequence')
    # Convert yes / no to true / false
    .pipe(yes_no_to_true_false, 'treatment')
    .pipe(yes_no_to_true_false, 'family_history')
    .pipe(yes_no_to_true_false, 'self_employed')
    .pipe(yes_no_to_true_false, 'remote_work')
    .pipe(yes_no_to_true_false, 'tech_company')
    .pipe(yes_no_to_true_false, 'mental_health_consequence')
    .pipe(yes_no_to_true_false, 'obs_consequence')
)


In [191]:
# General thoughts for project outline
# Maybe let's just focus on releasing what percentage of individuals have sought treatment for mental illness, which is just 2 counting queries, so it would be easy?
# Maybe based on family history?

Index(['Timestamp', 'Age', 'Gender', 'Country', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [206]:
# Create workload: each of these boolean columns is paired with 'treatment'
# in a co-occurrence counting query below.
colsToCompare = ['self_employed', 'family_history', 'remote_work', 'tech_company', 'mental_health_consequence', 'obs_consequence']

# Make query
def association_query(df, col1, col2):
    """Count the rows of ``df`` in which both ``col1`` and ``col2`` equal True.

    Args:
        df: Frame containing both columns.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        The number of matching rows, as an ``int``.
    """
    both_true = df[col1].eq(True) & df[col2].eq(True)
    return int(both_true.sum())

# One true (non-private) co-occurrence count per column in colsToCompare, in the same order.
workload = [association_query(df, 'treatment', col) for col in colsToCompare]


In [193]:
# Epsilon DP - Seek Treatment
# The true counts and the budget do not change between trials, so set them once.
true_num = df['treatment'].sum()
total_people = df['treatment'].count()

# Total budget: each of the two noisy counts below is released with epsilon/2.
epsilon = 0.1

# Accumulate the percent error of the noisy treatment rate over 100 trials.
error_total = 0
for _ in range(100):
    noisy_num = laplace_mech(true_num, 1, epsilon/2)
    noisy_total = laplace_mech(total_people, 1, epsilon/2)
    error_total += pct_error(true_num / total_people, noisy_num / noisy_total)

# Mean percent error across the trials
print(error_total / 100)

# Details of the final trial only
print(f"True: {true_num / total_people} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
print(f"Error: {pct_error(true_num / total_people, noisy_num / noisy_total)}")

3.0840027610312406
True: 0.5051958433253397 
Noisy: 0.4678861899274186
True: 632, Noisy: 622.9217543599367
Error: 7.3851861393669855


In [194]:
# Epsilon DP - Seek Treatment with family history
# The true counts and the budget do not change between trials, so set them once.
# Numerator: respondents with a family history who sought treatment.
true_num = df[(df['treatment'] == True) & (df['family_history'] == True)]['treatment'].count()
# Denominator: all respondents with a family history.
total_people = df[(df['family_history'] == True)]['treatment'].count()

# Total budget: each of the two noisy counts below is released with epsilon/2.
epsilon = 0.1

# Accumulate the percent error of the noisy conditional rate over 100 trials.
total_error = 0
for _ in range(100):
    noisy_num = laplace_mech(true_num, 1, epsilon/2)
    noisy_total = laplace_mech(total_people, 1, epsilon/2)
    total_error += pct_error(true_num / total_people, noisy_num / noisy_total)

# Mean percent error across the trials
print(total_error / 100)

# Details of the final trial only
print(f"True: {true_num / total_people} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
# Report the final-trial error like the other experiment cells (was an empty print).
print(f"Error: {pct_error(true_num / total_people, noisy_num / noisy_total)}")

6.285149221552242
True: 0.7402862985685071 
Noisy: 0.6245901707065193
True: 362, Noisy: 356.3327113742949



In [195]:
# Epsilon Delta DP - Seek treatment
# The true counts and the budget do not change between trials, so set them once.
true_num = df['treatment'].sum()
total_people = df['treatment'].count()

# Each of the two noisy counts below is released with epsilon/2 and delta = 1e-5.
epsilon = 0.1

# Accumulate the percent error of the noisy treatment rate over 100 trials.
error_total = 0
for _ in range(100):
    noisy_num = gaussian_mech(true_num, 1, epsilon/2, 1e-5)
    noisy_total = gaussian_mech(total_people, 1, epsilon/2, 1e-5)
    error_total += pct_error(true_num / total_people, noisy_num / noisy_total)

# Mean percent error across the trials
print(error_total / 100)

# Details of the final trial only
print(f"True: {true_num / total_people} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
print(f"Error: {pct_error(true_num / total_people, noisy_num / noisy_total)}")

12.181112133455024
True: 0.5051958433253397 
Noisy: 0.6628944772433413
True: 632, Noisy: 735.8012421284712
Error: 31.215346682186695


In [196]:
# Renyi DP


In [197]:
# Synthetic Data DP