In [161]:
# Final Project
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed with zero-centred Laplace noise.

    Args:
        v: The value to perturb.
        sensitivity: Numerator of the noise scale.
        epsilon: Denominator of the noise scale.

    Returns:
        ``v`` plus one draw from ``Laplace(0, sensitivity / epsilon)`` taken
        from NumPy's global random state.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed with zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        v: The value to perturb.
        sensitivity: Multiplier applied to the noise standard deviation.
        epsilon: Divisor applied to the noise standard deviation.
        delta: Enters the standard deviation through ``ln(1.25 / delta)``.

    Returns:
        ``v`` plus one normal draw taken from NumPy's global random state.
    """
    log_term = np.log(1.25/delta)
    sigma = sensitivity * np.sqrt(2*log_term) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def pct_error(orig, priv):
    """Return the absolute percent error of ``priv`` relative to ``orig``.

    Args:
        orig: The reference (true) value.
        priv: The value being compared against the reference.

    Returns:
        ``|orig - priv| / |orig| * 100``. The result is never negative.
        An ``orig`` of zero is not special-cased, so the division behaves
        however NumPy handles a zero denominator.
    """
    # Normalise by the magnitude of the reference so that a negative
    # reference value cannot flip the sign of the reported error.
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# SOURCED FROM TEXTBOOK 'VARIANTS OF DIFFERENTIAL PRIVACY' SECTION - adjusted from vector format
def gaussian_mech_RDP(v, sensitivity, alpha, epsilon_bar):
    """Return ``v`` perturbed with Gaussian noise calibrated from RDP parameters.

    The noise variance is ``sensitivity**2 * alpha / (2 * epsilon_bar)``.

    Args:
        v: The scalar value to perturb.
        sensitivity: Enters the variance squared.
        alpha: Multiplies the variance.
        epsilon_bar: Divides the variance (together with a factor of 2).

    Returns:
        ``v`` plus one zero-mean normal draw taken from NumPy's global
        random state.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon_bar)
    noise = np.random.normal(loc=0, scale=np.sqrt(variance))
    return v + noise

def convert_RDP_ED(alpha, epsilon_bar, delta):
    """Return ``epsilon_bar + ln(1 / delta) / (alpha - 1)``.

    This is the textbook conversion the notebook uses to turn RDP parameters
    ``(alpha, epsilon_bar)`` into an epsilon for a chosen ``delta``.

    Args:
        alpha: RDP order. ``alpha == 1`` makes the denominator zero.
        epsilon_bar: RDP epsilon.
        delta: Target delta.

    Returns:
        The converted epsilon.
    """
    log_inverse_delta = np.log(1 / delta)
    conversion_cost = log_inverse_delta / (alpha - 1)
    return epsilon_bar + conversion_cost  # From textbook

# Load the survey responses into `df`; the relative path is resolved against
# the current working directory.
df = pd.read_csv('survey.csv')


['Female' 'M' 'Male' 'male' 'female' 'm' 'Male-ish' 'maile' 'Trans-female'
 'Cis Female' 'F' 'something kinda male?' 'Cis Male' 'Woman' 'f' 'Mal'
 'Male (CIS)' 'queer/she/they' 'non-binary' 'Femake' 'woman' 'Make' 'Nah'
 'All' 'Enby' 'fluid' 'Genderqueer' 'Female ' 'Androgyne' 'Agender'
 'cis-female/femme' 'Guy (-ish) ^_^' 'male leaning androgynous' 'Male '
 'Man' 'Trans woman' 'msle' 'Neuter' 'Female (trans)' 'queer'
 'Female (cis)' 'Mail' 'cis male' 'A little about you' 'Malr' 'p' 'femail'
 'Cis Man' 'ostensibly male, unsure what that really means']


In [162]:
def gender_data_cleaning(df):
    """Collapse free-text ``Gender`` answers into "Male", "Female" or "Other".

    Every raw answer is looked up in an explicit table, so the result does not
    depend on the order in which answers happen to appear in ``df``. The three
    cleaned labels map to themselves, which makes it harmless to clean an
    already-cleaned frame (for example when the cell is re-run).

    Args:
        df: DataFrame with a ``Gender`` column.

    Returns:
        A copy of ``df`` whose ``Gender`` column holds the normalised labels.
        ``df`` itself is not modified. Missing values stay missing.

    Raises:
        ValueError: If ``Gender`` contains a non-missing answer that the table
            does not cover.
    """
    male = "Male"
    female = "Female"
    other = "Other"

    # Raw answer -> label. Keys are the survey's distinct Gender answers
    # (trailing spaces are significant), paired with the label each one was
    # previously assigned by position.
    gender_mapping = {
        'Female': female,
        'M': male,
        'Male': male,
        'male': male,
        'female': female,
        'm': male,
        'Male-ish': male,
        'maile': male,
        'Trans-female': female,
        'Cis Female': female,
        'F': female,
        'something kinda male?': male,
        'Cis Male': male,
        'Woman': female,
        'f': female,
        'Mal': male,
        'Male (CIS)': male,
        'queer/she/they': other,
        'non-binary': other,
        'Femake': female,
        'woman': female,
        'Make': male,
        'Nah': other,
        'All': other,
        'Enby': other,
        'fluid': other,
        'Genderqueer': other,
        'Female ': female,
        'Androgyne': other,
        'Agender': other,
        'cis-female/femme': female,
        'Guy (-ish) ^_^': male,
        'male leaning androgynous': male,
        'Male ': male,
        'Man': male,
        'Trans woman': female,
        'msle': male,
        'Neuter': other,
        'Female (trans)': female,
        'queer': other,
        'Female (cis)': female,
        'Mail': male,
        'cis male': male,
        'A little about you': other,
        'Malr': male,
        'p': other,
        'femail': female,
        'Cis Man': male,
        'ostensibly male, unsure what that really means': male,
        # Already-cleaned label, so a second pass is a no-op.
        other: other,
    }

    # Fail loudly on answers the table does not know instead of silently
    # turning them into NaN.
    unknown = [
        answer for answer in df['Gender'].unique()
        if pd.notna(answer) and answer not in gender_mapping
    ]
    if unknown:
        raise ValueError(
            f"Mapping mismatch in gender data cleaning: unmapped values {unknown}"
        )

    # Work on a copy so the caller's frame is left as it was.
    cleaned = df.copy()
    cleaned['Gender'] = cleaned['Gender'].map(gender_mapping)
    return cleaned

def age_data_cleaning(df):
    """Keep only the rows whose ``Age`` lies in the inclusive range 18 to 90.

    Ages outside that range are treated as implausible for a tech-industry
    survey and the corresponding rows are dropped.

    Args:
        df: DataFrame with an ``Age`` column.

    Returns:
        The rows of ``df`` that pass the filter, with their original index
        labels.
    """
    plausible_age = df['Age'].between(18, 90)
    return df[plausible_age]

def yes_no_to_true_false(df):
    """Convert the ``family_history`` and ``treatment`` answers to booleans.

    "Yes" becomes True and "No" becomes False. Values that are already
    booleans are kept, so applying the function to an already-converted frame
    does not wipe the columns out. Any other value becomes NaN.

    Args:
        df: DataFrame with ``family_history`` and ``treatment`` columns.

    Returns:
        A copy of ``df`` with both columns converted. ``df`` itself is not
        modified, so it is safe to pass a filtered slice of another frame.
    """
    yes_no_mapping = {"Yes" : True, "No" : False}

    def to_bool(answer):
        # Booleans pass through unchanged; this is what makes a re-run safe.
        if isinstance(answer, (bool, np.bool_)):
            return bool(answer)
        return yes_no_mapping.get(answer, np.nan)

    # Assign into a private copy rather than into the caller's frame.
    converted = df.copy()
    for col in ('family_history', 'treatment'):
        converted[col] = converted[col].map(to_bool)
    return converted

def drop_cols(df):
    """Return ``df`` without the ``state`` and ``comments`` columns.

    Args:
        df: DataFrame containing both columns.

    Returns:
        A new DataFrame with the two columns removed.
    """
    unused_columns = ['state', 'comments']
    return df.drop(columns=unused_columns)

def fill_na(df, col, val):
    """Replace the missing values in one column with ``val``.

    Args:
        df: Source DataFrame.
        col: Name of the column to fill.
        val: Replacement for missing entries.

    Returns:
        A copy of ``df`` with the missing entries of ``col`` filled. ``df``
        itself is not modified.
    """
    # Fill a private copy so the caller's frame keeps its original values.
    filled = df.copy()
    filled[col] = filled[col].fillna(val)
    return filled
    
# Run the cleaning steps in order: normalise gender, filter ages, convert the
# yes/no columns, drop unused columns, then fill missing 'work_interfere'.
df = (
    df
    .pipe(gender_data_cleaning)
    .pipe(age_data_cleaning)
    .pipe(yes_no_to_true_false)
    .pipe(drop_cols)
    .pipe(fill_na, 'work_interfere', 'Never')
)




In [163]:
# General thoughts for project outline
# Maybe let's just focus on releasing the percentage of individuals who have sought treatment for mental illness. That is just 2 counting queries, so it should be easy?
# Maybe based on family history?

In [164]:
# Display the rows whose 'treatment' value equals True.
df.loc[df['treatment'] == True]

Unnamed: 0,Timestamp,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,2014-08-27 11:29:31,37,Female,United States,,False,True,Often,6-25,No,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
3,2014-08-27 11:29:46,31,Male,United Kingdom,,True,True,Often,26-100,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
6,2014-08-27 11:31:50,35,Female,United States,,True,True,Sometimes,1-5,Yes,...,No,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No
8,2014-08-27 11:32:39,42,Female,United States,,True,True,Sometimes,100-500,No,...,No,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No
10,2014-08-27 11:32:44,31,Male,United States,,False,True,Sometimes,6-25,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,No,No,Don't know,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1252,2015-08-20 16:52:09,29,Male,United States,No,True,True,Sometimes,100-500,Yes,...,Yes,Don't know,Yes,No,Some of them,No,No,Maybe,No,No
1254,2015-09-12 11:17:21,26,Male,United Kingdom,No,False,True,Never,26-100,No,...,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No
1255,2015-09-26 01:07:35,32,Male,United States,No,True,True,Often,26-100,Yes,...,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No
1256,2015-11-07 12:36:58,34,Male,United States,No,True,True,Sometimes,More than 1000,No,...,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No,No


In [165]:
# Epsilon DP - Seek Treatment
# Release the fraction of respondents with treatment == True as the ratio of
# two Laplace-noised counts, and report the mean percent error over repeated
# trials.
n_trials = 100
epsilon = 0.1  # total budget; each of the two noisy counts is given epsilon/2

# The true counts do not change between trials, so compute them once.
true_num = df['treatment'].sum()
total_people = df['treatment'].count()
true_ratio = true_num / total_people

error_total = 0
for _ in range(n_trials):
    # Both counts are perturbed with sensitivity 1.
    noisy_num = laplace_mech(true_num, 1, epsilon/2)
    noisy_total = laplace_mech(total_people, 1, epsilon/2)

    error_total += pct_error(true_ratio, noisy_num / noisy_total)

# Mean percent error across all trials.
print(error_total / n_trials)

# The lines below describe the final trial only.
print(f"True: {true_ratio} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
print(f"Error: {pct_error(true_ratio, noisy_num / noisy_total)}")

3.7030403491357076
True: 0.5051958433253397 
Noisy: 0.4894457441117629
True: 632, Noisy: 627.4774062741036
Error: 3.1176224867380737


In [166]:
# Epsilon DP - Seek Treatment with family history
# Same ratio-of-noisy-counts release, restricted to respondents whose
# family_history is True.
n_trials = 100
epsilon = 0.1  # total budget; each of the two noisy counts is given epsilon/2

# The true counts do not change between trials, so compute them once.
true_num = df[(df['treatment'] == True) & (df['family_history'] == True)]['treatment'].count()
total_people = df[(df['family_history'] == True)]['treatment'].count()
true_ratio = true_num / total_people

total_error = 0
for _ in range(n_trials):
    # Both counts are perturbed with sensitivity 1.
    noisy_num = laplace_mech(true_num, 1, epsilon/2)
    noisy_total = laplace_mech(total_people, 1, epsilon/2)

    total_error += pct_error(true_ratio, noisy_num / noisy_total)

# Mean percent error across all trials.
print(total_error / n_trials)

# The lines below describe the final trial only.
print(f"True: {true_ratio} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
# Report the final-trial error, matching the other experiment cells.
print(f"Error: {pct_error(true_ratio, noisy_num / noisy_total)}")

6.984884840478381
True: 0.7402862985685071 
Noisy: 0.6591771669476804
True: 362, Noisy: 336.2876629259282



In [167]:
# Epsilon Delta DP - Seek treatment
# Release the fraction of respondents with treatment == True as the ratio of
# two Gaussian-noised counts, and report the mean percent error over repeated
# trials.
n_trials = 100
epsilon = 0.1  # total epsilon; each of the two noisy counts is given epsilon/2
# NOTE(review): each count is released with the full delta, so under basic
# sequential composition the pair costs (epsilon, 2 * delta) — confirm that
# this is the intended total.
delta = 1e-5

# The true counts do not change between trials, so compute them once.
true_num = df['treatment'].sum()
total_people = df['treatment'].count()
true_ratio = true_num / total_people

error_total = 0
for _ in range(n_trials):
    # Both counts are perturbed with sensitivity 1.
    noisy_num = gaussian_mech(true_num, 1, epsilon/2, delta)
    noisy_total = gaussian_mech(total_people, 1, epsilon/2, delta)

    error_total += pct_error(true_ratio, noisy_num / noisy_total)

# Mean percent error across all trials.
print(error_total / n_trials)


# The lines below describe the final trial only.
print(f"True: {true_ratio} \nNoisy: {noisy_num / noisy_total}")
print(f"True: {true_num}, Noisy: {noisy_num}")
print(f"Error: {pct_error(true_ratio, noisy_num / noisy_total)}")

12.718576750744353
True: 0.5051958433253397 
Noisy: 0.6071117961500977
True: 632, Noisy: 708.5383249639176
Error: 20.17355332021712


In [168]:
# Renyi DP


In [169]:
# Synthetic Data DP