In [12]:
import pandas as pd
import numpy as np
import random

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

old_data = pd.read_csv("anes_timeseries_2020_csv_20220210.csv",usecols=["V201510","V201600","V201029","V201033","V201507x"])
data = pd.read_csv("anes_timeseries_2020_csv_20220210.csv",usecols=["V201510","V201600","V201029","V201033","V201507x"])

In [2]:
def preprocc(data):

    #transforming highest level of education into a y/n

    data["college"] = np.where(data["V201510"]>=4,1,0)

    #binning ages into: 18-25, 26-34, 35-46, 47-65, 65-79, 80+, and drop NR values

    data = data.drop(data[data['V201507x'] == -9].index)
    data["age_bins"]=pd.cut(x=data["V201507x"],bins=[17,25,34,46,65,79,81],labels=[1,2,3,4,5,6])

    #sex is as it is, 1=M, 2=F, drop NR values

    data = data.drop(data[data['V201600'] == -9].index)
    data = data.rename(columns={"V201600":"sex"})

    #V201029 contains who the person voted for, V201033 contains who they plan to vote for
    #since we are only concerned about modelling a two-way fight, we discard rows where neither of the 2 values are 1/2 (Biden/Trump)

    biden_condlist, biden_cholist = [data["V201029"]==1, data["V201033"]==1], [1,1]
    trump_condlist, trump_cholist = [data["V201029"]==2, data["V201033"]==2], [1,1]
    data["vote_biden"] = np.select(biden_condlist, biden_cholist,default=0)
    data["vote_trump"] = np.select(trump_condlist, trump_cholist,default=0)
    #having obtained voter choices, we map 1 to Biden and 2 to Trump
    voter_choice, vote = [data["vote_biden"]==1,data["vote_trump"]==1], [1,2]
    data["vote"] = np.select(voter_choice,vote,default=0)
    #removing votes not cast for Biden or Trump
    data = data.drop(data[data['vote'] == 0].index)

    #removing excess columns
    data = data.drop(["V201510","V201507x","V201029","V201033","vote_biden","vote_trump"],axis=1)

    return data

In [3]:
new_data = preprocc(data)
working_data = new_data

In [4]:
sexvals = [1,2]
colvals = [0,1]
agevals = [1,2,3,4,5,6]
agebins = ["18-25","26-34","35-46","47-65","65-79","80+"]

dp_distr = []

for i in sexvals:
    for j in colvals:
        for k in agevals:
            data_strata = working_data[(working_data["sex"]==i) & (working_data["college"]==j) & (working_data["age_bins"]==k)]
            num_strata = data_strata.shape[0]
            counts = np.array(data_strata.vote.value_counts())
            dp_counts = [laplace_mech(c, 1, 1) for c in counts]
            dp_probs = dp_counts/np.sum(dp_counts)

            dp_distr.append([i,j,agebins[k-1],dp_probs,num_strata])

In [5]:
def generate_equallylikely(n):
    output = []
    count = int(n/(len(sexvals)*len(colvals)*len(agevals)))

    random.shuffle(dp_distr)

    for i in dp_distr[:-1]:
        random_votes_stra = np.random.choice([1,2],count,p=i[3])
        for j in random_votes_stra:
            output.append([i[0],i[1],i[2],j])
    

    for i in [dp_distr[-1]]:
        new_count = n-len(output)
        random_votes_stra = np.random.choice([1,2],new_count,p=i[3])
        for j in random_votes_stra:
            output.append([i[0],i[1],i[2],j])
    
    random.shuffle(output)
    return pd.DataFrame(np.array(output), columns = ["sex","college","age","vote"])

In [6]:
def generate_originalproportions(n):
    output = []

    random.shuffle(dp_distr)
    total = np.array(dp_distr).T[4].sum()
    print(total)

    for i in dp_distr[:-1]:
        count = int(n*i[4]/total)
        print(count,n,i[4],total)
        random_votes_stra = np.random.choice([1,2],count,p=i[3])
        for j in random_votes_stra:
            output.append([i[0],i[1],i[2],j])
    
    for i in [dp_distr[-1]]:
        new_count = n-len(output)
        #print(new_count,n,i[3],total)
        random_votes_stra = np.random.choice([1,2],new_count,p=i[3])
        for j in random_votes_stra:
            output.append([i[0],i[1],i[2],j])

    random.shuffle(output)
    return pd.DataFrame(np.array(output), columns = ["sex","college","age","vote"])

In [7]:
res_ogp = generate_originalproportions(50000)
res_eql = generate_equallylikely(50000)

  total = np.array(dp_distr).T[4].sum()


6893
2089 50000 288 6893
696 50000 96 6893
3191 50000 440 6893
1929 50000 266 6893
863 50000 119 6893
2451 50000 338 6893
3271 50000 451 6893
906 50000 125 6893
710 50000 98 6893
3460 50000 477 6893
5759 50000 794 6893
486 50000 67 6893
3358 50000 463 6893
1581 50000 218 6893
616 50000 85 6893
1088 50000 150 6893
3764 50000 519 6893
935 50000 129 6893
2727 50000 376 6893
1639 50000 226 6893
790 50000 109 6893
456 50000 63 6893
4613 50000 636 6893


In [13]:
old_data.head(5)

Unnamed: 0,V201029,V201033,V201507x,V201510,V201600
0,-1,2,46,6,1
1,-1,3,37,3,2
2,-1,1,40,2,2
3,-1,1,41,4,1
4,-1,2,72,8,1


In [43]:
working_data.head(5)
synth_data = []

In [45]:
#sex=2: female, college=1: college-educated, age_bins=1: age between 18-25

data_strata = working_data[(working_data["sex"]==2) & (working_data["college"]==1) & (working_data["age_bins"]==1)]
num_strata = data_strata.shape[0]

counts = np.array(data_strata.vote.value_counts())
dp_counts = [laplace_mech(c, 1, 1) for c in counts]
dp_probs = dp_counts/np.sum(dp_counts)


print(counts)
print(dp_counts)
print(dp_probs)

print("Synthesizing 5 values..")
for i in range(5):
    synth_data.append((2,1,1,np.random.choice([1,2],1,p=dp_probs)[0]))

print("(sex, college, age_bins, vote)")
for i in range(5):
    print(synth_data[i])

[73 25]
[75.29800596105547, 25.507619171088553]
[0.74696234 0.25303766]
Synthesizing 5 values..
(sex, college, age_bins, vote)
(2, 1, 1, 1)
(2, 1, 1, 1)
(2, 1, 1, 1)
(2, 1, 1, 2)
(2, 1, 1, 1)
