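This notebook preprocesses the raw ACS county dataset and joins it with the county-level voting results, producing the preprocessed data embedded in the 'Counties' notebook (saved here as `county_data.npy` and `democratic_fraction.npy`).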

In [1]:
import pymde
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the counties dataset through PyMDE; the frames used in the rest of this
# notebook are read from attributes of the returned object.
dataset = pymde.datasets.counties()

Mar 19 03:44:40 PM: Load cached dataset.


In [3]:
# X: per-county feature table; y: per-county voting table.
# NOTE(review): these are plain attribute reads, so X and y presumably alias the
# frames held by `dataset` (no copy is made here) -- in-place edits to X or y
# would then also change the dataset's frames.
X = dataset.county_dataframe
y = dataset.voting_dataframe

In [4]:
# Display the raw county feature table (the rendered output elides the middle
# rows and columns).
X

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga County,55036,26899,28137,2.7,75.4,18.9,0.3,...,0.6,1.3,2.5,25.8,24112,74.1,20.2,5.6,0.1,5.2
1,1003,Alabama,Baldwin County,203360,99527,103833,4.4,83.1,9.5,0.8,...,0.8,1.1,5.6,27.0,89527,80.7,12.9,6.3,0.1,5.5
2,1005,Alabama,Barbour County,26201,13976,12225,4.2,45.7,47.8,0.2,...,2.2,1.7,1.3,23.4,8878,74.1,19.1,6.5,0.3,12.4
3,1007,Alabama,Bibb County,22580,12251,10329,2.4,74.6,22.0,0.4,...,0.3,1.7,1.5,30.0,8171,76.0,17.4,6.3,0.3,8.2
4,1009,Alabama,Blount County,57667,28490,29177,9.0,87.4,1.5,0.3,...,0.4,0.4,2.1,35.0,21380,83.9,11.9,4.0,0.1,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,72145,Puerto Rico,Vega Baja Municipio,54754,26269,28485,96.7,3.1,0.1,0.0,...,1.4,0.6,0.9,31.6,14234,76.2,19.3,4.3,0.2,16.8
3216,72147,Puerto Rico,Vieques Municipio,8931,4351,4580,95.7,4.0,0.0,0.0,...,5.0,0.0,1.7,14.9,2927,40.7,40.9,18.4,0.0,12.8
3217,72149,Puerto Rico,Villalba Municipio,23659,11510,12149,99.7,0.2,0.1,0.0,...,2.1,0.0,2.8,28.4,6873,59.2,30.2,10.4,0.2,24.8
3218,72151,Puerto Rico,Yabucoa Municipio,35025,16984,18041,99.9,0.1,0.0,0.0,...,1.4,1.8,0.1,30.5,7878,62.7,30.9,6.3,0.0,25.4


In [5]:
# Positional indexes into X.columns that drive the per-column transforms.
# NOTE(review): the positions appear to count from CountyId = 0 (i.e. the
# leading label in the displayed table is the row index, not a column), which
# would put TotalPop at position 3 -- confirm against X.columns.
totalpop_col_idx = 3

# Columns that are divided by the row's total population.
col_idxes_to_normalize = [4, 5, 31]

# Columns that are replaced by their natural logarithm (total population
# plus positions 12 through 16).
col_idxes_to_log = [totalpop_col_idx, *range(12, 16 + 1)]

# Columns that are divided by 100: positions 6-11, 17-29 and 32-36.
# Position 30 is deliberately in none of the lists.
col_idxes_to_div_by_100 = [
    *range(6, 11 + 1),
    *range(17, 30),
    *range(32, 36 + 1),
]

In [6]:
# Rebuild X from a copy of the dataset's frame so that (a) re-running this cell
# always yields the same result -- the transforms below are not idempotent when
# applied to an already-transformed X -- and (b) the frame held by `dataset`
# is left unmodified.
X = dataset.county_dataframe.copy()
col_names = X.columns

# Translate the positional index lists from the configuration cell into labels.
totalpop_name = col_names[totalpop_col_idx]
normalize_names = [col_names[idx] for idx in col_idxes_to_normalize]
log_names = [col_names[idx] for idx in col_idxes_to_log]
percent_names = [col_names[idx] for idx in col_idxes_to_div_by_100]

# Capture the raw total population BEFORE it is log-transformed below: the
# ratios must divide by the original count, not by its logarithm.
total_pop = X[totalpop_name].astype(float)

# Each transform is applied to a whole column at once instead of writing cell
# by cell with .at inside an iterrows() loop.

# Count columns -> fraction of the row's total population.
for name in normalize_names:
    X[name] = X[name].astype(float) / total_pop

# Heavy-range columns (including total population itself) -> natural log.
for name in log_names:
    X[name] = np.log(X[name].astype(float))

# Columns expressed on a 0-100 scale -> divided by 100.
for name in percent_names:
    X[name] = X[name].astype(float) / 100.0

In [7]:
# Divide the MeanCommute column by 60 (presumably minutes -> hours; confirm the
# unit of the raw column). Done as one vectorized column operation rather than
# a per-row iterrows()/.at loop that writes to the frame being iterated.
# Note: this rescales X in place of its previous values, so running the cell
# twice divides by 60 twice.
X["MeanCommute"] = X["MeanCommute"] / 60.

In [8]:
# Give the voting table's "combined_fips" column the name "CountyId" so it can
# serve as the merge key against X. `y` is rebound to the renamed copy
# instead of renaming in place, so the frame that `y` was taken from is not
# modified.
y = y.rename(columns={"combined_fips": "CountyId"})

In [9]:
# Join features and voting results on the shared key. The join is an inner
# join (pandas' default, spelled out here), so only CountyId values present in
# both tables survive.
Xy = pd.merge(X, y, how="inner", on="CountyId")

In [10]:
# Columns of the merged table that are neither features nor targets below.
drop_cols = [
    "Unnamed: 0",
    "state_abbr",
    "county_name",
    "per_point_diff",
    "diff",
    "total_votes",
]
# Removes the columns from Xy in place; re-running this cell after it has
# succeeded raises KeyError because the labels are already gone.
Xy.drop(labels=drop_cols, axis="columns", inplace=True)

In [11]:
# Vote counts/shares plus the identifying columns: everything that must not
# end up in the numeric feature matrix.
y_many_cols = [
    "votes_dem", "votes_gop",
    "per_dem", "per_gop",
    "CountyId", "State", "County",
]

# Independent copy of the target/identifier columns.
y_many = Xy.loc[:, y_many_cols].copy()

# Whatever remains in Xy is the feature table.
X = Xy.drop(labels=y_many_cols, axis="columns")

In [12]:
# Standardize the feature columns with scikit-learn's `scale`.
# X changes type here, from a DataFrame to a NumPy array, so this cell cannot
# be re-run without first re-running the cell that builds the feature table.
feature_matrix = X.to_numpy()
X = preprocessing.scale(feature_matrix)

In [13]:
# Binary label per row as a 1-D int32 array: 1 where the Democratic vote count
# is at least the GOP vote count (ties count as 1), otherwise 0.
dem_at_least_gop = y_many["votes_gop"] <= y_many["votes_dem"]
y = dem_at_least_gop.to_numpy(dtype=np.int32)

In [14]:
# The "per_dem" column as a 1-D float64 array.
y_prop = y_many["per_dem"].to_numpy(dtype=np.float64)

In [15]:
# Write the two arrays to the current working directory: the per-row
# "per_dem" values and the scaled feature matrix.
np.save(file='democratic_fraction.npy', arr=y_prop)
np.save(file='county_data.npy', arr=X)