# 1a. Survey Cleaning - US 
US survey data come from the 2019 American Community Survey (ACS), obtained via Folktables (https://github.com/socialfoundations/folktables).

In [1]:
import json
import numpy as np
import pandas as pd
import geopandas as gpd
import json

In [2]:
# --- Inputs read by this notebook ---
SHAPEFILE_FNAME = '/data/mosaiks/shapefiles/us_pumas/pumas.shp'
POPULATION_DENSITY_FNAME = '/data/mosaiks/surveys/us/population_density_by_puma.csv'
HH_SURVEY_FNAME = '/data/mosaiks/surveys/us/household_2019.csv'

# --- Output written by this notebook (per-PUMA merged table) ---
GROUPED_OUT_FNAME = '/data/mosaiks/replication/surveys/us/groundtruth_by_puma_2019.csv'

In [3]:
# Load the state lookup tables from JSON and derive the inverse of each one.
# (Going by the names: state -> abbreviation and state -> code.)
with open('/data/mosaiks/surveys/us/us_state_to_abbrev.json', 'r') as abbrev_file:
    state_to_abbrev = json.load(abbrev_file)
# Swap keys and values; if two states shared a value, the later one would win.
abbrev_to_state = {abbrev: state for state, abbrev in state_to_abbrev.items()}

with open('/data/mosaiks/surveys/us/us_state_to_code.json', 'r') as code_file:
    state_to_code = json.load(code_file)
code_to_state = {code: state for state, code in state_to_code.items()}

In [4]:
def grouped_weighted_mean(df, agg_cols, feature_cols, weight_col):
    """Compute weighted means of ``feature_cols`` within groups of ``agg_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input records. It is not modified.
    agg_cols : list
        Column names that define the groups.
    feature_cols : list
        Numeric column names to average.
    weight_col : str
        Column holding each record's weight.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``agg_cols + feature_cols + [weight_col]``.
        Each feature column holds ``sum(feature * weight) / sum(weight)`` taken
        over the rows where that feature is not missing. ``weight_col`` holds
        the total weight of all rows in the group.

    Notes
    -----
    * Rows with a missing feature are left out of both the numerator and the
      denominator for that feature, so missing values no longer pull the mean
      toward zero. A group with no usable rows for a feature yields NaN.
    * Only the columns that are needed are copied and summed, so unrelated
      (possibly non-numeric) columns in ``df`` are never aggregated.
    """
    agg_cols = list(agg_cols)
    feature_cols = list(feature_cols)

    # Restrict to the needed columns before copying: wide survey frames carry
    # many unrelated columns that would otherwise be copied and summed.
    data = df[agg_cols + feature_cols + [weight_col]].copy()

    denominator_cols = {}
    for feature_col in feature_cols:
        # Per-feature denominator: the weight only where the feature is present.
        denominator_col = '__weight_for_{}'.format(feature_col)
        data[denominator_col] = data[weight_col].where(data[feature_col].notna())
        data[feature_col] = data[feature_col] * data[weight_col]
        denominator_cols[feature_col] = denominator_col

    data = data.groupby(agg_cols, as_index=False).agg('sum')

    for feature_col in feature_cols:
        data[feature_col] = data[feature_col] / data[denominator_cols[feature_col]]

    return data[agg_cols + feature_cols + [weight_col]]

In [5]:
# Load the population density table, keeping only the four columns used here
# and renaming them so the keys (STATE, PUMA) match the household data.
print('Population density data')
print('------------------------')
density_column_names = {
    'state': 'STATE',
    'pumanum': 'PUMA',
    'simpledensity': 'pop_density',
    'pumapop': 'population',
}
pop_density = (
    pd.read_csv(POPULATION_DENSITY_FNAME)[list(density_column_names)]
    .rename(columns=density_column_names)
)
# Flag a PUMA as rural (1) when its density is below 500, else 0.
# NOTE(review): the density units are not visible here -- confirm the threshold.
pop_density['rural'] = pop_density['pop_density'].lt(500).astype('int')
print('PUMAs with data: %i' % len(pop_density))

Population density data
------------------------
PUMAs with data: 2351


In [6]:
# Load the household records, then keep only rows with a positive weight
# (WGTP) and a non-missing target (FINCP), reporting the count at each step.
print('Household data')
print('---------------')
hh = pd.read_csv(HH_SURVEY_FNAME)
print('Starting number of observations: %i' % len(hh))
hh = hh.loc[hh['WGTP'].gt(0)]
print('Observations with >0 weight: %i' % len(hh))
hh = hh.loc[hh['FINCP'].notna()]
print('Observations with non-null target: %i' % len(hh))

# WGTP-weighted mean of FINCP for every (PUMA, STATE) pair
hh_grouped = grouped_weighted_mean(
    df=hh,
    agg_cols=['PUMA', 'STATE'],
    feature_cols=['FINCP'],
    weight_col='WGTP',
)
print('Regions with data: %i' % len(hh_grouped))

Household data
---------------
Starting number of observations: 1534367
Observations with >0 weight: 1384937
Observations with non-null target: 834979
Regions with data: 2331


In [7]:
# Derive an area for every PUMA polygon in the shapefile.
# The geometries are reprojected to PROJ's 'cea' projection before measuring,
# and the measured area is divided by 10**6.
# NOTE(review): presumably square metres -> square kilometres; confirm the CRS units.
puma_shapes = gpd.read_file(SHAPEFILE_FNAME).to_crs({'proj': 'cea'})
scaled_area = puma_shapes['geometry'].area / (10**6)
# Keep only the merge keys and the derived area.
shapefile = puma_shapes.assign(Area=scaled_area)[['PUMA', 'State', 'Area']]

  return _prepare_from_string(" ".join(pjargs))


In [9]:
# Combine the three per-PUMA sources and write the result to disk.
print('Merged data')
print('-----------')

# 1) Household aggregates + population density, matched on (STATE, PUMA).
hh_with_density = pd.merge(hh_grouped, pop_density, how='inner', on=['STATE', 'PUMA'])

# 2) Translate each STATE value through abbrev_to_state into the 'State'
#    column that the shapefile table is keyed on. An unknown value raises KeyError.
hh_with_density['State'] = hh_with_density['STATE'].map(abbrev_to_state.__getitem__)

# 3) Attach the area; inner joins keep only PUMAs present in every source.
puma_statistics = pd.merge(hh_with_density, shapefile, how='inner', on=['State', 'PUMA'])

puma_statistics.to_csv(GROUPED_OUT_FNAME, index=False)
print('PUMAs with data: %i' % len(puma_statistics))

Merged data
-----------
PUMAs with data: 2331
