### Aggregating synthpop synthetic populations to tract level
1. read in individual csv files containing synthetic populations
2. combine into one large dataframe
3. group by tract to aggregate counts
4. compute new implied variable
5. export as csv to join to tract boundaries in gis

In [1]:
import pandas as pd
import csv
import numpy as np
# library for path names
import glob 
from functools import reduce
pd.options.display.max_columns = 40

In [2]:
# function to compiling dataframe from folder of individual csv files
def compile_df(pathfile_name):
    li = []
    all_files = glob.glob(pathfile_name)
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0, converters={'tract': lambda x: str(x)})
        li.append(df)
    return(pd.concat(li, axis=0, ignore_index=True))

### Aggregating household-level synthetic population

In [3]:
## creating household level variables/tables
households_meck_comp = compile_df("data_outputs/20190323_run/*households.csv")
households_meck_comp.to_csv("data_outputs/20190323_compiled/households_meck.csv")

# converting columns to correct data type
households_meck = pd.read_csv("data_outputs/20190323_compiled/households_meck.csv", 
                       header=0, dtype={'puma00':str, 'puma10':str, 'NP':int,
                                        'TYPE':int,'race_of_head':str,
                                        'hispanic_head':str, 'age_of_head':int,
                                        'hh_age_of_head':str, 'hh_cars':str,
       'hh_children':str, 'hh_income':str, 'hh_race_of_head':str, 'hh_size':str, 'hh_workers':str,
       'seniors':str, 'sf_detached':str, 'tenure_mover':str,'state':str, 'county':str,
       'tract':str, 'block group':str  
                       })
## dropping variables we are not using
households_var = households_meck.drop(columns=['Unnamed: 0.1', 'serialno', 'RT', 'puma00', 'puma10',
       'NP', 'TYPE', 'BLD', 'TEN', 'VEH', 'HINCP', 'MV', 'R18', 'R65',
       'race_of_head', 'hispanic_head', 'age_of_head', 'workers',
       'hh_age_of_head', 'hh_children', 'hh_race_of_head',
       'hh_size', 'hh_workers', 'seniors', 'sf_detached', 'tenure_mover',
       'cat_id', 'state', 'county', 'block group'])

## aggregating households by cars and tract
households_meck_tract_cars = households_var.groupby(["tract","hh_cars"]).count()
households_meck_tract_income = households_var.groupby(["tract","hh_income"]).count()
households_meck_tract_tot = households_var.groupby(["tract"]).count()

# resetting index and unstacking to make a wide data frame
house_cars = households_meck_tract_cars.reset_index().pivot(index='tract', columns='hh_cars', values ='Unnamed: 0').fillna(0)
house_income = households_meck_tract_income.reset_index().pivot(index='tract', columns='hh_income', values ='Unnamed: 0').fillna(0) 
house_tot = households_meck_tract_tot.drop(columns={'hh_cars','hh_income'}).rename(columns={'Unnamed: 0':'total'})

# imputed variables
# house_impute = households_var.groupby(["tract","hh_cars","hh_income"]).count()
# house_impute = house_impute.reset_index()
# nocar_lt30 = house_impute.loc[(house_impute['hh_cars'] =='none')&(house_impute['hh_income']=='lt30')]
# nocar_lt30 = nocar_lt30.drop(columns=['hh_cars','hh_income']).rename(columns={'Unnamed: 0':'nocar_lt30'})

# households_synth = house_cars.merge(house_income,how='outer', on="tract")
households = [house_tot,house_income,house_cars]
households_synth = reduce(lambda left,right: pd.merge(left,right,on="tract"),households)


##adding prefix to columns
households_synth.columns = ['s_' + str(col) for col in households_synth.columns]



### Aggregating person-level synthetic population

In [6]:
# person level data
persons_meck_comp = compile_df("data_outputs/20190323_run/*persons.csv")
persons_meck_comp.to_csv("data_outputs/20190323_compiled/persons_meck.csv")
persons_meck = pd.read_csv("data_outputs/20190323_compiled/persons_meck.csv",
                                 header=0, dtype = {'AGEP':int,
       'RELP':int, 'SEX':str,'HISP':str, 'RAC1P':str,
       'hispanic':str, 'person_age':str, 'person_sex':str, 'race':str,
       'state':str, 'county':str, 'tract':str, 'block group':str})

# # aggregating for total pop person level data
persons_var = persons_meck.drop(columns=['Unnamed: 0.1', 'serialno', 'SPORDER', 'puma00', 'puma10',
       'AGEP', 'JWTR', 'RELP', 'SCH', 'SCHL', 'SEX', 'WKHP', 'ESR', 'HISP',
       'PERNP', 'RAC1P',
       'cat_id', 'hh_id', 'state', 'county', 'block group'])
persons_tract_tot = persons_var.groupby(["tract"]).count()
persons_tract_age = persons_var.groupby(["tract","person_age"]).count()
persons_tract_sex = persons_var.groupby(["tract","person_sex"]).count()
persons_tract_race = persons_var.groupby(["tract","race"]).count()
persons_tract_hisp = persons_var.groupby(["tract","hispanic"]).count()


## resetting index and unstacking to make a wide data frame
persons_tract_tot = persons_tract_tot.drop(columns = ['hispanic','person_age','person_sex',
                                  'race']).rename(columns={'Unnamed: 0':'total'})
persons_tract_age = persons_tract_age.reset_index().pivot(index='tract',columns="person_age", values="Unnamed: 0").dropna(0)
persons_tract_sex = persons_tract_sex.reset_index().pivot(index='tract',columns="person_sex", values="Unnamed: 0").dropna(0)
persons_tract_race = persons_tract_race.reset_index().pivot(index='tract',columns="race", values="Unnamed: 0").dropna(0)
persons_tract_hisp = persons_tract_hisp.reset_index().pivot(index='tract',columns="hispanic",values="Unnamed: 0").dropna(0)

# Calculating imputed column
persons_impute = persons_var.groupby(["tract","person_sex","person_age"]).count()
persons_impute = persons_impute.reset_index()
women_above60 = persons_impute.loc[(persons_impute['person_sex'] =='female')&(persons_impute['person_age']=='above 60')]
women_above60 = women_above60.drop(columns=['person_sex','person_age','Unnamed: 0', 'hispanic']).rename(columns={'race':'women60'})
women_above60 = women_above60.set_index('tract')
persons = [persons_tract_tot,persons_tract_age, persons_tract_sex, persons_tract_race,persons_tract_hisp,women_above60]
persons_synth = reduce(lambda left,right: pd.merge(left,right,on="tract"),persons)

## adding prefix to columns
persons_synth.columns = ['s_' + str(col) for col in persons_synth.columns]
persons_synth= persons_synth.rename(columns = {"s_no":"s_nonhispanic","s_yes":"s_hispanic"})

### Combining with ACS estimates and Margins of error

In [7]:
# loading in acs estimates
ACS_people = pd.read_csv("data_outputs/20190330_census_aggregates/37119_people_meck.csv", converters={'tract': lambda x: str(x)})
ACS_households = pd.read_csv("data_outputs/20190330_census_aggregates/37119_households_meck.csv", converters={'tract': lambda x: str(x)})

In [8]:
# merging acs estimates with synthetics
people = ACS_people.merge(persons_synth,how='outer',on='tract')
households = ACS_households.merge(households_synth, how='outer',
                                  on='tract')

In [9]:
# creating a geo id to assist with gis table join 
people['geoid'] = '37119' + people['tract'].astype(str)
households['geoid'] = '37119' + households['tract'].astype(str)

In [10]:
people.columns
# households.columns

Index(['tract', '19 and under', '19 and under_me', '20 to 35', '20 to 35_me',
       '35 to 60', '35 to 60_me', 'above 60', 'above 60_me', 'women_60',
       'women_60_me', 'nonhispanic', 'nonhispanic_me', 'hispanic',
       'hispanic_me', 'total', 'total_me', 'asian', 'asian_me', 'black',
       'black_me', 'other', 'other_me', 'white', 'white_me', 'female',
       'female_me', 'male', 'male_me', 's_total', 's_19 and under',
       's_20 to 35', 's_35 to 60', 's_above 60', 's_female', 's_male',
       's_asian', 's_black', 's_other', 's_white', 's_nonhispanic',
       's_hispanic', 's_women60', 'geoid'],
      dtype='object')

In [11]:
# computing people difference columns
people['d_19und']= people['s_19 and under']-people['19 and under']
people['d_202035']= people['s_20 to 35'] - people['20 to 35'] 
people['d_35to60']= people['s_35 to 60'] - people['35 to 60']
people['d_60up']= people['s_above 60']- people['above 60']
people['d_hispanic']= people['s_hispanic']-people['hispanic']
people['d_total']= people['s_total']- people['total']
people['d_white']= people['s_white']-people['white']
people['d_asian']= people['s_asian']-people['asian']
people['d_black']= people['s_black']-people['black']
people['d_other']= people['s_other']-people['other']
people['d_female']= people['s_female']-people['female']
people['d_male']= people['s_male']-people['male']
people['d_total_moe']= people['total_me'] - abs(people['d_total'])
people['d_women60'] = people['s_women60']-people['women_60']
people['d_women60_moe'] = abs(people['d_women60'])-people['women_60_me']

In [12]:
# computing household difference columns
households['d_none_car']= households['s_none']-households['none']
households['d_one_car']=households['s_one']-households['one']
households['d_twomore_car']=households['s_two or more']-households['two or more']
households['d_totalhh']= households['s_total']-households['total']
households['d_lt30']=households['s_lt30']-households['lt30']
households['d_gt30-lt60']=households['s_gt30-lt60']-households['gt30-lt60']
households['d_gt60-lt100']=households['s_gt60-lt100']-households['gt60-lt100']
households['d_gt100-lt150']=households['s_gt100-lt150']-households['gt100-lt150']
households['d_totalhh_moe']= abs(households['d_totalhh'])- households['total_me']

In [13]:
# exporting combined tables to csv for GIS work
people.to_csv("data_outputs/20190330_ACS_SyntheticsForGIS/37119_people_var_tract.csv")
households.to_csv("data_outputs/20190330_ACS_SyntheticsForGIS/37119_households_var_tract.csv")

In [14]:
households.columns

Index(['tract', 'none', 'none_me', 'one', 'one_me', 'two or more',
       'two or more_me', 'total', 'total_me', 'gt100-lt150', 'gt100-lt150_me',
       'gt150', 'gt150_me', 'gt30-lt60', 'gt30-lt6_me', 'gt60-lt100',
       'gt60-lt100_me', 'lt30', 'lt30_me', 's_total', 's_gt100-lt150',
       's_gt150', 's_gt30-lt60', 's_gt60-lt100', 's_lt30', 's_none', 's_one',
       's_two or more', 'geoid', 'd_none_car', 'd_one_car', 'd_twomore_car',
       'd_totalhh', 'd_lt30', 'd_gt30-lt60', 'd_gt60-lt100', 'd_gt100-lt150',
       'd_totalhh_moe'],
      dtype='object')

In [15]:
# creating abridged datasets
people['d_abs_tot'] = abs(people['d_total'])
people['d_abs_women60'] = abs(people['d_women60'])

households['d_abs_tothh'] = abs(households['d_totalhh'])

people_abridged = people.drop(columns=['19 and under', '19 and under_me', '20 to 35', '20 to 35_me',
       '35 to 60', '35 to 60_me', 'above 60', 'above 60_me','nonhispanic', 'nonhispanic_me', 'hispanic',
       'hispanic_me', 'asian', 'asian_me', 'black',
       'black_me', 'other', 'other_me', 'white', 'white_me', 'female',
       'female_me', 'male', 'male_me', 's_19 and under',
       's_20 to 35', 's_35 to 60', 's_above 60', 's_female', 's_male',
       's_asian', 's_black', 's_other', 's_white', 's_nonhispanic',
       's_hispanic',  'd_19und', 'd_202035', 'd_35to60',
       'd_60up', 'd_hispanic', 'd_white', 'd_asian', 'd_black',
       'd_other', 'd_female', 'd_male'])

households_abridged = households.drop(columns = ['none', 'none_me', 'one', 'one_me', 'two or more',
       'two or more_me','gt100-lt150', 'gt100-lt150_me',
       'gt150', 'gt150_me', 'gt30-lt60', 'gt30-lt6_me', 'gt60-lt100',
       'gt60-lt100_me', 'lt30', 'lt30_me','s_gt100-lt150',
       's_gt150', 's_gt30-lt60', 's_gt60-lt100', 's_lt30', 's_none', 's_one',
       's_two or more','geoid', 'd_none_car', 'd_one_car', 'd_twomore_car','d_lt30', 
                                                 'd_gt30-lt60', 'd_gt60-lt100', 'd_gt100-lt150'])
households_abridged = households_abridged.rename(columns = {'total':'tothh', 'total_me':'tothh_me','s_total':'s_tothh','d_totalhh_moe':'d_tothh_ae_me'})
people_abridged = people_abridged.rename(columns = {'d_total_moe':'d_tot_ae_me', 'd_women60_moe':'d_women60_ae_me' })

synthetic_people_households = people_abridged.merge(households_abridged, how='outer', on='tract')

In [16]:
# exporting combined tables to csv for GIS work
people_abridged.to_csv("data_outputs/20190408_ACS_SyntheticsForGIS/37119_people_var_tract.csv")
households_abridged.to_csv("data_outputs/20190408_ACS_SyntheticsForGIS/37119_households_var_tract.csv")
synthetic_people_households.to_csv("data_outputs/20190408_ACS_SyntheticsForGIS/37119_people_households_var_tract.csv")

In [17]:
synthetic_people_households 

Unnamed: 0,tract,women_60,women_60_me,total,total_me,s_total,s_women60,geoid,d_total,d_tot_ae_me,d_women60,d_women60_ae_me,d_abs_tot,d_abs_women60,tothh,tothh_me,s_tothh,d_totalhh,d_tothh_ae_me,d_abs_tothh
0,000100,129,170,4931,392,4017.0,215.0,37119000100,-914.0,-522.0,86.0,-84.0,914.0,86.0,2825,243,2820.0,-5.0,-238.0,5.0
1,000300,27,83,645,85,614.0,44.0,37119000300,-31.0,54.0,17.0,-66.0,31.0,17.0,481,48,481.0,0.0,-48.0,0.0
2,000400,50,112,2640,194,2416.0,90.0,37119000400,-224.0,-30.0,40.0,-72.0,224.0,40.0,1764,129,1762.0,-2.0,-127.0,2.0
3,000500,163,210,4982,422,3940.0,203.0,37119000500,-1042.0,-620.0,40.0,-170.0,1042.0,40.0,2495,176,2492.0,-3.0,-173.0,3.0
4,000600,149,183,2872,247,2575.0,116.0,37119000600,-297.0,-50.0,-33.0,-150.0,297.0,33.0,1569,105,1567.0,-2.0,-103.0,2.0
5,000700,75,146,813,83,680.0,46.0,37119000700,-133.0,-50.0,-29.0,-117.0,133.0,29.0,359,44,359.0,0.0,-44.0,0.0
6,000800,108,120,2892,313,2375.0,123.0,37119000800,-517.0,-204.0,15.0,-105.0,517.0,15.0,931,56,929.0,-2.0,-54.0,2.0
7,000900,170,137,1768,204,1502.0,169.0,37119000900,-266.0,-62.0,-1.0,-136.0,266.0,1.0,715,58,713.0,-2.0,-56.0,2.0
8,001000,167,145,2563,168,2452.0,163.0,37119001000,-111.0,57.0,-4.0,-141.0,111.0,4.0,1192,63,1189.0,-3.0,-60.0,3.0
9,001100,86,104,2269,201,2074.0,104.0,37119001100,-195.0,6.0,18.0,-86.0,195.0,18.0,932,68,930.0,-2.0,-66.0,2.0


In [18]:
households_abridged

Unnamed: 0,tract,tothh,tothh_me,s_tothh,d_totalhh,d_tothh_ae_me,d_abs_tothh
0,000100,2825,243,2820.0,-5.0,-238.0,5.0
1,000300,481,48,481.0,0.0,-48.0,0.0
2,000400,1764,129,1762.0,-2.0,-127.0,2.0
3,000500,2495,176,2492.0,-3.0,-173.0,3.0
4,000600,1569,105,1567.0,-2.0,-103.0,2.0
5,000700,359,44,359.0,0.0,-44.0,0.0
6,000800,931,56,929.0,-2.0,-54.0,2.0
7,000900,715,58,713.0,-2.0,-56.0,2.0
8,001000,1192,63,1189.0,-3.0,-60.0,3.0
9,001100,932,68,930.0,-2.0,-66.0,2.0
