## Notebook to compile data for geographies in the NYC Metro from Census redistricting files

#### Requires the Census zip file for each state to be stored locally, extracted to .pl format, and placed in the data folder directory

In [21]:
import pandas as pd
import numpy as np
import os
import glob

In [22]:
from r_codes import geo_col,col1,col2,col3
from geo import stco_fips,metro_codes,sub_7,sub_lbl

In [23]:
# To pull different variables, edit the data columns listed in col_data.

# Column-header list for each file segment, keyed by the single character
# that the loading code reads from position -8 of the file path.
col_head = {
    'o': geo_col,
    '1': col1,
    '2': col2,
    '3': col3,
}

# Key columns used to join the file segments together.
col_join = ['LOGRECNO', 'STUSAB', 'FILEID', 'CHARITER']

# Columns kept after the segments are joined (short names are in col_recode).
col_data = [
    'STATE', 'COUNTY', 'GEOCODE', 'NAME', 'SUMLEV',
    'P0010001', 'P0020002', 'P0020003', 'P0020005', 'P0020006',
    'P0020007', 'P0020008', 'P0020009', 'P0020010', 'P0020011',
    'P0040001', 'H0010001', 'H0010002', 'H0010003',
]

# SUMLEV values retained when each state table is reduced.
sumlev = [40, 50, 60, 160, 140]

In [24]:
# Short output names for the columns kept from the census files.
# Four source columns deliberately share the 'P_Other' label: once renamed,
# the notebook groups columns by name and sums them, so they collapse into
# a single total.
col_recode = {
    'GEOCODE': 'id',
    'NAME': 'name',
    'P0010001': 'P_Tot',
    'P0020002': 'P_Hisp',
    'P0020003': 'P_NonHisp',
    'P0020005': 'P_White',
    'P0020006': 'P_Black',
    'P0020007': 'P_Other',
    'P0020008': 'P_Asian',
    'P0020009': 'P_Other',
    'P0020010': 'P_Other',
    'P0020011': 'P_Other',
    'H0010001': 'H_Tot',
    'P0040001': 'P_18p',
    'H0010002': 'H_Occ',
    'H0010003': 'H_Vac',
}

In [25]:
# set directory
# glob returns matches in arbitrary order; sort them so the state folders are
# always processed (and their rows stacked) in the same order on every run.
folders = sorted(glob.glob('../data/red_20/*'))

In [26]:
# Crosswalk that is joined to the tract rows (on 'ct_id') when the NYC
# sub-borough table is built.
nyc_recode = pd.read_csv(filepath_or_buffer='../data/geo/nyc_subbor_20.csv')

# Crosswalk that is joined to the subplace rows (on 'geoid20') to produce the
# adjusted subplace table used for comparisons over time.
subpl_recode = pd.read_csv(filepath_or_buffer='../data/geo/subpl20.csv')

### data pulling functions for regional data

In [27]:
def _read_pl_segment(path):
    """Read one pipe-delimited .pl file and label its columns.

    The character eight places from the end of the path selects which header
    list in col_head applies to the file.
    """
    segment = pd.read_table(path, sep='|', header=None, low_memory=False)
    segment.columns = col_head[path[-8]]
    return segment


def make_state(state):
    """Build the reduced redistricting table for one state.

    Every .pl file found in '../data/red_20/{state}2020.pl/' is read, given
    its column headers, and inner-joined to the others on col_join. The
    result is then cut down to the col_data columns, the summary levels in
    sumlev, and the GEOCODE values listed in metro_codes.

    Parameters
    ----------
    state : str
        Text placed in front of '2020.pl' to form the state folder name.

    Returns
    -------
    pandas.DataFrame
        One row per retained geography, with the col_data columns.

    Raises
    ------
    FileNotFoundError
        If no .pl files are found for the state (previously this surfaced
        as an uninformative IndexError).
    KeyError
        If a file's selector character has no entry in col_head.
    """
    # glob order is arbitrary; sort so the files are always merged in the
    # same order.
    files = sorted(glob.glob(f'../data/red_20/{state}2020.pl/*.pl'))
    if not files:
        raise FileNotFoundError(
            f"no .pl files found in '../data/red_20/{state}2020.pl/'"
        )

    # start from the first file, then join every other file onto it
    df = _read_pl_segment(files[0])
    for file in files[1:]:
        dff = _read_pl_segment(file)
        df = pd.merge(df, dff, left_on=col_join, right_on=col_join, how="inner")

    # reduce table size
    df = df[col_data]  # just the data columns we need
    df = df[df.SUMLEV.isin(sumlev)].copy()  # just the geo types we need
    df = df[df.GEOCODE.isin(metro_codes)].copy()  # just the places in our region
    return df

In [28]:
def make_table(folders):
    """Stack the reduced tables of every state folder into one table.

    Parameters
    ----------
    folders : iterable of str
        Paths of the state folders. The first two characters of each
        folder's name are passed to make_state.

    Returns
    -------
    pandas.DataFrame
        The per-state tables concatenated in the order given, or an empty
        DataFrame when folders is empty.
    """
    frames = []
    for folder in folders:
        # Take the state code from the folder name itself rather than from a
        # fixed character offset, so it does not depend on the parent
        # directory's length.
        state = os.path.basename(os.path.normpath(folder))[:2]
        frames.append(make_state(state))

    if not frames:
        return pd.DataFrame()
    # a single concat avoids re-copying the growing table on every iteration
    return pd.concat(frames)

### make initial table, clean up each geo level for master

In [29]:
# Master regional table (counties, munis, and NYC tracts), built by stacking
# the reduced table of every state folder.
df = make_table(folders=folders)

In [30]:
df = df.rename(columns=col_recode)
# Several source columns now share one label (e.g. 'P_Other'); add them
# together so each label appears once. DataFrame.groupby(axis=1) is
# deprecated since pandas 2.1, so group the transposed frame by its row
# labels and transpose back, as the pandas documentation recommends.
df = df.T.groupby(level=0).sum().T
# zero-pad the codes to fixed widths (2 for STATE, 3 for COUNTY)
df['STATE'] = df['STATE'].astype(str).str.pad(width=2, side='left', fillchar='0')
df['COUNTY'] = df['COUNTY'].astype(int).astype(str).str.pad(width=3, side='left', fillchar='0')
df['id'] = df['id'].astype(int)

In [31]:
## make county table
# rows at summary level 50
co = df.loc[df['SUMLEV'] == 50].copy()
# combined state + county code
co['stco'] = co['STATE'] + co['COUNTY']
co = co.drop(columns=['STATE', 'COUNTY', 'SUMLEV'])
# ids are written as zero-padded 5-character strings
co['id'] = co['id'].astype(str).str.rjust(5, fillchar='0')
# subregion looked up from the state+county code
co['subreg'] = co['stco'].map(sub_7)
co['geotype'] = 'county'
# (the CSV export of this table is currently disabled)

In [32]:
# make subregion table
# total the county rows within each subregion (rows with a missing
# subregion are kept as their own group)
sub = (
    co.drop(columns=['id', 'stco', 'name'])
    .groupby(['subreg'], dropna=False)
    .sum()
    .reset_index()
)
# label each subregion and reuse its code as the id
sub['name'] = sub['subreg'].map(sub_lbl)
sub['id'] = sub['subreg']
sub['geotype'] = 'subregion'

In [33]:
## make nyc subborough table
# rows at summary level 140
nyc = df.loc[df['SUMLEV'] == 140].copy()
nyc = nyc.drop(columns=['STATE', 'COUNTY', 'name', 'SUMLEV'])
# attach every crosswalk row to its tract data (crosswalk rows without a
# match are kept)
nyc = nyc_recode.merge(nyc, left_on='ct_id', right_on='id', how='left')
nyc = nyc.drop(columns=['ct_id', 'ct_name', 'id'])
# total the tracts within each sub-borough
nyc = nyc.groupby(['stco', 'id_sub', 'name']).sum().reset_index()
nyc = nyc.rename(columns={'id_sub': 'id'})
nyc['geotype'] = 'municipality'
nyc['subreg'] = nyc['stco'].astype(str).map(sub_7)

In [34]:
## make subplace table
# rows at summary level 160 or 60
subpl = df[df['SUMLEV'].isin([160, 60])]

# export the 2020 geographies as-is (not adjusted for longitudinal consistency)
subpl.to_csv('output/2020/pop_race_hou_subpl20.csv')

# table for longitudinal analysis
subpl = subpl.drop(columns=['STATE', 'COUNTY', 'SUMLEV', 'name'])
# attach every crosswalk row to its 2020 data (crosswalk rows without a
# match are kept); both tables have an 'id' column, hence id_x / id_y
subpl = subpl_recode.merge(subpl, left_on='geoid20', right_on='id', how='left')
subpl = subpl.drop(columns=['geoid20', 'name20', 'id_y'])
# total the 2020 geographies within each adjusted subplace
subpl = subpl.groupby(['stco', 'id_x', 'name']).sum().reset_index()
subpl = subpl.rename(columns={'id_x': 'id'})
subpl['geotype'] = 'municipality'
# zero-pad the code to 5 characters before the subregion lookup
subpl['subreg'] = subpl['stco'].astype(str).str.rjust(5, fillchar='0').map(sub_7)

### make a national file

In [35]:
# glob order is arbitrary; sort so the files are always merged in the same order
us_files = sorted(glob.glob('../data/us2020.npl/*.pl'))
if not us_files:
    # fail with a clear message instead of an IndexError on us_files[0]
    raise FileNotFoundError("no .pl files found in '../data/us2020.npl/'")

#for first file in folder
us = pd.read_table(us_files[0], sep='|', header=None, low_memory=False, encoding='unicode_escape')
# the character eight places from the end of the path selects the header list
us.columns = col_head[us_files[0][-8]]

#for all other files
for file in us_files[1:]:
    dff = pd.read_table(file, sep='|', header=None, low_memory=False, encoding='unicode_escape')
    dff.columns = col_head[file[-8]]
    us = pd.merge(us, dff, left_on=col_join, right_on=col_join, how="inner")

#reduce table size
us = us[us.SUMLEV == 10]  # keep only summary level 10
us = us[col_data].rename(columns=col_recode).drop(columns=['STATE', 'COUNTY', 'SUMLEV'])
# Add together the columns that now share a label (e.g. 'P_Other').
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1, so group the
# transposed frame by its row labels and transpose back.
us = us.T.groupby(level=0).sum().T

us['geotype'] = 'nation'

## make master 2020 table

In [36]:
# Stack the county, subregion, NYC sub-borough, subplace, and national tables.
master20 = pd.concat(objs=[co, sub, nyc, subpl, us])

In [37]:
# Put the identifier columns first; all other columns keep their current order.
id_col = ['id', 'name', 'stco', 'subreg']
other_cols = [label for label in master20.columns if label not in id_col]
master20 = master20[id_col + other_cols]
# tag every row with the year, stored as text
master20['yr'] = '2020'

In [38]:
# Write the combined 2020 table to disk.
master20.to_csv(path_or_buf='output/2020/pop_race_hou_allgeos.csv')

# COMBINE WITH 2000 & 2010

In [39]:
# Load the 2000/2010 table, dropping the unnamed index column saved in the CSV.
master0010 = pd.read_csv('output/2000_2010/pop_race_hou_allgeos.csv').drop(columns='Unnamed: 0')
#clean for table build
stco_int = master0010['stco'].astype('Int64')
# Zero-pad real codes to 5 characters. Rows with no stco must stay missing:
# converting a missing Int64 value to str yields '<NA>', which padding would
# otherwise turn into the literal text '0<NA>'.
master0010['stco'] = (
    stco_int.astype(str)
    .str.pad(width=5, side='left', fillchar='0')
    .where(stco_int.notna())
)
master0010['yr'] = master0010['yr'].astype(str)

In [40]:
# Stack the 2020 rows on top of the 2000/2010 rows and write the full table.
master = pd.concat(objs=[master20, master0010])
master.to_csv(path_or_buf='output/pop_race_hou_allgeos.csv')