## Notebook to compile data for geographies in the NYC Metro from Census redistricting files

#### Requires local storage of Census zip files for each state as .pl format and place in folder directory

In [1]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
from r_codes import geo_col,col1,col2,col3
from geo import stco_fips, metro_codes

In [3]:
# if pulling different variables, replace data columns in col_data
col_head = {'o':geo_col,'1':col1,'2':col2,'3':col3}
col_join = ['LOGRECNO','STUSAB','FILEID','CHARITER']
col_data = ['STATE','COUNTY','GEOCODE','SUMLEV','POP100','HU100','P0010001',\
            'P0020001','P0020002','P0020003','P0020004','P0020005','P0020006','P0020007',\
            'P0020008','P0020009','P0020010','P0020011',\
            'H0010001','H0010002','H0010003','NAME']
sumlev = [40,50,60,160,140]

In [4]:
# set directory
folders = glob.glob('../data/red_20/*')

### data pulling functions for regional data

In [5]:
def make_state(state):
    files = glob.glob(f'../data/red_20/{state}2020.pl/*.pl')
    #for first file in folder
    df = pd.read_table(f'{files[0]}',sep='|',header=None,low_memory=False)
    df.columns = col_head[files[0][-8]]

    #for all other files
    for file in files[1:]:
        dff = pd.read_table(f'{file}',sep='|',header=None,low_memory=False)
        dff.columns = col_head[file[-8]]
        df = pd.merge(df,dff,left_on=col_join,right_on=col_join,how="inner")
    
    #reduce table size
    df = df[col_data] #just the data columns we need
    df = df[df.SUMLEV.isin(sumlev)].copy() #just the geo types we need
    df = df[df.GEOCODE.isin(metro_codes)].copy() #just the places in our region
    return df

In [6]:
def make_table(folders):
    df = pd.DataFrame()
    for folder in folders:
        state = folder[15:17]
        dff = make_state(state)
        df = pd.concat([df,dff])
    return df

### make various tables

In [7]:
#master regional table with counties, munis, and NYC tracts
master = make_table(folders)

In [8]:
county = master[master['SUMLEV']==50].copy()
county.to_csv('output/counties_20.csv')

In [9]:
#nyc tract to subborrough
nyc = master[master['SUMLEV']==140].copy()

In [10]:
nyc_recode = pd.read_csv('../data/geo/nyc_subbor_20.csv')
nyc_recode['GEOID20']=nyc_recode['GEOID20'].astype(str)

nyc = pd.merge(nyc_recode,nyc,left_on='GEOID20',right_on='GEOCODE',how='left')
nyc_sub = nyc[['STATEFP20','COUNTYFP20','Subbor_id','Subbor_nm','POP100','HU100','P0010001',\
            'P0020001','P0020002','P0020003','P0020004','P0020005','P0020006','P0020007',\
            'P0020008','P0020009','P0020010','P0020011',\
            'H0010001','H0010002','H0010003']]

In [11]:
#update columns for consistency with subpl table
nyc_sub['stco'] = nyc_sub['STATEFP20'].astype(str) + nyc_sub['COUNTYFP20'].astype(str).apply(lambda x:'{0:0>3}'.format(x))
nyc_sub = nyc_sub.rename(columns={'Subbor_id':'geoid','Subbor_nm':'name'})
nyc_sub = nyc_sub.drop(columns=['STATEFP20','COUNTYFP20'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyc_sub['stco'] = nyc_sub['STATEFP20'].astype(str) + nyc_sub['COUNTYFP20'].astype(str).apply(lambda x:'{0:0>3}'.format(x))


In [12]:
nyc_sub = nyc_sub.groupby(['stco','geoid','name']).sum().reset_index()

In [13]:
nyc_sub.head()


Unnamed: 0,stco,geoid,name,POP100,HU100,P0010001,P0020001,P0020002,P0020003,P0020004,P0020005,P0020006,P0020007,P0020008,P0020009,P0020010,P0020011,H0010001,H0010002,H0010003
0,36005,36005CS,BX Central and South,488720,176934,488720,488720,275049,213671,203837,36205,121583,1227,38360,157,6305,9834,176934,169664,7270
1,36005,36005NE,BX North and East,272546,108236,272546,272546,87739,184807,177983,40332,125441,600,8026,95,3489,6824,108236,102721,5515
2,36005,36005W,BX West,711388,261860,711388,711388,443675,267713,256001,54259,172369,1260,21380,205,6528,11712,261860,250065,11795
3,36047,36047C,BK Central,595377,233830,595377,595377,58718,536659,504091,112351,360365,948,22709,139,7579,32568,233830,219831,13999
4,36047,36047E,BK East,303777,111587,303777,303777,96051,207726,193980,9930,162190,1011,16180,99,4570,13746,111587,105312,6275


In [14]:
subpl20 = master[(master['SUMLEV']==160) | (master['SUMLEV']==60)]
subpl20.to_csv('output/subpl_20.csv') #2020 geos

In [15]:
#make adjusted subpl for calculation comparison over time
subpl_recode = pd.read_csv('../data/geo/subpl20.csv')

In [16]:
subpl_recode['geoid'] = subpl_recode['geoid'].astype(str)

In [17]:
subpl20 = subpl20.copy()
subpl20['joinid'] = subpl20['GEOCODE'].astype(int)

In [18]:
subpl_adj = pd.merge(subpl_recode,subpl20,left_on='id_20',right_on='joinid',how='left')


In [19]:
subpl_adj = subpl_adj[['stco','geoid','name','POP100','HU100','P0010001',\
            'P0020001','P0020002','P0020003','P0020004','P0020005','P0020006','P0020007',\
            'P0020008','P0020009','P0020010','P0020011',\
            'H0010001','H0010002','H0010003']]
subpl_adj = subpl_adj.groupby(['stco','geoid','name']).sum().reset_index()


In [20]:
subpl_final = pd.concat([nyc_sub,subpl_adj])
subpl_final.to_csv('output/subpl_20_adj.csv')


## national file

In [21]:
us_files = glob.glob(f'../data/us2020.npl/*.pl')
#for first file in folder
us = pd.read_table(f'{us_files[0]}',sep='|',header=None,low_memory=False,encoding = 'unicode_escape')
us.columns = col_head[us_files[0][-8]]

#for all other files
for file in us_files[1:]:
    dff = pd.read_table(f'{file}',sep='|',header=None,low_memory=False,encoding = 'unicode_escape')
    dff.columns = col_head[file[-8]]
    us = pd.merge(us,dff,left_on=col_join,right_on=col_join,how="inner")

#reduce table size
us = us[col_data] #just the data columns we need

In [22]:
us.head()

Unnamed: 0,STATE,COUNTY,GEOCODE,SUMLEV,POP100,HU100,P0010001,P0020001,P0020002,P0020003,...,P0020006,P0020007,P0020008,P0020009,P0020010,P0020011,H0010001,H0010002,H0010003,NAME
0,,,,10,331449281,140498736,331449281,331449281,62080044,269369237,...,39940338,2251699,19618719,622018,1689833,13548983,140498736,126817580,13681156,United States
1,,,1.0,20,57609148,24716516,57609148,57609148,8794868,48814280,...,6207939,112024,4143379,15069,465485,2117386,24716516,22371124,2345392,Northeast Region
2,,,2.0,20,68985454,30458979,68985454,68985454,5978786,63006668,...,7111553,382776,2385691,38022,242361,2763713,30458979,27564312,2894667,Midwest Region
3,,,3.0,20,126266107,54542603,126266107,126266107,23696746,102569361,...,23153574,722756,4757911,100255,565727,4875703,54542603,48613340,5929263,South Region
4,,,4.0,20,78588572,30780638,78588572,78588572,23609644,54978928,...,3467272,1034143,8331738,468672,416260,3792181,30780638,28268804,2511834,West Region


In [23]:
us.to_csv('output/us_20.csv')