In [64]:
import pandas as pd
import numpy as np
import os
import glob

In [65]:
from r_codes import geo_col,col1,col2,col3
from geo import stco_fips, metro_codes

In [66]:
# Column-name lists for each input file, keyed by the character found at
# position -8 of the file's path (see make_state).
col_head = {
    'o': geo_col,
    '1': col1,
    '2': col2,
    '3': col3,
}

# Key columns used to join the files of one folder together.
col_join = ['LOGRECNO', 'STUSAB', 'FILEID', 'CHARITER']

# Columns kept once the files have been joined.
col_data = [
    'STATE', 'COUNTY', 'GEOCODE', 'SUMLEV', 'POP100', 'HU100', 'P0010001',
    'H0010001', 'H0010002', 'H0010003', 'NAME',
]

# SUMLEV values kept by make_state; later cells split out 50 (counties),
# 140 (NYC tracts) and 60/160 (subplaces).
sumlev = [40, 50, 60, 160, 140]

In [67]:
# Per-state data folders under ../data/red_20.
# glob returns matches in arbitrary order and '*' also matches plain files
# (e.g. archives sitting next to the extracted folders), so keep directories
# only and sort them: each state is then processed exactly once and the row
# order of the output tables is the same on every run.
folders = sorted(
    path for path in glob.glob('../data/red_20/*') if os.path.isdir(path)
)

### data pulling functions for regional data

In [68]:
def make_state(state):
    """Build the regional table for one state from its ``.pl`` files.

    Every ``*.pl`` file in ``../data/red_20/{state}2020.pl/`` is read as a
    pipe-delimited table without a header row, labelled with the column list
    selected from ``col_head`` by the character at position -8 of its path,
    and inner-joined with the others on ``col_join``.

    Parameters
    ----------
    state : str
        State prefix used in the folder name (``{state}2020.pl``).

    Returns
    -------
    pandas.DataFrame
        The ``col_data`` columns of the rows whose ``SUMLEV`` is in ``sumlev``
        and whose ``GEOCODE`` is in ``metro_codes``.

    Raises
    ------
    FileNotFoundError
        If no ``.pl`` file is found for ``state``.
    """
    pattern = f'../data/red_20/{state}2020.pl/*.pl'
    # glob order is arbitrary; sort so the merge order is identical on every run
    files = sorted(glob.glob(pattern))
    if not files:
        # this case used to surface as an opaque IndexError on files[0]
        raise FileNotFoundError(f'no .pl files match {pattern!r}')

    df = None
    for file in files:
        part = pd.read_table(file, sep='|', header=None, low_memory=False)
        part.columns = col_head[file[-8]]
        # the first file seeds the table, every later one is joined onto it
        df = part if df is None else pd.merge(df, part, on=col_join, how="inner")

    #reduce table size: only the geo types and places we need ...
    keep = df.SUMLEV.isin(sumlev) & df.GEOCODE.isin(metro_codes)
    # ... and only the data columns we need
    return df.loc[keep, col_data].copy()

In [69]:
def make_table(folders):
    """Stack the per-state tables for all ``folders`` into one DataFrame.

    Parameters
    ----------
    folders : iterable of str
        Paths of state folders; the first two characters of each folder name
        are passed to ``make_state`` as the state prefix.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the ``make_state`` results, in the order the
        folders are given, with each state's original index labels kept.
        An empty DataFrame when ``folders`` is empty.
    """
    frames = []
    for folder in folders:
        # read the prefix from the folder name itself instead of a fixed
        # character offset that is only right for '../data/red_20/...'
        state = os.path.basename(os.path.normpath(folder))[:2]
        frames.append(make_state(state))

    if not frames:
        return pd.DataFrame()
    # a single concat avoids re-copying the growing table on every iteration
    return pd.concat(frames)

### make various tables

In [70]:
# master regional table with counties, munis, and NYC tracts;
# every table in the cells below is a SUMLEV slice of this frame
master = make_table(folders)

In [71]:
# county table: the SUMLEV 50 rows of the master table, saved as-is
is_county = master['SUMLEV'].eq(50)
county = master.loc[is_county].copy()
county.to_csv('output/counties_20.csv')

In [72]:
# NYC tract to sub-borough: start from the tract rows (SUMLEV 140)
is_tract = master['SUMLEV'].eq(140)
nyc = master.loc[is_tract].copy()

In [73]:
# tract -> sub-borough lookup; GEOID20 is cast to str before it is matched
# against GEOCODE
nyc_recode = pd.read_csv('../data/geo/nyc_subbor_20.csv')
nyc_recode['GEOID20'] = nyc_recode['GEOID20'].astype(str)

# left join: every tract listed in the lookup is kept, matched or not
nyc = nyc_recode.merge(nyc, how='left', left_on='GEOID20', right_on='GEOCODE')

# keep the sub-borough identifiers and the count columns only
subbor_cols = [
    'STATEFP20', 'COUNTYFP20', 'Subbor_id', 'Subbor_nm',
    'POP100', 'HU100', 'P0010001', 'H0010001', 'H0010002', 'H0010003',
]
nyc_sub = nyc[subbor_cols]

In [74]:
# total the tract counts within each sub-borough
subbor_keys = ['STATEFP20', 'COUNTYFP20', 'Subbor_id', 'Subbor_nm']
nyc_sub = (
    nyc_sub
    .groupby(subbor_keys)
    .sum()
    .reset_index()
)

In [75]:
# preview the first rows of the sub-borough totals
nyc_sub.head()

Unnamed: 0,STATEFP20,COUNTYFP20,Subbor_id,Subbor_nm,POP100,HU100,P0010001,H0010001,H0010002,H0010003
0,36,5,36005CS,BX Central and South,488720,176934,488720,176934,169664,7270
1,36,5,36005NE,BX North and East,272546,108236,272546,108236,102721,5515
2,36,5,36005W,BX West,711388,261860,711388,261860,250065,11795
3,36,47,36047C,BK Central,595377,233830,595377,233830,219831,13999
4,36,47,36047E,BK East,303777,111587,303777,111587,105312,6275


In [76]:
# rename columns to join with subplace table
subbor_renames = {
    'STATEFP20': 'STATE',
    'COUNTYFP20': 'COUNTY',
    'Subbor_id': 'geoid',
    'Subbor_nm': 'name',
}
nyc_sub = nyc_sub.rename(columns=subbor_renames)

In [77]:
# subplace rows: SUMLEV 160 or 60, saved on their 2020 geographies
subpl20 = master[master['SUMLEV'].isin([160, 60])]
subpl20.to_csv('output/subpl_20.csv') #2020 geos

In [78]:
# make adjusted subpl for calculation comparison over time:
# load the recode table that the 2020 subplaces are joined to (on id_20)
subpl_recode = pd.read_csv('../data/geo/subpl20.csv')

In [79]:
# store geoid as text
subpl_recode = subpl_recode.astype({'geoid': str})

In [80]:
# work on a copy so that adding a column does not touch the slice of master
subpl20 = subpl20.copy()
# integer key for the join against id_20. Use an explicit 64-bit type: plain
# `int` maps to a 32-bit integer on some platforms (Windows with NumPy < 2),
# which overflows for GEOCODE values above 2,147,483,647.
subpl20['joinid'] = subpl20['GEOCODE'].astype('int64')

In [81]:
# left join: keep every row of the recode table, matched on id_20 == joinid
subpl_adj = subpl_recode.merge(
    subpl20,
    how='left',
    left_on='id_20',
    right_on='joinid',
)

In [82]:
# keep the adjusted-geography identifiers plus the count columns, then total
# the counts within each adjusted unit
adj_keys = ['stco', 'geoid', 'name']
count_cols = ['POP100', 'HU100', 'P0010001', 'H0010001', 'H0010002', 'H0010003']
subpl_adj = (
    subpl_adj[adj_keys + count_cols]
    .groupby(adj_keys)
    .sum()
    .reset_index()
)

In [83]:
# stack the NYC sub-boroughs on top of the adjusted subplaces and save
# NOTE(review): nyc_sub carries STATE/COUNTY while subpl_adj carries stco, so
# each of those columns is NaN for the other table's rows - confirm intended
subpl_parts = [nyc_sub, subpl_adj]
subpl_final = pd.concat(subpl_parts)
subpl_final.to_csv('output/subpl_20_adj.csv')

## national file

In [87]:
# national files: same read-and-join steps as make_state, but read with
# encoding='unicode_escape' and without the SUMLEV / GEOCODE filters
us_files = glob.glob('../data/us2020.npl/*.pl')
us_read_opts = dict(sep='|', header=None, low_memory=False, encoding='unicode_escape')

# the first file in the folder seeds the table
us = pd.read_table(us_files[0], **us_read_opts)
us.columns = col_head[us_files[0][-8]]

# every other file is inner-joined onto it on the shared key columns
for file in us_files[1:]:
    dff = pd.read_table(file, **us_read_opts)
    dff.columns = col_head[file[-8]]
    us = us.merge(dff, left_on=col_join, right_on=col_join, how="inner")

# reduce table size: just the data columns we need
us = us[col_data]

In [88]:
# preview the first rows of the national table
us.head()

Unnamed: 0,STATE,COUNTY,GEOCODE,SUMLEV,POP100,HU100,P0010001,H0010001,H0010002,H0010003,NAME
0,,,,10,331449281,140498736,331449281,140498736,126817580,13681156,United States
1,,,1.0,20,57609148,24716516,57609148,24716516,22371124,2345392,Northeast Region
2,,,2.0,20,68985454,30458979,68985454,30458979,27564312,2894667,Midwest Region
3,,,3.0,20,126266107,54542603,126266107,54542603,48613340,5929263,South Region
4,,,4.0,20,78588572,30780638,78588572,30780638,28268804,2511834,West Region


In [89]:
# save the national table (col_data columns only, all rows)
us.to_csv('output/us_20.csv')