In [1]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
from r_codes import geo_col,col1,col2,col3
from geo import stco_fips, metro_codes

In [3]:
col_head = {'o':geo_col,'1':col1,'2':col2,'3':col3}
col_join = ['LOGRECNO','STUSAB','FILEID','CHARITER']
col_data = ['STATE','COUNTY','GEOCODE','SUMLEV','POP100','HU100','P0010001','H0010001','H0010002','H0010003','NAME']

In [4]:
# set directory
folders = glob.glob('../data/red_metro_20/*')

### data pulling functions for regional data

In [5]:
def make_state(state):
    files = glob.glob(f'../data/red_metro_20/{state}2020.pl/*.pl')
    #for first file in folder
    df = pd.read_table(f'{files[0]}',sep='|',header=None,low_memory=False,encoding = 'unicode_escape')
    df.columns = col_head[files[0][-8]]

    #for all other files
    for file in files[1:]:
        dff = pd.read_table(f'{file}',sep='|',header=None,low_memory=False,encoding = 'unicode_escape')
        dff.columns = col_head[file[-8]]
        df = pd.merge(df,dff,left_on=col_join,right_on=col_join,how="inner")
    
    #reduce table size
    df = df[col_data] #just the data columns we need
    df = df[df.SUMLEV==50].copy() #just county data
    return df

In [6]:
def make_table(folders):
    df = pd.DataFrame()
    for folder in folders:
        state = folder[-9:-7]
        dff = make_state(state)
        df = pd.concat([df,dff])
    return df

### make various tables

In [7]:
#master table for counties
master = make_table(folders)

In [8]:
master.head()

Unnamed: 0,STATE,COUNTY,GEOCODE,SUMLEV,POP100,HU100,P0010001,H0010001,H0010002,H0010003,NAME
1,25,1.0,25001,50,228996,164885,228996,164885,103368,61517,Barnstable County
2,25,3.0,25003,50,129026,69759,129026,69759,57015,12744,Berkshire County
3,25,5.0,25005,50,579200,243464,579200,243464,229293,14171,Bristol County
4,25,7.0,25007,50,20600,17530,20600,17530,8932,8598,Dukes County
5,25,9.0,25009,50,809829,327185,809829,327185,309030,18155,Essex County


In [9]:
# pull in us metro list
geo = pd.read_csv('../data/geo/usmetros_cnty.csv')
geo['stco_id'] = geo['stco'].apply(lambda x:'{0:0>5}'.format(x))

In [10]:
cty = pd.merge(geo,master,left_on='stco_id',right_on='GEOCODE',how='left')

In [11]:
cty.head()

Unnamed: 0,csa_id,csa_name,st_id,co_id,stco,co_name,st_name,stco_id,STATE,COUNTY,GEOCODE,SUMLEV,POP100,HU100,P0010001,H0010001,H0010002,H0010003,NAME
0,122,"Atlanta--Athens-Clarke County--Sandy Springs, ...",1,17,1017,Chambers,Alabama,1017,1,17.0,1017,50,34772,16373,34772,16373,14238,2135,Chambers County
1,429,"Phoenix-Mesa, AZ",4,7,4007,Gila,Arizona,4007,4,7.0,4007,50,53272,32373,53272,32373,22312,10061,Gila County
2,429,"Phoenix-Mesa, AZ",4,21,4021,Pinal,Arizona,4021,4,21.0,4021,50,425264,172878,425264,172878,146663,26215,Pinal County
3,429,"Phoenix-Mesa, AZ",4,13,4013,Maricopa,Arizona,4013,4,13.0,4013,50,4420568,1812827,4420568,1812827,1643579,169248,Maricopa County
4,472,"Sacramento-Roseville, CA",6,57,6057,Nevada,California,6057,6,57.0,6057,50,102241,53627,102241,53627,42774,10853,Nevada County


In [12]:
metro = cty[['csa_id','csa_name','P0010001','H0010001','H0010002','H0010003']]
metro = metro.groupby(['csa_id','csa_name']).sum().reset_index()

In [13]:
metro

Unnamed: 0,csa_id,csa_name,P0010001,H0010001,H0010002,H0010003
0,122,"Atlanta--Athens-Clarke County--Sandy Springs, ...",6930423,2753914,2570167,183747
1,148,"Boston-Worcester-Providence, MA-RI-NH-CT",8466186,3604952,3319384,285568
2,172,"Charlotte-Concord, NC-SC",2822352,1179478,1098647,80831
3,176,"Chicago-Naperville, IL-IN-WI",9986960,4107480,3819959,287521
4,184,"Cleveland-Akron-Canton, OH",3633962,1662778,1525555,137223
5,206,"Dallas-Fort Worth, TX-OK",8121108,3163911,2947316,216595
6,216,"Denver-Aurora, CO",3623560,1503302,1414008,89294
7,220,"Detroit-Warren-Ann Arbor, MI",5424742,2350647,2176590,174057
8,288,"Houston-The Woodlands, TX",7312270,2821424,2577581,243843
9,348,"Los Angeles-Long Beach, CA",18644680,6595294,6205567,389727


In [14]:
metro.to_csv('output/metros_20.csv')