### Main script to clean ZCTA and county data

Modules: N/A
Author: Cornelia Ilin <br>
Date: Feb 15, 2022 <br>
Email: cilin@stanford.edu


#### Citations (online sources):
1. Geometry of ZIP codes in California (2010 boundaries) <br>
    https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=ZIP+Code+Tabulation+Areas

2. Geometry for County codes in California (2010 boundaries) <br>
    https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=Counties+%28and+equivalent%29
    
3. ZCTA to county codes in California (2010 boundaries) <br>
    https://www.census.gov/programs-surveys/geography/technical-documentation/records-layout/2010-zcta-record-layout.html

### Step 1: Import packages

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os

import warnings
warnings.filterwarnings("ignore")

### Step 2: Set working directories

In [None]:
# root folder with the raw census geography inputs;
# each input type lives in its own sub-folder below the root
in_dir_root = 'C:/Users/cilin/Research/CA_Hospitals/Input/raw_data/census_geo/'
in_dir_shp_zcta = f'{in_dir_root}shapefiles_zcta/'
in_dir_shp_county = f'{in_dir_root}shapefiles_county/'
in_dir_zcta_to_county = f'{in_dir_root}zcta_to_county/'

### Step 3: Define functions

``read data``

In [None]:
def read_zcta_geom():
    ''' Read ZCTA geometry for California, 2010 Census boundaries

    Looks for shapefiles (.shp) in in_dir_shp_zcta and reads one of them
    with geopandas.

    params:
    -------
    None

    return:
    -------
    gdf with the content of the ZCTA shapefile

    raises:
    -------
    FileNotFoundError if in_dir_shp_zcta contains no .shp file
    '''
    # list the shapefiles for California's ZIP codes
    # (sorted, because os.listdir returns entries in arbitrary order)
    shp_files = sorted(
        file for file in os.listdir(in_dir_shp_zcta)
        if file.endswith('.shp')
    )

    # fail with a clear message instead of an unbound local variable
    if not shp_files:
        raise FileNotFoundError(
            f'No .shp file found in {in_dir_shp_zcta}'
        )

    # if several shapefiles are present only the last one is used,
    # so read just that one instead of loading and discarding the others
    return gpd.read_file(in_dir_shp_zcta + shp_files[-1])

In [None]:
def read_county_geom():
    '''Read county geometry for California, 2010 Census boundaries

    Looks for shapefiles (.shp) in in_dir_shp_county and reads one of them
    with geopandas.

    params:
    -------
    None

    return:
    -------
    gdf with the content of the county shapefile

    raises:
    -------
    FileNotFoundError if in_dir_shp_county contains no .shp file
    '''
    # list the shapefiles for California's county codes
    # (sorted, because os.listdir returns entries in arbitrary order)
    shp_files = sorted(
        file for file in os.listdir(in_dir_shp_county)
        if file.endswith('.shp')
    )

    # fail with a clear message instead of an unbound local variable
    if not shp_files:
        raise FileNotFoundError(
            f'No .shp file found in {in_dir_shp_county}'
        )

    # if several shapefiles are present only the last one is used,
    # so read just that one instead of loading and discarding the others
    return gpd.read_file(in_dir_shp_county + shp_files[-1])

In [None]:
def read_zcta_to_county():
    """ Read ZCTA to County files, 2010 Census

    Looks for .csv files in in_dir_zcta_to_county and reads one of them
    with pandas.

    params:
    -------
    None

    return:
    -------
    df with ZCTA to County info

    raises:
    -------
    FileNotFoundError if in_dir_zcta_to_county contains no .csv file
    """
    # list the csv files with the ZCTA to county relationship
    # (sorted, because os.listdir returns entries in arbitrary order)
    csv_files = sorted(
        file for file in os.listdir(in_dir_zcta_to_county)
        if file.endswith('.csv')
    )

    # fail with a clear message instead of an unbound local variable
    if not csv_files:
        raise FileNotFoundError(
            f'No .csv file found in {in_dir_zcta_to_county}'
        )

    # if several csv files are present only the last one is used,
    # so read just that one instead of loading and discarding the others
    return pd.read_csv(in_dir_zcta_to_county + csv_files[-1])

``clean data``

In [None]:
def clean_zcta_geom(gdf):
    '''Clean the ZCTA geometry data

    The input gdf is not modified; a new gdf is returned.

    params:
    -------
    gdf: GeoDataFrame with (at least) the columns ZCTA5CE10 and geometry

    return:
    -------
    gdf with one row per ZCTA10, sorted by ZCTA10, and the columns
    ZCTA10, ZCTA10_geometry, ZCTA10_centroid
    '''
    # rename, drop duplicates, sort, reset index
    # (no inplace operations, so the caller's gdf is left untouched)
    gdf = (
        gdf.rename(columns={'ZCTA5CE10': 'ZCTA10'})
        .drop_duplicates(subset=['ZCTA10'])
        .sort_values(by=['ZCTA10'])
        .reset_index(drop=True)
    )

    # transform ZCTA10 to str
    gdf['ZCTA10'] = gdf.ZCTA10.astype(str)

    # add ZCTA10 centroid
    # the centroid is computed in a projected CRS (equal area cylindrical)
    # and then converted back to the CRS of the input geometry
    gdf['ZCTA10_centroid'] = gdf.to_crs('+proj=cea').centroid.to_crs(gdf.crs)

    # rename the geometry column only after the centroid is computed,
    # because to_crs/centroid above work on the column named 'geometry'
    gdf = gdf.rename(columns={'geometry': 'ZCTA10_geometry'})

    # keep only cols of interest
    return gdf[['ZCTA10', 'ZCTA10_geometry', 'ZCTA10_centroid']]

In [None]:
def clean_county_geom(gdf):
    '''Clean the county geometry data

    The input gdf is not modified; a new gdf is returned.

    params:
    -------
    gdf: GeoDataFrame with (at least) the columns COUNTYFP10, NAME10
         and geometry

    return:
    -------
    gdf with one row per CNTY10, sorted by CNTY10, and the columns
    CNTY10, CNTY10_name, CNTY10_geometry, CNTY10_centroid
    '''
    # rename, drop duplicates, sort, reset index
    # (no inplace operations, so the caller's gdf is left untouched)
    gdf = (
        gdf.rename(
            columns={
                'COUNTYFP10': 'CNTY10',
                'NAME10': 'CNTY10_name',
            }
        )
        .drop_duplicates(subset=['CNTY10'])
        .sort_values(by=['CNTY10'])
        .reset_index(drop=True)
    )

    # transform CNTY10 to str
    gdf['CNTY10'] = gdf.CNTY10.astype(str)

    # add CNTY10 centroid
    # use Equal area cylindrical projection:
    # https://gis.stackexchange.com/questions/372564/userwarning-when-trying-to-get-centroid-from-a-polygon-geopandas
    gdf['CNTY10_centroid'] = gdf.to_crs('+proj=cea').centroid.to_crs(gdf.crs)

    # rename the geometry column only after the centroid is computed,
    # because to_crs/centroid above work on the column named 'geometry'
    gdf = gdf.rename(columns={'geometry': 'CNTY10_geometry'})

    # keep only cols of interest
    return gdf[['CNTY10', 'CNTY10_name', 'CNTY10_geometry', 'CNTY10_centroid']]

In [None]:
def clean_zcta_to_county(df):
    '''Clean the ZCTA to county relationship data

    The input df is not modified; a new df is returned.

    params:
    -------
    df: DataFrame with (at least) the columns STATE, ZCTA5, COUNTY,
        ZPOP and COPOP

    return:
    -------
    df restricted to rows with STATE == 6, sorted by CNTY10, with the
    columns ZCTA10, CNTY10, ZCTA10_POP, CNTY10_POP; ZCTA10 and CNTY10
    are strings and CNTY10 is left-padded with zeros to 3 characters
    '''
    # keep only state of CA
    # .copy() so that the assignments below work on an independent frame
    # and not on a slice of the caller's df
    df = df[df.STATE == 6].copy()

    # rename columns
    df = df.rename(
        columns={
            'ZCTA5': 'ZCTA10',
            'COUNTY': 'CNTY10',
            'ZPOP': 'ZCTA10_POP',
            'COPOP': 'CNTY10_POP',
        }
    )

    # transform ZCTA10 and CNTY10 to string
    df['ZCTA10'] = df.ZCTA10.astype(str)

    # transform CNTY10 into 3 digits (e.g. '1' -> '001', '37' -> '037')
    df['CNTY10'] = df.CNTY10.astype(str).str.zfill(3)

    # sort by county code, reset index
    df = df.sort_values(by=['CNTY10']).reset_index(drop=True)

    # keep only cols of interest
    return df[['ZCTA10', 'CNTY10', 'ZCTA10_POP', 'CNTY10_POP']]

---
### Step 4: Read data

``geometry``

In [None]:
# read the raw ZCTA geometry and preview the first two rows
gdf_zcta = read_zcta_geom()
gdf_zcta.head(2)

In [None]:
# read the raw county geometry and preview the first two rows
gdf_county = read_county_geom()
gdf_county.head(2)

In [None]:
# read the raw ZCTA to county table and preview the first two rows
df_zcta_to_county = read_zcta_to_county()
df_zcta_to_county.head(2)

---
### Step 5: Preprocess data

``geometry``

In [None]:
# replace the raw ZCTA geometry with its cleaned version and preview it
gdf_zcta = clean_zcta_geom(gdf_zcta)
gdf_zcta.head(2)

In [None]:
# spot check: show the row(s) of the cleaned data with ZCTA10 equal to '91941'
gdf_zcta[gdf_zcta.ZCTA10.eq('91941')]

In [None]:
# replace the raw county geometry with its cleaned version and preview it
gdf_county = clean_county_geom(gdf_county)
gdf_county.head(2)

In [None]:
# replace the raw ZCTA to county table with its cleaned version and preview it
df_zcta_to_county = clean_zcta_to_county(df_zcta_to_county)
df_zcta_to_county.head(2)

---
### Step 6: Merge data

In [None]:
# merge df_zcta_to_county and gdf_county on the county code;
# how='right' keeps every row of df_zcta_to_county, including rows whose
# CNTY10 has no match in gdf_county (their county columns are missing)
# NOTE: gdf_county is overwritten with the merged result, so re-running this
# cell without re-running Step 5 merges the already merged frame again
gdf_county = gdf_county.merge(
    df_zcta_to_county,
    on='CNTY10', 
    how='right'
)
gdf_county.head(2)

In [None]:
# merge gdf_county and gdf_zcta on the ZCTA code;
# how='left' keeps every row of gdf_zcta; a ZCTA10 that appears in several
# rows of gdf_county produces one output row per matching row
gdf_zcta_county = gdf_zcta.merge(
    gdf_county, 
    on=['ZCTA10'],
    how='left'
)
gdf_zcta_county.head(2)