To combine data from multiple sources, I created a crosswalk between ZIP codes, cities, counties, and states. Each dataset is reported in a different geographic unit: ZIP code, city name, county, state, or census geographic code.  
In this notebook, I go through the process of creating the crosswalk. 

In [1]:
import requests

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Silence every warning for the rest of the notebook session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings raised by later cells.
warnings.filterwarnings("ignore")


# Creating a crosswalk between Zip codes, Cities, Counties, States, and Census geographic codes  

###  ZIP to CBSA, CBSA Division, Congressional District, and County 
- Downloaded the ZIP code crosswalk files (ZIP to Core Based Statistical Area (CBSA), CBSA Division, Congressional District, and County) from https://www.huduser.gov/portal/datasets/usps_crosswalk.html  

In [3]:
# Load the four ZIP crosswalk workbooks.
# The CBSA, CBSA-division and congressional-district tables are trimmed to their
# first four columns right away; the county table keeps every column because its
# ratio columns are needed by the tie-breaking cells that follow.
cbsa = pd.read_excel('Data/Crosswalk/ZIP_CBSA_092021.xlsx').iloc[:, :4]
cbsadv = pd.read_excel('Data/Crosswalk/ZIP_CBSA_DIV_092021.xlsx').iloc[:, :4]
cd = pd.read_excel('Data/Crosswalk/ZIP_CD_092021.xlsx').iloc[:, :4]
county = pd.read_excel('Data/Crosswalk/ZIP_COUNTY_092021.xlsx')

# NOTE: an outer merge of county, cbsadv, cd and cbsa on 'ZIP' (followed by
# drop_duplicates) used to live here but is disabled.

In [None]:
# If a ZIP code maps to several counties, keep only the county row(s) with the
# highest TOT_RATIO for that ZIP.

# Per-ZIP maximum of TOT_RATIO, aligned row-by-row with `county`.
# transform('max') aggregates only the column that is needed and does not rename
# TOT_RATIO to TOT_RATIO_x / TOT_RATIO_y the way a merge does, so this cell can
# be re-run without a KeyError.
tot_ratio_max = county.groupby('ZIP')['TOT_RATIO'].transform('max')

# Rows below the per-ZIP maximum are dropped; rows tied at the maximum all
# survive and are resolved by the later tie-break cells.
county = county[county['TOT_RATIO'] == tot_ratio_max]

In [None]:
# If TOT_RATIO is tied between counties (e.g. a 50:50 split), use RES_RATIO to
# pick the county.

# Per-ZIP maximum of RES_RATIO, aligned row-by-row with `county`.
# transform('max') keeps the RES_RATIO column name intact (no _x / _y suffixes),
# so the cell is safe to re-run, and it avoids aggregating unrelated columns.
res_ratio_max = county.groupby('ZIP')['RES_RATIO'].transform('max')

# Keep only the rows that reach the per-ZIP maximum RES_RATIO.
county = county[county['RES_RATIO'] == res_ratio_max]


In [None]:
# If TOT_RATIO and RES_RATIO are both tied, use BUS_RATIO to pick the county.

# Per-ZIP maximum of BUS_RATIO, aligned row-by-row with `county`.
# transform('max') keeps the BUS_RATIO column name intact (no _x / _y suffixes),
# so the cell is safe to re-run, and it avoids aggregating unrelated columns.
bus_ratio_max = county.groupby('ZIP')['BUS_RATIO'].transform('max')

# Keep only the rows that reach the per-ZIP maximum BUS_RATIO.
county = county[county['BUS_RATIO'] == bus_ratio_max]

# Lastly, for ZIP code 51603, drop the entry for county 19071.
# A boolean mask is used instead of an in-place, index-based drop so the result
# does not depend on the index labels of the filtered frame.
is_zip_51603_second_county = (county['ZIP'] == 51603) & (county['COUNTY'] == 19071)
county = county[~is_zip_51603_second_county]

In [None]:
# Keep only the four leading columns; the ratio columns were needed just for the tie-breaks.
county = county[county.columns[:4]]

In [None]:
# Distinct values per column. If the tie-breaks worked, the ZIP count should equal the row count (compare with county.shape).
county.nunique()

In [None]:
# (rows, columns) of the county crosswalk at this point.
county.shape

In [None]:
# Spot check: show every ZIP row assigned to county code 36061.
county.loc[county['COUNTY'] == 36061]

In [None]:
# Preview the first rows of the county crosswalk.
county.head()

## Merge the county crosswalk with the Economic Tracker GeoID-County and GeoID-City files, because these data include the latitude and longitude of each city center. 

## Use GeoIDs-County.csv to link county information and zip codes

In [None]:
# Link each ZIP row of `county` to the Economic Tracker county attributes via the county FIPS code.

# GeoIDs - County.csv was downloaded from the Economic Tracker repository at
# https://github.com/OpportunityInsights/EconomicTracker/tree/main/data
geoid_county = pd.read_csv('Data/EconomicTracker-main/data/GeoIDs - County.csv')

# Right join: every ZIP row of `county` is kept, even when its COUNTY code has
# no match in geoid_county (those rows get NaN in the geoid_county columns).
county_zip = geoid_county.merge(county, how='right', left_on='countyfips', right_on='COUNTY')

# Drop rows whose ZIP code is missing. dropna returns a new frame, so the result
# must be assigned back; calling it without assignment has no effect.
county_zip = county_zip.dropna(subset=['ZIP'])

county_zip.head()

In [None]:
# Distinct values per column of the merged ZIP/county table.
county_zip.nunique()

In [None]:
# (rows, columns) of the merged ZIP/county table.
county_zip.shape

In [None]:
# Number of rows per czname value.
county_zip.value_counts('czname')

In [None]:
# Number of rows per cityname value.
county_zip.value_counts('cityname')

## For a zipcode in a greater economic commercial zone, assign nearest city name and its lon&lat. 

In [None]:
# The cities listed in GeoIDs - City.csv are treated as the centres of the large economic zones.
# A ZIP row receives a nearest_city (plus that city's cz_lat / cz_lon) only when its czname
# equals one of those city names; every other row gets NaN from the left merge below.

czone = pd.read_csv('Data/EconomicTracker-main/data/GeoIDs - City.csv')

# Rename 'Washington' to 'Washington DC' so the city name matches the czname spelling
is_washington = czone['cityname'] == 'Washington'
czone['cityname'] = czone['cityname'].mask(is_washington, 'Washington DC')

# Drop the columns that are not needed and give the remaining ones zone-specific names
czone = (
    czone
    .drop(columns=['cityid', 'stateabbrev', 'statename', 'city_pop2019', 'statefips'])
    .rename(columns={'lat': 'cz_lat', 'lon': 'cz_lon', 'cityname': 'nearest_city'})
)

# Attach the zone city and its coordinates to every ZIP row
df = county_zip.merge(czone, how='left', left_on='czname', right_on='nearest_city')
df.head()

In [None]:
# (rows, columns) after attaching the zone city columns.
df.shape

## Because my focus is housing price movement in urban vs. suburban areas, I drop the zip codes outside of the 52 greater economic zones. 

In [None]:
# Drop ZIP rows outside the greater economic zones (nearest_city is missing when
# czname matched no city in the left merge) and rows with a missing ZIP code.
df = df.dropna(subset=['nearest_city', 'ZIP'])

# drop_duplicates returns a new frame, so the result must be assigned back;
# calling it without assignment has no effect.
df = df.drop_duplicates()

# Show a preview instead of dumping the whole frame
df.head()

## Assign latitude and longitude for each zipcode 

Using data from http://data.nber.org/data/zip-code-distance-database.html, I assign a latitude and longitude to each zipcode. 

In [None]:
# Read latitude and longitude data from http://data.nber.org/data/zip-code-distance-database.html
# NOTE: this cell downloads the file over the network on every run.

url = 'http://data.nber.org/distance/2016/gaz/zcta5/gaz2016zcta5centroid.csv'
lonlat = pd.read_csv(url)

# Rename zcta5 (5-digit zipcode) to ZIP to match df, then remove duplicate rows.
# Both calls return new frames, so the result is assigned back; the original
# bare drop_duplicates() call discarded its result.
lonlat = lonlat.rename(columns={'zcta5': 'ZIP'}).drop_duplicates()

lonlat.info()

In [None]:
# Attach the ZIP centroid coordinates to df; ZIP codes absent from lonlat get NaN coordinates.
df_lonlat = df.merge(lonlat, how='left', on='ZIP')

# Count missing coordinates by column name instead of taking "the last column"
# positionally, so the count does not depend on column order and prints as a
# plain number rather than a one-element Series.
n_missing_coords = df_lonlat['intptlong'].isnull().sum()
print('The number of zip codes which longitudes and latitudes are missing:', n_missing_coords)

In [None]:
# (rows, columns) after attaching the ZIP centroid coordinates.
df_lonlat.shape

I use the longitudes and latitudes to calculate the distance from the city center and to assign urban and suburban areas, which is a core indicator of my analysis and model. So, I drop a zip code if it has no longitude/latitude information. 

In [None]:
# Remove, in place, every row whose ZIP centroid longitude is missing,
# then summarise the remaining columns.
df_lonlat.dropna(subset=['intptlong'], inplace=True)
df_lonlat.info()

In [None]:
# Distinct values per column after dropping rows without coordinates.
df_lonlat.nunique()

## For each zipcode, calculate the distance to the city center (the location of city hall) of the nearest city.  
I calculate the distance with the Haversine formula, using the longitude and latitude of the two points. 

In [None]:
# Define Haversine Function 

def haversine_vectorize(lon1, lat1, lon2, lat2, radius_km=6367):
    """Great-circle distance between two points using the haversine formula.

    Works element-wise on scalars, NumPy arrays or pandas Series.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    radius_km : float, default 6367
        Sphere radius that scales the central angle into a distance. The
        default is the value this notebook has always used, giving
        kilometres; pass 3958 to get miles instead.

    Returns
    -------
    float or array-like
        Distance(s) in the unit of ``radius_km``, with the same shape as the
        broadcast inputs.
    """
    # The trigonometric functions below expect radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    haver_formula = (
        np.sin(delta_lat / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2.0) ** 2
    )
    # Floating-point rounding can push the value marginally outside [0, 1]
    # (e.g. for near-antipodal points), which would make sqrt/arcsin return NaN.
    haver_formula = np.clip(haver_formula, 0.0, 1.0)

    central_angle = 2 * np.arcsin(np.sqrt(haver_formula))
    return radius_km * central_angle


In [None]:
# Distance from each ZIP centroid to the centre of its zone city, via the haversine helper.
city_lon, city_lat = df_lonlat['cz_lon'], df_lonlat['cz_lat']
zip_lon, zip_lat = df_lonlat['intptlong'], df_lonlat['intptlat']

df_lonlat['distance'] = haversine_vectorize(city_lon, city_lat, zip_lon, zip_lat)

In [None]:
# Display the rows ordered by distance (ascending). The result is not assigned, so df_lonlat itself stays unsorted.
df_lonlat.sort_values(by='distance')

In [None]:
# Remove fully duplicated rows in place; later cells add columns to this same object.
df_lonlat.drop_duplicates(inplace=True)

# Assign Central Business District (CBD)
### CBD10 = 1 if 'distance' < 10 km and CBD20 = 1 if 'distance' < 20 km


In [None]:
# Build one 0/1 flag per distance threshold:
# cbd10 = 1 when 'distance' < 10, cbd20 = 1 when 'distance' < 20.
for threshold in (10, 20):
    flag_name = f'cbd{threshold}'
    df_lonlat[flag_name] = (df_lonlat['distance'] < threshold).astype('int')
    print ( df_lonlat[flag_name].value_counts())


df_lonlat.head()

In [None]:
# Distinct USPS preferred city names among rows whose cityname is 'New York City'.
set(df_lonlat.loc[df_lonlat['cityname'] == 'New York City', 'USPS_ZIP_PREF_CITY'])

### Create a county in a city dummy. 
If a county is in a major city, I assign county_in_city = 1. 

In [None]:
# county_in_city = 1 when the row carries a positive cityid, otherwise 0
# (a missing cityid compares as False and therefore becomes 0).
df_lonlat['county_in_city'] = df_lonlat['cityid'].gt(0).astype('int')

# cityid and cityname are not needed once the dummy exists
df_lonlat.drop(columns=['cityid', 'cityname'], inplace=True)

print ( df_lonlat['county_in_city'].value_counts())

df_lonlat.head()

In [None]:
# Column dtypes and non-null counts before the final clean-up.
df_lonlat.info()

In [None]:
# Clean up df_lonlat: remove the listed columns from the final table,
# then remove any rows that became duplicates.
col_drop = ['cz', 'COUNTY','USPS_ZIP_PREF_STATE','cz_lat', 'cz_lon', 'intptlat', 'intptlong' ]

df_geo = df_lonlat.drop(columns=col_drop).drop_duplicates()
df_geo.head()


## Now I have df_geo. It links county, state, zip codes, and has distance to the city center. df_geo also has a  dummy of central business district

In [None]:
# Write the final crosswalk to disk without the DataFrame index column.
df_geo.to_csv('Data/df_geo.csv', index = False)

In [None]:
# Distinct czname values present in the exported table.
df_geo.czname.unique()

In [None]:
# (rows, columns) of the exported table.
df_geo.shape