# Create county information dataset

In [1]:
import json

import geopandas as gpd
import pandas as pd

## Load and clean county shapefile

In [44]:
# Read the county shapefile from the census source directory
county_shapefile = "../../data/source/census/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"
counties_gdf = gpd.read_file(county_shapefile)

In [45]:
# Reproject to NAD 1983 Albers North America - https://epsg.io/102008
# (the area step that follows treats the projected areas as square meters)
albers_crs = "ESRI:102008"
counties_gdf = counties_gdf.to_crs(albers_crs)

In [46]:
# Convert square meters to square miles - https://www.metric-conversions.org/area/square-meters-to-square-miles.htm
sq_miles_per_sq_meter = 0.00000038610215855
counties_gdf["area"] = counties_gdf.geometry.area * sq_miles_per_sq_meter

In [47]:
# Call the GEOID column "fips" so it matches the key used by the FIPS tables later on
counties_gdf = counties_gdf.rename(columns={"GEOID": "fips"})

In [48]:
# Spot-check one random row
counties_gdf.sample(n=1)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,fips,NAME,LSAD,ALAND,AWATER,geometry,area
2821,19,193,465285,0500000US19193,19193,Woodbury,6,2260790985,12793704,"POLYGON ((-38499.979 302529.582, -35572.074 30...",877.744373


In [49]:
# Keep only the join key and the computed area
trim_columns = ["fips", "area"]
counties_gdf_trim = counties_gdf[trim_columns]

In [50]:
# Spot-check one random row
counties_gdf_trim.sample(n=1)

Unnamed: 0,fips,area
245,45005,412.624001


## Find neighbor counties

### Create the neighbors nested JSON file

In [16]:
# Start with an empty mapping; the loop in the next cell fills it in per county
neighbors = dict()

In [17]:
# For every county, record the FIPS codes of the counties whose geometry touches it.
for county in counties_gdf.itertuples():
    # Test all geometries against THIS county's geometry. Passing the whole
    # geometry column instead compares each row with itself, which never touches.
    neighbor_filter = counties_gdf[counties_gdf.touches(county.geometry)]
    # Key by this county's scalar FIPS code (a Series is not hashable) and read the
    # neighbors from the "fips" column, which is what GEOID was renamed to above.
    neighbors[county.fips] = neighbor_filter["fips"].tolist()

In [18]:
# Write the neighbors mapping as JSON into the current working directory.
# NOTE(review): the section below reads ../../data/handmade/neighbor-counties.json,
# not this file - confirm whether this output is meant to be moved there by hand.
with open("neighbor-counties.json", mode="w") as json_file:
    json.dump(neighbors, fp=json_file)

### Create the long neighbors CSV file

In [62]:
# Read the neighbors mapping (county fips -> list of neighbor fips) back from JSON.
# The context manager closes the file handle, and `neighbors` is only ever bound
# to the parsed mapping rather than first to the open file object.
with open("../../data/handmade/neighbor-counties.json") as infile:
    neighbors = json.load(infile)

In [63]:
# Stack one Series of neighbor codes per county into a single long frame;
# reset_index turns the (county key, position) index into ordinary columns
series_by_county = {fips: pd.Series(adjacent) for fips, adjacent in neighbors.items()}
neighbors = pd.concat(series_by_county).to_frame().reset_index()

In [64]:
# Discard the per-county position column, then give the two remaining columns readable names
neighbors = neighbors.drop(columns=["level_1"])
neighbors.columns = ["fips", "neighbor_fips"]

In [65]:
# Spot-check one random row
neighbors.sample(n=1)

Unnamed: 0,fips,neighbor_fips
0,2013,2016
1,2013,2164
2,2016,2013
3,28107,28071
4,28107,28143


In [66]:
# NOTE(review): the export is disabled, so running this notebook does not (re)write
# the long neighbors CSV; uncomment the line below to regenerate it.
# neighbors.to_csv("../../data/handmade/neighbor-counties.csv", index=False)

## Load and clean state and county FIPS code files

In [51]:
# Pipe-delimited file; dtype=str reads every column as text (presumably to keep codes intact)
state_fips = pd.read_csv('../../data/source/census/state_fips.txt', delimiter="|", dtype=str)

In [52]:
# Lower-case all column names
state_fips = state_fips.rename(columns=str.lower)

In [53]:
# Spot-check one random row
state_fips.sample(n=1)

Unnamed: 0,state,statefp,statens,state_name
54,PR,72,1779808,Puerto Rico


In [54]:
# Pipe-delimited file; dtype=str reads every column as text (presumably to keep codes intact)
county_fips = pd.read_csv('../../data/source/census/county_fips.txt', delimiter="|", dtype=str)

In [55]:
# Lower-case all column names
county_fips = county_fips.rename(columns=str.lower)

In [56]:
# Create the county FIPS code by concatenating the state code and the county code
state_part = county_fips["statefp"]
county_part = county_fips["countyfp"]
county_fips["fips"] = state_part + county_part

In [57]:
# Spot-check one random row
county_fips.sample(n=1)

Unnamed: 0,state,statefp,countyfp,countyns,countyname,classfp,funcstat,fips
54,AL,1,109,161581,Pike County,H1,A,1109


In [58]:
# Drop rows whose state abbreviation is one of the excluded codes listed here
excluded_states = ['AS', 'GU', 'MP', 'PR', 'UM', 'VI']
is_excluded = county_fips["state"].isin(excluded_states)
county_fips = county_fips.loc[~is_excluded]

In [59]:
# List the state abbreviations that remain after filtering
county_fips["state"].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [60]:
# Attach the area to each county record; the left join keeps every row of the FIPS table
reference_columns = ["state", "statefp", "countyname", "countyfp", "fips"]
counties = pd.merge(county_fips[reference_columns], counties_gdf_trim, how="left", on="fips")

In [61]:
# Spot-check one random row
counties.sample(n=1)

Unnamed: 0,state,statefp,countyname,countyfp,fips,area
0,AL,1,Autauga County,1,1001,604.374128
1,AL,1,Baldwin County,3,1003,1680.528617
2,AL,1,Barbour County,5,1005,904.46141
3,AL,1,Bibb County,7,1007,626.372236
4,AL,1,Blount County,9,1009,650.62003


## Save the county reference file

In [None]:
# Write the merged county table to the processed data directory, without the index column
reference_path = "../../data/processed/counties-reference.csv"
counties.to_csv(reference_path, index=False)