# checking GeoJSON data

In [1]:
# dependencies
import json
import pandas as pd

### Load GeoJSON data

In [2]:
# Read the GeoJSON file and flatten its features into a DataFrame (geo_df)
# with one row per feature.
with open("../../../Data/geojson-counties-fips.json", "r", encoding="utf-8") as f:
    geo_data = json.load(f)

features = geo_data["features"]

# build a list of dicts for each feature with the fields we care about
rows = []
for feature in features:
    # the feature id carries the FIPS code, e.g. "01001"
    fips_id = feature.get("id")

    # GeoJSON allows "properties" to be an explicit null; in that case
    # feature.get("properties", {}) would return None (the key exists), and the
    # .get() lookups below would raise AttributeError. Fall back to an empty
    # dict so such a feature yields a row of missing values instead.
    props = feature.get("properties") or {}

    rows.append({
        "fips_geo": fips_id,
        "GEO_ID": props.get("GEO_ID"),
        "STATE": props.get("STATE"),
        "COUNTY": props.get("COUNTY"),
        "NAME": props.get("NAME")
    })

# convert the list of dicts to a DataFrame in a single call
geo_df = pd.DataFrame(rows)

# display the first rows as a sanity check
geo_df.head()

Unnamed: 0,fips_geo,GEO_ID,STATE,COUNTY,NAME
0,1001,0500000US01001,1,1,Autauga
1,1009,0500000US01009,1,9,Blount
2,1017,0500000US01017,1,17,Chambers
3,1021,0500000US01021,1,21,Chilton
4,1033,0500000US01033,1,33,Colbert


### Load county FIPS data

In [4]:
# Load the county FIPS reference table. FIPS is forced to str so the codes
# are kept as text rather than being parsed as numbers.
counties_df = pd.read_csv(
    "../../../Data/fips_data.csv",
    dtype={"FIPS": str},
)

# preview the first rows
counties_df.head()

Unnamed: 0,FIPS,COUNTYNAME,STATE
0,1000,unidentified county,AL
1,1001,Autauga County,AL
2,1003,Baldwin County,AL
3,1005,Barbour County,AL
4,1007,Bibb County,AL


### Merge the county FIPS table with the GeoJSON table

In [5]:
# Left-join the GeoJSON rows onto the county table so every county FIPS row
# is kept whether or not a GeoJSON feature matches it. indicator=True adds a
# `_merge` column recording the match status of each row.
merged_df = pd.merge(
    counties_df,
    geo_df,
    how="left",
    left_on="FIPS",
    right_on="fips_geo",
    indicator=True,
)

In [7]:
# preview the merged table; the `_merge` column shows each row's match status
merged_df.head(n=5)

Unnamed: 0,FIPS,COUNTYNAME,STATE_x,fips_geo,GEO_ID,STATE_y,COUNTY,NAME,_merge
0,1000,unidentified county,AL,,,,,,left_only
1,1001,Autauga County,AL,1001.0,0500000US01001,1.0,1.0,Autauga,both
2,1003,Baldwin County,AL,1003.0,0500000US01003,1.0,3.0,Baldwin,both
3,1005,Barbour County,AL,1005.0,0500000US01005,1.0,5.0,Barbour,both
4,1007,Bibb County,AL,1007.0,0500000US01007,1.0,7.0,Bibb,both


In [13]:
# keep only the county rows that found no GeoJSON match in the left merge
left_only_mask = merged_df["_merge"] == "left_only"
unmatched_fips_df = merged_df.loc[left_only_mask]
unmatched_fips_df

Unnamed: 0,FIPS,COUNTYNAME,STATE_x,fips_geo,GEO_ID,STATE_y,COUNTY,NAME,_merge
0,01000,unidentified county,AL,,,,,,left_only
68,02000,unidentified county,AK,,,,,,left_only
74,02063,Chugach Census Area,AK,,,,,,left_only
75,02066,Copper River Census Area,AK,,,,,,left_only
80,02101,Anchorage Municipality to Bird Creek,AK,,,,,,left_only
...,...,...,...,...,...,...,...,...,...
3290,74300,Midway Islands,UM,,,,,,left_only
3291,78000,unidentified county,VI,,,,,,left_only
3292,78010,St. Croix Island,VI,,,,,,left_only
3293,78020,St. John Island,VI,,,,,,left_only


In [15]:
# distinct FIPS codes among the unmatched rows
unmatched_fips = unmatched_fips_df.loc[:, "FIPS"].unique()
unmatched_fips

array(['01000', '02000', '02063', '02066', '02101', '02155', '02158',
       '02181', '04000', '05000', '06000', '08000', '09000', '10000',
       '11000', '12000', '13000', '15000', '16000', '17000', '18000',
       '19000', '20000', '21000', '22000', '23000', '24000', '25000',
       '26000', '27000', '28000', '29000', '30000', '31000', '32000',
       '33000', '34000', '35000', '36000', '37000', '38000', '39000',
       '40000', '41000', '42000', '44000', '45000', '46000', '46102',
       '47000', '48000', '49000', '50000', '51000', '53000', '54000',
       '55000', '56000', '60000', '60010', '60020', '60030', '60040',
       '60050', '66000', '66010', '69000', '69085', '69100', '69110',
       '69120', '72000', '74000', '74300', '78000', '78010', '78020',
       '78030'], dtype=object)

In [16]:
# Drop the codes that end in "000" and keep every other unmatched code.
# `not ...` replaces the `== False` comparison; the result is the same.
bad_fips = [fip for fip in unmatched_fips if not str(fip).endswith("000")]
bad_fips

['02063',
 '02066',
 '02101',
 '02155',
 '02158',
 '02181',
 '46102',
 '60010',
 '60020',
 '60030',
 '60040',
 '60050',
 '66010',
 '69085',
 '69100',
 '69110',
 '69120',
 '74300',
 '78010',
 '78020',
 '78030']