### Poultry Processing Plant Geocoding

This notebook demonstrates the first step of a data pipeline, in which the street addresses of poultry processing sites are geocoded into latitude/longitude coordinates. For demonstration purposes only the first five rows of the establishment directory are geocoded; the result is then saved to a CSV file.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

from utils.geocode import geocode_address

In [3]:
# Directories for the raw input and the processed output, relative to the
# notebook's working directory.
RAW_DATA_PATH = "../data/raw/"
PROCESSED_DATA_PATH = "../data/processed/"

# Input spreadsheet to geocode and the CSV file the geocoded rows are written to.
DATA_PATH = os.path.join(RAW_DATA_PATH, "MPI_Directory_by_Establishment_Number.xlsx")
EXPORT_PATH = os.path.join(PROCESSED_DATA_PATH, "fsis_geocoded.csv")

In [4]:
# Load the establishment directory spreadsheet and summarise its columns,
# non-null counts and dtypes.
df = pd.read_excel(io=DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6919 entries, 0 to 6918
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   EstNumber            6919 non-null   object        
 1   Establishment ID     6919 non-null   int64         
 2   Company              6919 non-null   object        
 3   Street               6919 non-null   object        
 4   City                 6919 non-null   object        
 5   State                6919 non-null   object        
 6   Zip                  6919 non-null   int64         
 7   Phone                6058 non-null   object        
 8   GrantDate            6919 non-null   object        
 9   Activities           6919 non-null   object        
 10  DBAs                 2162 non-null   object        
 11  LatestMPIActiveDate  6919 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 648.8+ KB


  warn("""Cannot parse header or footer so it will be ignored""")


In [5]:
# Geocode each establishment's street address into latitude/longitude.
# Only the first five rows are processed here as an example.
geocoded_rows = []
for _, row in df[:5].iterrows():
    # Zip is loaded as int64, which drops leading zeros (e.g. 07206 becomes
    # 7206); pad back to five digits so the geocoder receives a valid ZIP code.
    zip_code = str(row.Zip).zfill(5)
    lat, lng = geocode_address(row.Street, row.City, row.State, zip_code)

    # Work on an explicit copy so the coordinates are attached to a new Series
    # rather than to the object handed out by iterrows().
    geocoded = row.copy()
    geocoded['lat'] = lat
    geocoded['lng'] = lng
    geocoded_rows.append(geocoded)

In [6]:
# Assemble the geocoded rows into a single frame (one row per establishment,
# with the added lat/lng columns) and preview it.
df_geocoded = pd.DataFrame(data=geocoded_rows)
df_geocoded.head()

Unnamed: 0,EstNumber,Establishment ID,Company,Street,City,State,Zip,Phone,GrantDate,Activities,DBAs,LatestMPIActiveDate,lat,lng
0,G1028,195,Papetti's Hygrade Egg Products Inc.,One Papetti Plaza,Elizabeth,NJ,7206,(908) 282-7900,12/05/2019,Egg Product,,2023-09-18,40.657042,-74.190242
1,G1105,126734,"American Egg Products, LLC",375 Pierce Industrial Blvd.,Blackshear,GA,31516,(912) 449-5700,09/22/2021,Egg Product,,2023-09-18,31.304123,-82.232523
2,G1126A,126639,Shepherds Processed Eggs,3502 West 6400 South,Spanish Fork,UT,84660,(801) 798-2593,01/28/2022,Egg Product,,2023-09-18,40.115087,-111.737468
3,G1126B,6164012,Shepherds Processed Eggs,3465 West 6400 South,Spanish Fork,UT,84660,,01/28/2022,Egg Product,,2023-09-18,40.11415,-111.737268
4,G1141,8204,"Wabash Valley Produce, Inc.",501 S. Chestnut,Farina,IL,62838,,03/04/2022,Egg Product,,2023-09-18,38.836207,-88.769368


In [7]:
# Make sure the output directory exists, then write the geocoded rows to CSV
# without the DataFrame index.
os.makedirs(name=PROCESSED_DATA_PATH, exist_ok=True)
df_geocoded.to_csv(path_or_buf=EXPORT_PATH, index=False)