In [16]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic
import googlemaps
import requests
import time


In [17]:
# Import datasets
#
# Raw inputs are read from ../../data/raw/scoring_indicators; inputs that were already
# preprocessed are read from ../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities.

# HSI (Hazardous Site Inventory) contains a list of contaminated sites in Georgia that need to be cleaned up
# Use to identify contaminated sites in Georgia (includes landfills, superfund)
df_hsi = pd.read_excel("../../data/raw/scoring_indicators/July-2024-Hazardous-Site-Inventory.xlsx")

# TRI (Toxics Release Inventory) contains how much toxic chemicals are released into the environment
# Use to identify sites in Georgia that release toxic chemicals
df_tri = pd.read_csv("../../data/raw/scoring_indicators/waste_hazardous_chemicals.csv")


# Food Access Research Atlas contains information on food access in Georgia
# Use to identify food deserts in Georgia
# NOTE(review): df_food_deserts is not referenced by any other cell visible in this section.
df_food_deserts = pd.read_csv("../../data/raw/scoring_indicators/food_access_research_atlas.csv")

# Preprocessed CDR industrial/manufacturing facilities (per the file name).
# NOTE(review): CDR presumably refers to EPA Chemical Data Reporting — confirm.
df_cdr = pd.read_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/cdr_industrial_manufacturing_facilities.csv")

# Preprocessed FRS facilities with NAICS/SIC codes (per the file name).
# NOTE(review): FRS presumably refers to EPA Facility Registry Service — confirm.
# df_frs is not referenced by any other cell visible in this section.
df_frs = pd.read_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/frs_facilities_naics_sic.csv")

# Preprocessed RCRA facilities; a later cell derives undesirable activities from its FULL_ENFORCEMENT column.
df_rcra = pd.read_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/rcra_facilities.csv")

# Rough list of undesirable places; a later cell tags these rows with data_source 'google_places'.
df_undes_google_places = pd.read_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/ga_undesirable_rough.csv")

# Wetlands layer is currently disabled (kept for reference).
# gdf_wetland = gpd.read_file("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/ga_wetlands_cleaned.gpkg")

  df_frs = pd.read_csv("../../data/preprocessed/scoring_indicators/frs_facilities_naics_sic.csv")


In [19]:
# Standardize the HSI records to the shared site schema:
# tag the source/activity, rename the location columns, then keep only the schema columns.
# NOTE(review): the 'Lattitude' key (sic) presumably mirrors a misspelled header in the
# source workbook — confirm before "correcting" it, otherwise the rename silently no-ops.
df_hsi_clean = (
    df_hsi
    .assign(data_source='hsi', undesirable_activity='hazardous_inventory')
    .rename(columns={
        'Site Name': 'site_name',
        'Address': 'site_address',
        'City': 'site_city',
        'County': 'site_county',
        'Lattitude': 'site_latitude',
        'Longitude': 'site_longitude',
    })
)
df_hsi_final = df_hsi_clean[[
    'data_source',
    'site_name',
    'site_address',
    'site_city',
    'site_county',
    'site_latitude',
    'site_longitude',
    'undesirable_activity',
]]

In [20]:
# Standardize the TRI records to the shared site schema.
# Address, city and county are filled with None here so the frame still exposes
# every schema column.
df_tri_clean = (
    df_tri
    .assign(data_source='tri', undesirable_activity='chemical_activity')
    .rename(columns={
        'TRI Facility Name': 'site_name',
        'Latitude': 'site_latitude',
        'Longitude': 'site_longitude',
    })
    .assign(site_address=None, site_city=None, site_county=None)
)
df_tri_final = df_tri_clean[[
    'data_source',
    'site_name',
    'site_address',
    'site_city',
    'site_county',
    'site_latitude',
    'site_longitude',
    'undesirable_activity',
]]

In [21]:
# Filter and clean df_cdr
# Keep only facilities whose primary NAICS code starts with 31, 32 or 33
# (the NAICS manufacturing sectors), then map them to the shared site schema.
#
# The code column is cast to str before slicing: read_csv may infer a numeric or
# mixed dtype for it, in which case `.str` either raises (numeric column) or yields
# NaN for the non-string elements, silently dropping those rows from the filter.
# Missing codes become 'nan' -> 'na', which never matches, so they stay excluded.
df_cdr_filtered = (
    df_cdr[
        df_cdr['SITE NAICS CODE 1'].astype(str).str[:2].isin(['31', '32', '33'])
    ]
    # assign() returns a new frame, so the filtered slice is never mutated in place
    .assign(data_source='cdr', undesirable_activity='heavy_chemical_manufacturing')
    .rename(columns={
        'SITE NAME': 'site_name',
        'SITE ADDRESS LINE1': 'site_address',
        'SITE CITY': 'site_city',
        'SITE COUNTY / PARISH': 'site_county',
        'SITE LATITUDE': 'site_latitude',
        'SITE LONGITUDE': 'site_longitude'
    })
)
df_cdr_final = df_cdr_filtered[[
    'data_source', 'site_name', 'site_address', 'site_city',
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]


In [22]:
# Stack the HSI, TRI and CDR site tables into one frame with a fresh 0..n-1 index.
df_final_v1 = pd.concat(
    [df_hsi_final, df_tri_final, df_cdr_final],
    ignore_index=True,
)


In [23]:
# Mapping of letters to undesirable activities
enforcement_map = {
    'L': 'land_disposal',
    'I': 'incinerator',
    'B': 'industrial_furnace',
    'S': 'storage',
    'T': 'treatment',
    'H': 'solid_waste_management'
}


def map_enforcement(enforcement_str):
    """Translate a FULL_ENFORCEMENT code string into a comma-separated activity list.

    Each key of ``enforcement_map`` that occurs anywhere in ``enforcement_str``
    contributes its activity name; names are emitted in the mapping's key order,
    joined with ``', '``.

    Parameters
    ----------
    enforcement_str : str or missing value
        The enforcement code letters for one facility. Non-string values
        (e.g. NaN or None from a missing cell) are treated as "no codes".

    Returns
    -------
    str or None
        The joined activity names, or None when no letter matches or the
        input is not a string.
    """
    # A missing cell arrives as float NaN (or None); `letter in <float>` would
    # raise TypeError, so treat anything that is not a string as "no activity".
    if not isinstance(enforcement_str, str):
        return None

    matched = [
        activity
        for letter, activity in enforcement_map.items()
        if letter in enforcement_str
    ]
    # An empty join result is falsy, so unmatched strings also yield None.
    return ', '.join(matched) or None

# Derive the undesirable_activity label of every RCRA row from its FULL_ENFORCEMENT value
df_rcra['undesirable_activity'] = (
    df_rcra['FULL_ENFORCEMENT']
    .apply(map_enforcement)
)

In [24]:
# Keep only the RCRA facilities that were given an undesirable_activity label;
# copy so later column assignments do not touch df_rcra.
df_rcra_filtered = df_rcra.loc[
    df_rcra['undesirable_activity'].notna()
].copy()

In [25]:
# Standardize the labelled RCRA facilities to the shared site schema.
# site_county is filled with None here so the frame still exposes every schema column.
df_rcra_filtered = (
    df_rcra_filtered
    .assign(data_source='rcra')
    .rename(columns={
        'FACILITY_NAME': 'site_name',
        'STREET_ADDRESS': 'site_address',
        'CITY_NAME': 'site_city',
        'LATITUDE83': 'site_latitude',
        'LONGITUDE83': 'site_longitude',
    })
    .assign(site_county=None)
)

# Select the columns in the same order as the other standardized tables
df_rcra_final = df_rcra_filtered[[
    'data_source',
    'site_name',
    'site_address',
    'site_city',
    'site_county',
    'site_latitude',
    'site_longitude',
    'undesirable_activity',
]]

In [26]:
# Append the RCRA sites to the combined HSI/TRI/CDR table, renumbering the index.
df_final_v2 = pd.concat(
    [df_final_v1, df_rcra_final],
    ignore_index=True,
)

In [27]:
# Filter and clean df_undes_google_places
# Rows are kept when they are not 'commercial_livestock' and have a business_status.
# The filter and the new columns are built in a single chain so that every step
# returns a fresh frame: assigning columns directly onto a boolean-filtered frame
# is the chained-assignment pattern that triggers pandas' SettingWithCopyWarning.
df_undes_google_places_clean = (
    df_undes_google_places[
        (df_undes_google_places['amenity_key'] != 'commercial_livestock')
        & df_undes_google_places['business_status'].notna()
    ]
    # The amenity key doubles as the undesirable-activity label for this source
    .assign(
        data_source='google_places',
        undesirable_activity=lambda places: places['amenity_key'],
    )
    .rename(columns={
        'name': 'site_name',
        'vicinity': 'site_address',
        'lat': 'site_latitude',
        'lon': 'site_longitude'
    })
    # City and county are filled with None so the frame exposes every schema column
    .assign(site_city=None, site_county=None)
)

df_undes_google_places_final = df_undes_google_places_clean[[
    'data_source', 'site_name', 'site_address', 'site_city',
    'site_county', 'site_latitude', 'site_longitude', 'undesirable_activity'
]]



In [28]:
# Append the Google Places sites to produce the full table of undesirable sites
# (HSI, TRI, CDR, RCRA and Google Places), renumbering the index.
df_final = pd.concat(
    [df_final_v2, df_undes_google_places_final],
    ignore_index=True,
)

In [30]:
# Persist the combined table as CSV, without writing the row index.
df_final.to_csv(
    "../../data/processed/scoring_indicators/undesirable_hsi_tri_cdr_rcra_google_places.csv",
    index=False,
)