In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.geocoders import Nominatim
import time
import requests
import csv
import os

In [None]:
# CDR (Chemical Data Reporting) identifies where chemicals are handled, but not whether or not they are hazardous
# Using this dataset to help identify chemical and manufacturing activities in Georgia (note: only includes manufacturers that handle chemicals)
df_cdr_industrial = pd.read_csv("../../data/raw/scoring_indicators/2020_cdr_industrial_processing_and_use_information.csv", encoding="ISO-8859-1")

# HSI (Hazardous Site Inventory) contains a list of contaminated sites in Georgia that need to be cleaned up
# Use to identify contaminated sites in Georgia (includes landfills, superfund)
df_hazard_sites = pd.read_excel("../../data/raw/scoring_indicators/July-2024-Hazardous-Site-Inventory.xlsx")

# TRI (Toxic Release Inventory) contains how much toxic chemicals are released into the environment
# Use to identify sites in Georgia that release toxic chemicals
df_tri_waste_chemicals = pd.read_csv("../../data/raw/scoring_indicators/waste_hazardous_chemicals.csv")

# Food Access Research Atlas contains information on food access in Georgia
# Use to identify food deserts in Georgia
df_food = pd.read_csv("../../data/raw/scoring_indicators/food_access_research_atlas.csv")

In [None]:
# RCRA (Resource Conservation and Recovery Act) contains information on hazardous waste management
# Use to identify landfills, waste treatment, storage, and disposal facilities in Georgia
df_rcra_fac = pd.read_csv("../../data/raw/scoring_indicators/RCRA_FACILITIES.csv")
df_rcra_enf = pd.read_csv("../../data/raw/scoring_indicators/RCRA_ENFORCEMENTS.csv")
df_rcra_eval = pd.read_csv("../../data/raw/scoring_indicators/RCRA_EVALUATIONS.csv")
df_rcra_naics = pd.read_csv("../../data/raw/scoring_indicators/RCRA_NAICS.csv")
df_rcra_vios = pd.read_csv("../../data/raw/scoring_indicators/RCRA_VIOLATIONS.csv")
df_rcra_vios_hist = pd.read_csv("../../data/raw/scoring_indicators/RCRA_VIOSNC_HISTORY.csv")

In [None]:
# FRS (Facility Registry System) contains information on facilities and their industries 
# Use to further identify chemical and manufacturing activities in Georgia
df_frs_fac = pd.read_csv("../../data/raw/scoring_indicators/FRS_FACILITIES.csv")
df_frs_naics = pd.read_csv("../../data/raw/scoring_indicators/FRS_NAICS_CODES.csv")
df_frs_sic = pd.read_csv("../../data/raw/scoring_indicators/FRS_SIC_CODES.csv")
df_frs_program= pd.read_csv("../../data/raw/scoring_indicators/FRS_PROGRAM_LINKS.csv")


  df_frs_naics = pd.read_csv("../../data/raw/scoring_indicators/FRS_NAICS_CODES.csv")
  df_frs_sic = pd.read_csv("../../data/raw/scoring_indicators/FRS_SIC_CODES.csv")


In [None]:
# Wetlands Data
# Use to help identify excessive wetlands and possible flooding hazards in Georgia
gdf_wetland = gpd.read_file("../../data/raw/gpkg/GA_Wetlands_Geopackage.gpkg", layer="GA_Wetlands")

# Set the CRS to NAD83 / Conus Albers (EPSG:5070) 
gdf_wetland.set_crs(epsg=5070, inplace=True)

### Handling RCRA data

In [None]:
# Filtering to just Georgia locations
df_rcra_fac_ga = df_rcra_fac[df_rcra_fac['ACTIVITY_LOCATION'] == 'GA'].reset_index(drop=True)

In [None]:
# Joining in the NAICs codes to the RCRA facilities
df_rcra_facilities = pd.merge(df_rcra_fac_ga, df_rcra_naics, left_on='ID_NUMBER', right_on='ID_NUMBER', how='left')

In [None]:
# Dropping unnecessary columns
df_rcra_facilities = df_rcra_facilities.drop(['ACTIVITY_LOCATION_x', 'TRANSPORTER', 'ACTIVITY_LOCATION_y'], axis=1)

In [None]:
# Checking for duplicates
print(df_rcra_facilities.duplicated().sum())

67


In [None]:
# Dropping duplicates
df_rcra_facilities = df_rcra_facilities.drop_duplicates()

In [None]:
# Saving data to csv
df_rcra_facilities.to_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/rcra_facilities.csv", index=False)

### Handling FRS data

In [None]:
# Filtering to just Georgia locations
df_frs_fac_ga = df_frs_fac[df_frs_fac['FAC_STATE'] == 'GA'].reset_index(drop=True)
df_frs_program_ga = df_frs_program[df_frs_program['STATE_CODE'] == 'GA'].reset_index(drop=True)

In [None]:
# Making sure the REGISTRY_ID is a string for merging
df_frs_fac_ga['REGISTRY_ID_STR'] = df_frs_fac_ga['REGISTRY_ID'].astype(str)
df_frs_naics['REGISTRY_ID_STR'] = df_frs_naics['REGISTRY_ID'].astype(str)
df_frs_sic['REGISTRY_ID_STR'] = df_frs_sic['REGISTRY_ID'].astype(str)

In [None]:
# Merging datasets 
df_frs_facilities = pd.merge(df_frs_fac_ga, df_frs_naics, left_on='REGISTRY_ID_STR', right_on='REGISTRY_ID_STR', how='left')
df_frs_facilities_final = pd.merge(df_frs_facilities, df_frs_sic, left_on='REGISTRY_ID_STR', right_on='REGISTRY_ID_STR', how='left')

In [None]:
# Dropping unmatched columns
df_frs_facilities_final = df_frs_facilities_final.dropna(subset=['REGISTRY_ID_y', 'REGISTRY_ID'], how='all')

In [None]:
# Checking for duplicates
print(df_frs_facilities_final.duplicated().sum())

21715


In [None]:
# Dropping duplicates
df_frs_facilities_final = df_frs_facilities_final.drop_duplicates()

In [None]:
# Dropping unnecessary columns
df_frs_facilities_final = df_frs_facilities_final.drop(['PGM_SYS_ID_x', 'PGM_SYS_ACNRM_x', 'REGISTRY_ID_y', 'PGM_SYS_ID_y', 'PGM_SYS_ACNRM_y', 'REGISTRY_ID'], axis=1)

In [None]:
# Saving data to csv
df_frs_facilities_final.to_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/frs_facilities_naics_sic.csv", index=False)

### Handling CDR data

In [37]:
# Filtering to only needed columns 
df_cdr = df_cdr_industrial[['SITE NAME', 'SITE ADDRESS LINE1', 'SITE CITY', 'SITE COUNTY / PARISH', 'SITE POSTAL CODE', 'SITE STATE', 'SITE LATITUDE', 'SITE LONGITUDE', 'SITE NAICS CODE 1']].drop_duplicates()

In [38]:
# Filling in missing latitude and longitude points for df_cdr
geolocator = Nominatim(user_agent="geo_lookup")

# Function to get latitude and longitude
def get_lat_long(row):
    if (row['SITE LATITUDE'] == 0 and row['SITE LONGITUDE'] == 0) or pd.isna(row['SITE LATITUDE']) or pd.isna(row['SITE LONGITUDE']):
        address = f"{row['SITE ADDRESS LINE1']}, {row['SITE CITY']}, {row['SITE STATE']} {row['SITE POSTAL CODE']}, {row['SITE COUNTY / PARISH']}"
        try:
            location = geolocator.geocode(address, timeout=10)
            if location:
                return pd.Series([location.latitude, location.longitude])
        except Exception as e:
            print(f"Error geocoding {address}: {e}")
    return pd.Series([row['SITE LATITUDE'], row['SITE LONGITUDE']])

In [39]:
df_cdr[['SITE LATITUDE', 'SITE LONGITUDE']] = df_cdr.apply(get_lat_long, axis=1)


In [None]:
# Saving data to csv
df_cdr.to_csv("../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities/cdr_industrial_manufacturing_facilities.csv", index=False)

### Handling Wetland data

In [None]:
# Drop rows with empty or null geometries and remove invalid geometries.
gdf_wetland = gdf_wetland[~gdf_wetland['geometry'].is_empty & gdf_wetland['geometry'].notnull()]
gdf_wetland = gdf_wetland[gdf_wetland.is_valid]

# Renaming columns 
gdf_wetland = gdf_wetland.rename(columns={
    'WETLAND_TYPE': 'wetland_type',
    'ACRES': 'acres',
    'NWI_ID': 'wetland_id'
})

# Getting just needed columns 
gdf_wetland = gdf_wetland[['wetland_type', 'acres', 'wetland_id', 'geometry']]

# Saving data 
gdf_wetland.to_file("../../data/preprocessed/scoring_indicators/ga_wetlands_cleaned.gpkg", layer="GA_Wetlands", driver="GPKG")

### Google Maps API 

In [None]:
GOOGLE_API_KEY = ''

In [21]:
UNDESIRABLE_ACTIVITIES = {
    "auto_repair_station": {"google_type": "car_repair"},
    # "commercial_livestock": {"google_type": "farm"},
    "excessive_light": {"google_type": ["casino", "stadium", 'night_club']},
    "excessive_noise": {'google_type': ['airport']}, 
}

In [14]:
# Approximate Georgia box bounds 
LAT_MIN, LAT_MAX = 30.33, 35.00
LON_MIN, LON_MAX = -85.60, -80.75

STEP_LAT = 0.2
STEP_LON = 0.2

SEARCH_RADIUS = 10000

In [15]:
# Function makes a single API call to the Google Places Nearby Search endpoint 

def google_places_nearby_search(lat, lon, place_type, radius_m=SEARCH_RADIUS, page_token=None):
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius_m,
        "type": place_type,
        "key": GOOGLE_API_KEY
    }
    if page_token:
        # wait 2s if next_page_token is used
        params["pagetoken"] = page_token

    # parsinbg json response and returning it
    resp = requests.get(url, params=params).json()
    return resp

In [19]:
# Function handles multiple API calls to collect all available results for a grid point, managing pagination if the Google API returns a next_page_token 

def fetch_places_with_pagination(lat, lon, place_type, radius_m=SEARCH_RADIUS):
    """
    Calls google_places_nearby_search repeatedly to handle next_page_token,
    collecting up to 60 results. Returns a list of place dicts.
    """
    all_results = []
    page_token = None
    first_pass = True

    # Loop to repeatedly call google_places_nearby_search for each page (if multiple pages)
    while True:
        if page_token and not first_pass:
            time.sleep(2) 
        else:
            first_pass = False

        resp = google_places_nearby_search(lat, lon, place_type, radius_m, page_token)
        
        # Debugging: see the status field
        status = resp.get("status", "")
        results = resp.get("results", [])
        print(f"status={status}, lat={lat}, lon={lon}, type={place_type}, got {len(results)} results")

        all_results.extend(results)

        page_token = resp.get("next_page_token")
        if not page_token:
            break

    return all_results

In [None]:
# Function that uses steps (0.09° = ~9.9km stepping) + ~5km radius (10km diameter)
# Write results to CSV incrementally after processing each amenity at each grid point.

def rough_bulk_crawl_georgia():
    # Create output directory
    output_dir = "../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities"
    os.makedirs(output_dir, exist_ok=True)
    # File to save results
    csv_file = os.path.join(output_dir, "ga_undesirable_rough.csv")
    # File to track grid boxes and google_types that have been processed
    checkpoint_file = os.path.join(output_dir, "checkpoint_undesirable_activities.csv")

    
    # Set up the CSV file with headers
    fieldnames = [
        "amenity_key", "google_type", "place_id", "name",
        "lat", "lon", "types", "vicinity", "business_status"
    ]

    # Check if the results file already exists and write header only if it doesn't
    if not os.path.exists(csv_file):
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

    # Set up (or create) the checkpoint file and load already processed cells.
    processed_checkpoints = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r", newline="", encoding="utf-8") as cp:
            reader = csv.reader(cp)
            header = next(reader, None)
            for row in reader:
                if len(row) >= 3:
                    processed_checkpoints.add((row[0], row[1], row[2]))
    else:
        # If checkpoint file does not exist, create one with a header.
        with open(checkpoint_file, "w", newline="", encoding="utf-8") as cp:
            writer = csv.writer(cp)
            writer.writerow(["lat", "lon", "google_type"])


    
    # Track place_ids we've already processed to avoid duplicates
    processed_place_ids = set()

    
    lat = LAT_MIN
    while lat <= LAT_MAX + 1e-9:
        lon = LON_MIN
        while lon <= LON_MAX + 1e-9:
            
            lat_str = f"{lat:.3f}"
            lon_str = f"{lon:.3f}"
            print(f"Grid point lat={lat:.3f}, lon={lon:.3f}")
            
            # For each amenity in desirable dictionary
            for amenity_key, config in UNDESIRABLE_ACTIVITIES.items():
                gtypes = config.get("google_type", [])
                if isinstance(gtypes, str):
                    gtypes = [gtypes]
                
                # For each google_type, do a search
                for gtype in gtypes:
                    checkpoint_key = (lat_str, lon_str, gtype)
                    if checkpoint_key in processed_checkpoints:
                        print(f"Skipping {checkpoint_key} as it was already processed.")
                        continue
                    try:
                        raw_places = fetch_places_with_pagination(lat, lon, gtype, SEARCH_RADIUS)
                    
                        
                        # Prepare rows for writing
                        rows_to_write = []
                        for p in raw_places:
                            place_id = p.get("place_id", "")
                            
                            # Skip if already processed this place
                            if place_id in processed_place_ids:
                                continue
                                
                            # Add to processed set
                            processed_place_ids.add(place_id)
                            
                            # Create row dict
                            rows_to_write.append({
                                "amenity_key": amenity_key,
                                "google_type": gtype,
                                "place_id": place_id,
                                "name": p.get("name", ""),
                                "lat": p.get("geometry", {}).get("location", {}).get("lat"),
                                "lon": p.get("geometry", {}).get("location", {}).get("lng"),
                                "types": "|".join(p.get("types", [])),
                                "vicinity": p.get("vicinity", ""),
                                "business_status": p.get("business_status", "")
                            })
                        
                        # Append to CSV if we have rows to write
                        if rows_to_write:
                            with open(csv_file, "a", newline="", encoding="utf-8") as f:
                                writer = csv.DictWriter(f, fieldnames=fieldnames)
                                for row in rows_to_write:
                                    writer.writerow(row)
                            
                            print(f"Appended {len(rows_to_write)} places for {amenity_key}/{gtype} at lat={lat_str}, lon={lon_str}")
                        
                         # After processing log the grid cell and google_type to checkpont file
                        with open(checkpoint_file, "a", newline="", encoding="utf-8") as cp:
                            cp_writer = csv.writer(cp)
                            cp_writer.writerow([lat_str, lon_str, gtype])
                        # And add it to the in-memory set
                        processed_checkpoints.add(checkpoint_key)
                                           

                    except Exception as e:
                        print(f"Error fetching lat={lat_str}, lon={lon_str}, type={gtype}: {e}")
            
            lon += STEP_LON
        lat += STEP_LAT
    
    print(f"Finished crawling. Processed {len(processed_place_ids)} unique places total.")
    print(f"Data saved to {csv_file}")

In [22]:
if __name__ == "__main__":
    rough_bulk_crawl_georgia()
    print("Done with rough coverage of Georgia.")

Grid point lat=30.330, lon=-85.600
Skipping ('30.330', '-85.600', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.600', 'casino') as it was already processed.
Skipping ('30.330', '-85.600', 'stadium') as it was already processed.
Skipping ('30.330', '-85.600', 'night_club') as it was already processed.
Skipping ('30.330', '-85.600', 'airport') as it was already processed.
Grid point lat=30.330, lon=-85.400
Skipping ('30.330', '-85.400', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.400', 'casino') as it was already processed.
Skipping ('30.330', '-85.400', 'stadium') as it was already processed.
Skipping ('30.330', '-85.400', 'night_club') as it was already processed.
Skipping ('30.330', '-85.400', 'airport') as it was already processed.
Grid point lat=30.330, lon=-85.200
Skipping ('30.330', '-85.200', 'car_repair') as it was already processed.
Skipping ('30.330', '-85.200', 'casino') as it was already processed.
Skipping ('30.330', '-85.200', 