In [54]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic
import googlemaps
import requests
import time
import csv
import os


In [None]:
# Google Maps Platform API key. Read from the GOOGLE_API_KEY environment
# variable so the secret is never saved inside the notebook; falls back to an
# empty string (the previous hard-coded value) when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

In [56]:
# # Approximate fulton county box bounds 
# LAT_MIN, LAT_MAX = 33.45, 34.19
# LON_MIN, LON_MAX = -84.85, -84.10

# # Approximate dekalb county box bounds 
# LAT_MIN, LAT_MAX = 33.60, 34.00
# LON_MIN, LON_MAX = -84.30, -84.00

# # Approximate Gwinnett county box bounds
# LAT_MIN, LAT_MAX = 33.7, 34.10
# LON_MIN, LON_MAX = -84.25, -83.75

# Approximate cobb county box bounds 
# LAT_MIN, LAT_MAX = 33.70, 34.10
# LON_MIN, LON_MAX = -84.75, -84.35


# Bounding boxes (lat_min, lat_max, lon_min, lon_max) that is_in_processed_area()
# checks; grid points falling inside any of them are skipped by the crawl.
processed_areas = [
    dict(zip(("lat_min", "lat_max", "lon_min", "lon_max"), bounds))
    for bounds in (
        (33.45, 34.19, -84.85, -84.10),  # Fulton
        (33.60, 34.00, -84.30, -84.00),  # Dekalb
        (33.70, 34.10, -84.25, -83.75),  # Gwinnett
        (33.70, 34.10, -84.75, -84.35),  # Cobb
        (30.26, 30.60, -85.70, -81.40),  # unlabeled in the original -- TODO confirm what this strip covers
    )
]

# Approximate Georgia bounding box (degrees)
LAT_MIN = 30.33
LAT_MAX = 35.00
LON_MIN = -85.60
LON_MAX = -80.75

# Grid spacing in degrees, identical in both directions
STEP_LAT = STEP_LON = 0.08

# Nearby Search radius in meters
SEARCH_RADIUS = 5000

In [57]:
def is_in_processed_area(lat, lon, areas=None):
    """Return True if the point lies inside any bounding box in ``areas``.

    Args:
        lat: Latitude of the point, in degrees.
        lon: Longitude of the point, in degrees.
        areas: Optional iterable of dicts with the keys ``lat_min``,
            ``lat_max``, ``lon_min`` and ``lon_max``. Defaults to the
            module-level ``processed_areas`` list, looked up at call time.

    Returns:
        bool: True when the point is within at least one box. Box edges are
        inclusive. False when ``areas`` is empty.
    """
    if areas is None:
        areas = processed_areas
    return any(
        area["lat_min"] <= lat <= area["lat_max"]
        and area["lon_min"] <= lon <= area["lon_max"]
        for area in areas
    )

In [59]:
# Desirable amenities, keyed by amenity name.
# Each entry has:
#   "group"             - the scoring group the desirable activity belongs to
#   "google_type"       - Google place types; the crawler issues one Nearby Search per type
#   "name_contains"     - (optional) place name must contain at least one of these substrings (case-insensitive)
#   "name_not_contains" - (optional) place is dropped if its name contains any of these substrings (case-insensitive)
#   "type_not_contains" - (optional) place is dropped if any of these appear in its Google types
# The custom filters for some amenities are based on the QAP definitions and are
# applied by passes_filters().
DESIRABLE_AMENITIES = {
    "national_big_box_store": {
        "group": 1,
        "google_type": ["department_store"],
        "name_contains": ["Walmart", "Target", "Costco", "BJ's", "Sam's Club"]
    },
    "retail_store": {
        "group": 2,
        "google_type": ["clothing_store", "home_goods_store"]
    },
    "grocery_store": {
        "group": 1,
        "google_type": ["grocery_or_supermarket", "supermarket"],
        "type_not_contains": ["convenience_store", "gas_station"]
    },
    "restaurant": {
        "group": 2,
        "google_type": ["restaurant"]
    },
    "hospital": {
        "group": 1,
        "google_type": ["hospital"],
        "name_not_contains": ["Outpatient"]
    },
    "medical_clinic": {
        "group": 1,
        "google_type": ["doctor"],
        "name_contains": ["urgent care", "medical clinic", "immediate care", "physicians", "dentist"]
    },
    "pharmacy": {
        "group": 1,
        "google_type": ["pharmacy"],
        "type_not_contains": ['veterinary_care']
    },
    "technical_college": {
        "group": 2,
        "google_type": ["university"]
    },
    "school": {
        "group": 1,
        "google_type": ["primary_school", "secondary_school"]
    },
    "town_square": {
        "group": 1,
        "google_type": ["city_hall", "courthouse"]
    },
    # Disabled: community centers would need OR-style matching on type or name
    # ("google_type_or" / "name_contains_or"), which is not currently applied.
    # "community_center": {
    #     "group": 1,
    #     "google_type": [],
    #     "google_type_or": ["community_center", "gym", "pool"],
    #     "name_contains_or": ["ymca", "boys & girls club", "public pool", "senior center", "multipurpose facility"]
    # },
    "public_park": {
        "group": 1,
        "google_type": ["park"]
    },
    "library": {
        "group": 1,
        "google_type": ["library"]
    },
    "fire_police_station": {
        "group": 2,
        "google_type": ["fire_station", "police"]
    },
    "bank": {
        "group": 2,
        "google_type": ["bank"]
    },
    "place_of_worship": {
        "group": 2,
        "google_type": ["place_of_worship"]
    },
    "post_office": {
        "group": 2,
        "google_type": ["post_office"]
    },
}

In [60]:
# Makes a single API call to the Google Places Nearby Search endpoint.

def google_places_nearby_search(lat, lon, place_type, radius_m=SEARCH_RADIUS, page_token=None, timeout=30):
    """Issue one Nearby Search request and return the decoded JSON response.

    Args:
        lat: Latitude of the search center, in degrees.
        lon: Longitude of the search center, in degrees.
        place_type: Google place type to search for (sent as ``type``).
        radius_m: Search radius in meters (sent as ``radius``).
        page_token: Optional ``next_page_token`` from a previous response;
            sent as ``pagetoken`` when truthy. This function does not wait
            before using the token; any pause is the caller's job.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang a long crawl indefinitely.

    Returns:
        dict: The parsed JSON body of the response.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius_m,
        "type": place_type,
        "key": GOOGLE_API_KEY
    }
    if page_token:
        params["pagetoken"] = page_token

    resp = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP-level failures instead of trying to parse an error page.
    resp.raise_for_status()
    return resp.json()


In [61]:
# Handles multiple API calls to collect all available results for a grid point,
# following next_page_token while the Google API keeps returning one.

def fetch_places_with_pagination(lat, lon, place_type, radius_m=SEARCH_RADIUS):
    """
    Calls google_places_nearby_search repeatedly to handle next_page_token,
    collecting up to 60 results. Returns a list of place dicts.

    A 2 second pause is inserted before every request that uses a page token.
    If such a request still comes back as INVALID_REQUEST (the token is not
    valid yet), it is retried a few times before giving up.

    Raises:
        RuntimeError: if a page comes back with a status other than "OK" or
            "ZERO_RESULTS" (e.g. REQUEST_DENIED, OVER_QUERY_LIMIT). Raising
            instead of returning an empty list keeps callers from treating a
            failed request as "no places here".
    """
    max_token_retries = 3

    all_results = []
    page_token = None
    token_retries = 0

    # Loop to repeatedly call google_places_nearby_search for each page (if multiple pages)
    while True:
        if page_token:
            # A freshly issued next_page_token needs a short delay before it works.
            time.sleep(2)

        resp = google_places_nearby_search(lat, lon, place_type, radius_m, page_token)

        # Debugging: see the status field
        status = resp.get("status", "")
        results = resp.get("results", [])
        print(f"status={status}, lat={lat}, lon={lon}, type={place_type}, got {len(results)} results")

        # The token may still not be active after the pause; try the same page again.
        if page_token and status == "INVALID_REQUEST" and token_retries < max_token_retries:
            token_retries += 1
            continue

        if status not in ("OK", "ZERO_RESULTS"):
            error_message = resp.get("error_message", "")
            raise RuntimeError(
                f"Places API returned status={status!r} for lat={lat}, lon={lon}, "
                f"type={place_type}: {error_message}"
            )

        all_results.extend(results)
        token_retries = 0

        page_token = resp.get("next_page_token")
        if not page_token:
            break

    return all_results

In [62]:
# Decides whether a place returned by the API should be kept, based on the filters defined for its amenity in DESIRABLE_AMENITIES
def passes_filters(place, config, requested_type):
    """Return True if ``place`` satisfies every filter in ``config``.

    Args:
        place: A place dict from the Google Places API; only its "name" and
            "types" entries are read.
        config: The amenity config. "google_type" is required; the optional
            keys are "name_contains", "name_not_contains" and
            "type_not_contains".
        requested_type: The google_type that was asked for in the search.
            Accepted for the caller's convenience; not used by the checks.

    Returns:
        bool: False as soon as any filter rejects the place, True otherwise.
    """
    name_lc = place.get("name", "").lower()
    types_found = place.get("types", [])

    # Name blacklist: any forbidden fragment (case-insensitive) rejects the place.
    if any(frag.lower() in name_lc for frag in config.get("name_not_contains", ())):
        return False

    # Name whitelist: when non-empty, at least one fragment must be present.
    wanted_fragments = config.get("name_contains")
    if wanted_fragments and not any(frag.lower() in name_lc for frag in wanted_fragments):
        return False

    # Type blacklist: reject if the place carries any excluded Google type.
    if any(banned in types_found for banned in config.get("type_not_contains", ())):
        return False

    # Note: OR-style matching for community centers ("google_type_or" /
    # "name_contains_or") is intentionally not applied here.

    # Finally, the place must actually carry one of the amenity's Google types.
    return bool(set(config["google_type"]) & set(types_found))

In [None]:
# Crawls a grid with 0.08° steps (~8.8km stepping) using a ~5km search radius (10km diameter).
# Results are written to CSV incrementally after processing each amenity type at each grid point.

def rough_bulk_crawl_georgia():
    """Crawl the Georgia bounding box and append matching places to a CSV.

    Walks a lat/lon grid from (LAT_MIN, LON_MIN) to (LAT_MAX, LON_MAX) in
    STEP_LAT / STEP_LON increments, skipping points inside ``processed_areas``.
    For every remaining point and every google_type in DESIRABLE_AMENITIES it
    fetches nearby places, keeps those that pass the amenity's filters, and
    appends places not seen before (by place_id) to the results CSV.

    The crawl is resumable:
      * each finished (lat, lon, google_type) combination is appended to a
        checkpoint CSV and skipped on later runs;
      * place_ids already present in the results CSV are loaded at start-up,
        so resuming does not append duplicates of previously saved places.

    If fetching or writing raises for a combination, the error is printed, the
    combination is NOT checkpointed, and the crawl moves on.

    Returns:
        None. Output goes to the CSV files and to stdout.
    """
    # Create output directory
    output_dir = "../../data/preprocessed/scoring_indicators/DesirableUndesirableActivities"
    os.makedirs(output_dir, exist_ok=True)
    # File to save results
    csv_file = os.path.join(output_dir, "ga_desirable_rough.csv")
    # File to track grid points and google_types that have been processed
    checkpoint_file = os.path.join(output_dir, "checkpoint_desirable_activities.csv")

    fieldnames = [
        "amenity_key", "google_type", "place_id", "name",
        "lat", "lon", "types", "vicinity", "business_status"
    ]

    # Write the results header only when the file does not exist yet
    if not os.path.exists(csv_file):
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            csv.DictWriter(f, fieldnames=fieldnames).writeheader()

    # Load already processed (lat, lon, google_type) keys, or create the checkpoint file
    processed_checkpoints = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r", newline="", encoding="utf-8") as cp:
            reader = csv.reader(cp)
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) >= 3:
                    processed_checkpoints.add((row[0], row[1], row[2]))
    else:
        with open(checkpoint_file, "w", newline="", encoding="utf-8") as cp:
            csv.writer(cp).writerow(["lat", "lon", "google_type"])

    # Track place_ids we've already saved to avoid duplicates. Seed the set
    # from the existing results file so a resumed run does not re-append
    # places written by an earlier run.
    processed_place_ids = set()
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        for saved_row in csv.DictReader(f):
            saved_id = saved_row.get("place_id")
            if saved_id is not None:
                processed_place_ids.add(saved_id)

    # The grid is advanced by repeated addition (not index * step) on purpose:
    # the checkpoint keys are derived from these exact values.
    lat = LAT_MIN
    while lat <= LAT_MAX + 1e-9:
        lon = LON_MIN
        while lon <= LON_MAX + 1e-9:
            if is_in_processed_area(lat, lon):
                # Skip processing this grid point
                lon += STEP_LON
                continue

            lat_str = f"{lat:.3f}"
            lon_str = f"{lon:.3f}"
            print(f"Grid point lat={lat:.3f}, lon={lon:.3f}")

            # For each amenity in desirable dictionary
            for amenity_key, config in DESIRABLE_AMENITIES.items():
                gtypes = config.get("google_type", [])
                if isinstance(gtypes, str):
                    gtypes = [gtypes]

                # For each google_type, do a search
                for gtype in gtypes:
                    checkpoint_key = (lat_str, lon_str, gtype)
                    if checkpoint_key in processed_checkpoints:
                        print(f"Skipping {checkpoint_key} as it was already processed.")
                        continue
                    try:
                        raw_places = fetch_places_with_pagination(lat, lon, gtype, SEARCH_RADIUS)

                        # Build rows for places that pass the filters and are new
                        rows_to_write = []
                        new_place_ids = set()
                        for p in raw_places:
                            if not passes_filters(p, config, gtype):
                                continue

                            place_id = p.get("place_id", "")
                            if place_id in processed_place_ids or place_id in new_place_ids:
                                continue
                            new_place_ids.add(place_id)

                            location = p.get("geometry", {}).get("location", {})
                            rows_to_write.append({
                                "amenity_key": amenity_key,
                                "google_type": gtype,
                                "place_id": place_id,
                                "name": p.get("name", ""),
                                "lat": location.get("lat"),
                                "lon": location.get("lng"),
                                "types": "|".join(p.get("types", [])),
                                "vicinity": p.get("vicinity", ""),
                                "business_status": p.get("business_status", "")
                            })

                        # Append to CSV if we have rows to write
                        if rows_to_write:
                            with open(csv_file, "a", newline="", encoding="utf-8") as f:
                                csv.DictWriter(f, fieldnames=fieldnames).writerows(rows_to_write)

                            # Only mark ids as seen once they are safely on disk
                            processed_place_ids.update(new_place_ids)
                            print(f"Appended {len(rows_to_write)} places for {amenity_key}/{gtype} at lat={lat_str}, lon={lon_str}")

                        # After processing, log the grid point and google_type to the checkpoint file
                        with open(checkpoint_file, "a", newline="", encoding="utf-8") as cp:
                            csv.writer(cp).writerow([lat_str, lon_str, gtype])
                        # And add it to the in-memory set
                        processed_checkpoints.add(checkpoint_key)

                    except Exception as e:
                        # Best effort: report and continue; the combination stays
                        # un-checkpointed so a later run retries it.
                        print(f"Error fetching lat={lat_str}, lon={lon_str}, type={gtype}: {e}")

            lon += STEP_LON
        lat += STEP_LAT

    # The count includes place_ids loaded from a pre-existing results file.
    print(f"Finished crawling. Processed {len(processed_place_ids)} unique places total.")
    print(f"Data saved to {csv_file}")


In [70]:
# Entry point: run the full grid crawl, then print a completion message.
# The crawl issues many Places API requests; finished work is skipped on
# re-runs via the checkpoint file.
if __name__ == "__main__":
    rough_bulk_crawl_georgia()
    print("Done with rough coverage of Georgia.")

Grid point lat=30.330, lon=-81.360
Skipping ('30.330', '-81.360', 'department_store') as it was already processed.
Skipping ('30.330', '-81.360', 'clothing_store') as it was already processed.
Skipping ('30.330', '-81.360', 'home_goods_store') as it was already processed.
Skipping ('30.330', '-81.360', 'grocery_or_supermarket') as it was already processed.
Skipping ('30.330', '-81.360', 'supermarket') as it was already processed.
Skipping ('30.330', '-81.360', 'restaurant') as it was already processed.
Skipping ('30.330', '-81.360', 'hospital') as it was already processed.
Skipping ('30.330', '-81.360', 'doctor') as it was already processed.
Skipping ('30.330', '-81.360', 'pharmacy') as it was already processed.
Skipping ('30.330', '-81.360', 'university') as it was already processed.
Skipping ('30.330', '-81.360', 'primary_school') as it was already processed.
Skipping ('30.330', '-81.360', 'secondary_school') as it was already processed.
Skipping ('30.330', '-81.360', 'city_hall') as