# Hurricane Social Media Analysis Pipeline
## Complete Self-Contained Notebook for ArcGIS Pro

This notebook contains all code needed to process hurricane-related tweets and generate time-sliced raster datasets.

**Features:**
- Multi-level geographic matching (STATE/COUNTY/CITY/FACILITY)
- Kernel Density Estimation (KDE) for city points
- Hierarchical weighted rasterization
- Time-binned incremental and cumulative outputs
- Full GeoTIFF export for ArcGIS Pro visualization

**Requirements:**
- geopandas
- rasterio
- fuzzywuzzy
- scipy
- numpy
- pandas
- matplotlib (optional; only needed for the Quick Visualization cell at the end)

---
## Cell 1: Configuration

Modify these paths and parameters as needed:

In [3]:
import os
from typing import Dict

# Base Path Discovery
# The project root defaults to the original development checkout. Set the
# TWEET_PROJECT_ROOT environment variable to run the notebook from another
# location without editing this cell.
LOCAL_PATH = os.environ.get("TWEET_PROJECT_ROOT") or r"C:\users\colto\documents\github\tweet_project"

# Core Data Directories (all derived from LOCAL_PATH)
DATA_DIR = os.path.join(LOCAL_PATH, "data")
GEOJSON_DIR = os.path.join(DATA_DIR, "geojson")
SHAPE_FILES_DIR = os.path.join(DATA_DIR, "shape_files")
OUTPUT_DIR = os.path.join(LOCAL_PATH, "rasters_output")

# Input File Paths
FRANCINE_PATH = os.path.join(GEOJSON_DIR, "francine.geojson")
HELENE_PATH = os.path.join(GEOJSON_DIR, "helene.geojson")
STATES_PATH = os.path.join(SHAPE_FILES_DIR, "cb_2023_us_state_20m.shp")
COUNTIES_PATH = os.path.join(SHAPE_FILES_DIR, "cb_2023_us_county_20m.shp")
CITIES_PATH = os.path.join(SHAPE_FILES_DIR, "US_Cities.shp")

# Raster/Projection Settings
TARGET_CRS = "EPSG:3857"  # Web Mercator
CELL_SIZE_M = 1000  # 1km cells

# Hierarchical Weights (relative influence by admin level)
WEIGHTS: Dict[str, int] = {
    "STATE": 2,
    "COUNTY": 5,
    "CITY": 10,
    "FACILITY": 10,
}

# Fuzzy Matching & Time Binning
FUZZY_THRESHOLD = 75  # Global matching threshold (0-100)
FUZZY_THRESHOLD_CONTEXTUAL = 70  # Contextual (within-state) threshold
TIME_BIN_HOURS = 4  # Temporal bin width

print("Configuration loaded successfully")
print(f"Project root: {LOCAL_PATH}")
print(f"Output directory: {OUTPUT_DIR}")

Configuration loaded successfully
Project root: C:\users\colto\documents\github\tweet_project
Output directory: C:\users\colto\documents\github\tweet_project\rasters_output


---
## Cell 2: Import Dependencies

In [4]:
import warnings
import re
from typing import Any, Dict, Iterable, List, Optional, Tuple
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.transform import from_bounds
from rasterio.features import rasterize
from scipy.ndimage import gaussian_filter
from fuzzywuzzy import fuzz, process

# NOTE(review): this silences every warning category for the rest of the kernel
# session, not only the noisy ones; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Only reached when every import above succeeded.
print("All dependencies imported successfully")

<class 'ModuleNotFoundError'>: No module named 'geopandas'

---
## Cell 3: Data Loading Functions

In [None]:
def _add_time_columns(gdf: gpd.GeoDataFrame) -> None:
    """
    Add the derived time columns to ``gdf`` in place.

    Columns written:
        timestamp       -- the "time" column parsed as timezone-aware UTC datetimes
        time_bin        -- ``timestamp`` floored to TIME_BIN_HOURS-hour bins
        unix_timestamp  -- ``time_bin`` as whole seconds since the Unix epoch
        bin_label       -- ``time_bin`` formatted as YYYYMMDD_HHMM
    """
    gdf["timestamp"] = pd.to_datetime(gdf["time"], utc=True)
    gdf["time_bin"] = gdf["timestamp"].dt.floor(f"{TIME_BIN_HOURS}h")

    # Derive seconds from a timedelta rather than from the raw integer
    # representation: dividing the (nanosecond) int64 view by 1000 yields
    # microseconds, not Unix seconds, and depends on the storage resolution.
    epoch = pd.Timestamp("1970-01-01", tz="UTC")
    gdf["unix_timestamp"] = (gdf["time_bin"] - epoch) // pd.Timedelta(seconds=1)

    gdf["bin_label"] = gdf["time_bin"].dt.strftime("%Y%m%d_%H%M")


def load_hurricane_data() -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Load hurricane GeoJSON files and standardize timestamps, time bins, and labels.

    Reads FRANCINE_PATH and HELENE_PATH, then adds "timestamp", "time_bin",
    "unix_timestamp" (seconds since the Unix epoch) and "bin_label" columns to
    each frame.

    Returns:
        (francine_gdf, helene_gdf)
    """
    print("=" * 60)
    print("LOADING HURRICANE DATA")
    print("=" * 60)

    print(f"\nLoading Francine data from: {FRANCINE_PATH}")
    francine_gdf = gpd.read_file(FRANCINE_PATH)

    print(f"Loading Helene data from: {HELENE_PATH}")
    helene_gdf = gpd.read_file(HELENE_PATH)

    print("\nStandardizing timestamps to UTC...")
    print(f"Grouping data into {TIME_BIN_HOURS}-hour bins...")
    for gdf in (francine_gdf, helene_gdf):
        _add_time_columns(gdf)

    print(f"\nLoaded {len(francine_gdf)} Francine tweets")
    print(f"Loaded {len(helene_gdf)} Helene tweets")

    return francine_gdf, helene_gdf


def load_reference_shapefiles() -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Read the state, county, and city reference layers from disk.

    Returns:
        (states_gdf, counties_gdf, cities_gdf), read from STATES_PATH,
        COUNTIES_PATH and CITIES_PATH respectively.
    """
    print("\nLoading reference shapefiles...")

    sources = (
        ("states", STATES_PATH),
        ("counties", COUNTIES_PATH),
        ("cities", CITIES_PATH),
    )

    layers = []
    for label, path in sources:
        layer = gpd.read_file(path)
        print(f"  Loaded {len(layer)} {label}")
        layers.append(layer)

    states_gdf, counties_gdf, cities_gdf = layers
    return states_gdf, counties_gdf, cities_gdf


def create_timestamp_dictionaries(
    francine_gdf: gpd.GeoDataFrame, helene_gdf: gpd.GeoDataFrame
) -> Tuple[Dict[int, pd.Timestamp], Dict[int, pd.Timestamp]]:
    """
    Build one ``unix_timestamp -> time_bin`` dictionary per hurricane frame.

    Rows sharing a "unix_timestamp" collapse to a single entry (the last row wins).
    """
    def _bin_lookup(frame: Any) -> Dict[int, pd.Timestamp]:
        # Pair each row's integer key with its bin start.
        return {
            stamp: bin_start
            for stamp, bin_start in zip(frame["unix_timestamp"], frame["time_bin"])
        }

    return _bin_lookup(francine_gdf), _bin_lookup(helene_gdf)


def get_time_bins(gdf: gpd.GeoDataFrame) -> List[int]:
    """
    Return the distinct "unix_timestamp" values of ``gdf`` in ascending order.
    """
    ordered_bins = list(gdf["unix_timestamp"].unique())
    ordered_bins.sort()
    return ordered_bins


# Progress marker: confirms this cell's function definitions were executed.
print("Data loading functions defined")

---
## Cell 4: Geographic Matching Functions

In [None]:
def preprocess_place_name(name: Any) -> Optional[str]:
    """
    Standardize a place name for matching.

    The name is upper-cased, common abbreviations are expanded (ST -> SAINT,
    MT -> MOUNT, FT -> FORT, N/S/E/W -> NORTH/SOUTH/EAST/WEST), punctuation is
    removed and whitespace is collapsed.

    Args:
        name: Raw place name; may be None/NaN or a non-string value.

    Returns:
        The normalized name, or None when the input is missing, empty, or the
        literal text "nan" in any letter case.
    """
    if name is None or pd.isna(name):
        return None

    text = str(name).upper().strip()

    # Checked after upper-casing so that "nan", "NaN" and "NAN" (what str()
    # produces for a missing value) are all treated as missing.
    if text in ("", "NAN"):
        return None

    # Abbreviation expansions. The lookbehind refuses a match directly after an
    # apostrophe so a possessive such as "MARTHA'S" is not expanded to
    # "MARTHASOUTH"; the lookahead plays the role of a trailing word boundary
    # while still allowing the optional dot to be consumed.
    expansions = (
        ("ST", "SAINT"),
        ("MT", "MOUNT"),
        ("FT", "FORT"),
        ("N", "NORTH"),
        ("S", "SOUTH"),
        ("E", "EAST"),
        ("W", "WEST"),
    )
    for abbreviation, expansion in expansions:
        text = re.sub(rf"(?<![\w'’]){abbreviation}\.?(?!\w)", expansion, text)

    # Remove punctuation; collapse whitespace
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip() or None


def parse_gpe_entities(gpe_string: Any) -> List[Optional[str]]:
    """
    Split a free-text GPE field into normalized, de-duplicated place names.

    The text is split on commas and then on ';', '&' and '|'. Fragments of one
    character or less are dropped, the rest are passed through
    preprocess_place_name, and falsy results are discarded. First-seen order
    is preserved.
    """
    if not gpe_string or pd.isna(gpe_string) or str(gpe_string).strip() == "":
        return []

    text = str(gpe_string).strip()

    fragments: List[str] = []
    for chunk in text.split(","):
        chunk = chunk.strip()
        if not chunk:
            continue
        fragments.extend(piece.strip() for piece in re.split(r"[;&|]", chunk))

    normalized = [
        preprocess_place_name(fragment)
        for fragment in fragments
        if len(fragment) > 1
    ]

    # dict.fromkeys keeps the first occurrence of each name, in order.
    return list(dict.fromkeys(entity for entity in normalized if entity))


def create_hierarchical_lookups(
    states_gdf: gpd.GeoDataFrame,
    counties_gdf: gpd.GeoDataFrame,
    cities_gdf: gpd.GeoDataFrame,
) -> Dict[str, Any]:
    """
    Create hierarchical lookup dictionaries for fuzzy matching.

    All name keys are normalized with preprocess_place_name.

    Returns a dict with:
        state_lookup          -- state name AND state abbreviation -> geometry
        county_lookup         -- county name -> geometry
        city_lookup           -- city name -> point (centroid when available)
        county_by_state       -- state name -> {county name -> geometry}
        city_by_state         -- state name -> {city name -> point}
        state_abbrev_to_name  -- "STUSPS" abbreviation -> state name
        state_name_to_abbrev  -- state name -> "STUSPS" abbreviation

    Note: county_lookup and city_lookup are keyed by name only, so places that
    share a name across states overwrite each other (the last row wins). The
    per-state dictionaries do not have that limitation.
    """
    print("\nCreating hierarchical lookup dictionaries...")

    # States
    state_lookup: Dict[str, Any] = {}
    state_abbrev_to_name: Dict[str, str] = {}
    state_name_to_abbrev: Dict[str, str] = {}
    # STATEFP -> normalized state name. Built once so the county loop below does
    # not rescan every state row for every county. setdefault keeps the first
    # state seen for a given code.
    state_name_by_fips: Dict[Any, Optional[str]] = {}

    for _, row in states_gdf.iterrows():
        state_name = preprocess_place_name(row["NAME"])
        if "STATEFP" in row:
            state_name_by_fips.setdefault(row["STATEFP"], state_name)
        if state_name:
            state_lookup[state_name] = row.geometry
            if "STUSPS" in row:
                abbrev = str(row["STUSPS"]).upper()
                state_abbrev_to_name[abbrev] = state_name
                state_name_to_abbrev[state_name] = abbrev
                state_lookup[abbrev] = row.geometry

    # Counties
    county_by_state: Dict[str, Dict[str, Any]] = {}
    county_lookup: Dict[str, Any] = {}

    for _, row in counties_gdf.iterrows():
        county_name = preprocess_place_name(row["NAME"])
        if not county_name:
            continue

        county_lookup[county_name] = row.geometry

        # Prefer an explicit state name column; otherwise resolve the parent
        # state through the FIPS code. A county without a resolvable state is
        # still available through the global county_lookup.
        if "STATE_NAME" in row:
            state_name = preprocess_place_name(row["STATE_NAME"])
        else:
            state_name = state_name_by_fips.get(row.get("STATEFP", ""))

        if state_name:
            county_by_state.setdefault(state_name, {})[county_name] = row.geometry

    # Cities (as points for KDE)
    city_by_state: Dict[str, Dict[str, Any]] = {}
    city_lookup: Dict[str, Any] = {}

    for _, row in cities_gdf.iterrows():
        city_name = preprocess_place_name(row["NAME"])
        if not city_name:
            continue

        # Use the centroid when the geometry exposes one, else the geometry itself.
        city_point = getattr(row.geometry, "centroid", row.geometry)
        city_lookup[city_name] = city_point

        state_abbrev = str(row.get("ST", "")).upper()
        if state_abbrev in state_abbrev_to_name:
            state_full = state_abbrev_to_name[state_abbrev]
            city_by_state.setdefault(state_full, {})[city_name] = city_point

    return {
        "state_lookup": state_lookup,
        "county_lookup": county_lookup,
        "city_lookup": city_lookup,
        "county_by_state": county_by_state,
        "city_by_state": city_by_state,
        "state_abbrev_to_name": state_abbrev_to_name,
        "state_name_to_abbrev": state_name_to_abbrev,
    }


def fuzzy_match_entity(
    entity: Optional[str],
    candidates: Dict[str, Any],
    threshold: int = FUZZY_THRESHOLD,
) -> Tuple[Optional[str], int]:
    """
    Find the candidate name that best matches ``entity``.

    An exact key hit returns a score of 100 without fuzzy scoring. Otherwise
    the best ``fuzz.ratio`` match is returned when its score reaches
    ``threshold``.

    Returns:
        (matched_name, score), or (None, 0) when there is nothing to match or
        no candidate is good enough.
    """
    if entity and candidates:
        if entity in candidates:
            return entity, 100

        best = process.extractOne(entity, candidates.keys(), scorer=fuzz.ratio)
        if best and best[1] >= threshold:
            return best[0], best[1]

    return None, 0


# Progress marker for this cell.
# NOTE(review): the "/3" total looks stale -- only two matching cells are visible in this notebook.
print("Geographic matching functions defined (1/3)")

---
## Cell 5: Geographic Matching Functions (continued)

In [None]:
def _match_entities_at_level(
    level: str,
    entities: List[str],
    global_lookup: Dict[str, Any],
    lookup_by_state: Dict[str, Dict[str, Any]],
    context_states: List[str],
) -> List[Tuple[str, Optional[str], Any, int]]:
    """Match entities at one level (COUNTY or CITY), preferring in-state candidates."""
    level_matches: List[Tuple[str, Optional[str], Any, int]] = []

    for entity in entities:
        global_name, global_score = fuzzy_match_entity(
            entity, global_lookup, threshold=FUZZY_THRESHOLD
        )
        if global_name:
            level_matches.append((level, global_name, global_lookup[global_name], global_score))

        for state_name in context_states:
            state_candidates = lookup_by_state.get(state_name)
            if not state_candidates:
                continue

            state_match, state_score = fuzzy_match_entity(
                entity, state_candidates, threshold=FUZZY_THRESHOLD_CONTEXTUAL
            )
            # ">=" rather than ">": an exact in-state hit scores 100, the same as
            # an exact global hit, and the global lookup may hold a same-named
            # place from a different state. The tweet named the state, so the
            # in-state geometry wins the tie.
            if state_match and state_score >= global_score:
                level_matches = [m for m in level_matches if m[1] != global_name]
                level_matches.append(
                    (level, state_match, state_candidates[state_match], state_score)
                )

    return level_matches


def find_all_geographic_matches(
    entities: List[str],
    lookups: Dict[str, Any],
) -> List[Tuple[str, Optional[str], Any, int]]:
    """
    Find ALL geographic matches (STATE, COUNTY, CITY) for the given entities.

    Args:
        entities: Normalized place-name candidates for one tweet.
        lookups: Dictionaries produced by create_hierarchical_lookups.

    Returns:
        A list of (scale_level, matched_name, geometry, score) tuples, with at
        most one entry per (scale_level, matched_name) pair.

    Notes:
        * Global matches use FUZZY_THRESHOLD; matches restricted to a state that
          was found in the same tweet use FUZZY_THRESHOLD_CONTEXTUAL.
        * A state matched through its abbreviation is reported under its full
          name. The per-state county/city dictionaries are keyed by full name,
          so without this an abbreviation match would never trigger the
          in-state lookups.
    """
    if not entities:
        return []

    state_lookup = lookups["state_lookup"]
    abbrev_to_name = lookups.get("state_abbrev_to_name", {})

    all_matches: List[Tuple[str, Optional[str], Any, int]] = []
    # Dict used as an insertion-ordered set so the iteration order is stable.
    found_states: Dict[str, None] = {}

    # STATES
    for entity in entities:
        state_match, state_score = fuzzy_match_entity(
            entity, state_lookup, threshold=FUZZY_THRESHOLD
        )
        if state_match:
            state_name = abbrev_to_name.get(state_match, state_match)
            all_matches.append(("STATE", state_name, state_lookup[state_match], state_score))
            found_states[state_name] = None

    context_states = list(found_states)

    # COUNTIES
    all_matches.extend(
        _match_entities_at_level(
            "COUNTY", entities, lookups["county_lookup"], lookups["county_by_state"], context_states
        )
    )

    # CITIES
    all_matches.extend(
        _match_entities_at_level(
            "CITY", entities, lookups["city_lookup"], lookups["city_by_state"], context_states
        )
    )

    # De-duplicate on (scale_level, matched_name), keeping the first occurrence.
    unique_matches: List[Tuple[str, Optional[str], Any, int]] = []
    seen_combinations = set()
    for match in all_matches:
        combo = (match[0], match[1])
        if combo not in seen_combinations:
            unique_matches.append(match)
            seen_combinations.add(combo)

    return unique_matches


def multi_level_assign_scale_levels(
    row: pd.Series,
    lookups: Dict[str, Any],
) -> List[Tuple[str, Optional[str], Any, int]]:
    """
    Return ALL geographic scale levels that match this tweet.

    Args:
        row: One tweet; its "GPE" and "FAC" fields and ``geometry`` are read.
        lookups: Dictionaries produced by create_hierarchical_lookups.

    Returns:
        A list of (scale_level, matched_name, geometry, score) tuples:
        the STATE/COUNTY/CITY matches for the GPE text, plus a FACILITY entry
        (score 100, tweet geometry) when FAC is present. When nothing matches,
        a single ("UNMATCHED", None, tweet geometry, 0) entry is returned.
    """
    def _text_or_empty(value: Any) -> str:
        # Missing values arrive as None, NaN or pd.NA. Passing them through
        # str() would yield "None"/"nan"/"<NA>", which would then be treated as
        # a real facility or place name.
        if value is None or value is pd.NA:
            return ""
        if isinstance(value, float) and value != value:  # NaN
            return ""
        text = str(value).strip()
        return "" if text.lower() in ("nan", "none") else text

    gpe = _text_or_empty(row.get("GPE", ""))
    fac = _text_or_empty(row.get("FAC", ""))

    matches: List[Tuple[str, Optional[str], Any, int]] = []

    if gpe:
        entities = parse_gpe_entities(gpe)
        if entities:
            matches.extend(find_all_geographic_matches(entities, lookups))

    if fac:
        matches.append(("FACILITY", fac, row.geometry, 100))

    if not matches:
        matches.append(("UNMATCHED", None, row.geometry, 0))

    return matches


def expand_tweets_by_matches(
    gdf: gpd.GeoDataFrame,
    lookups: Dict[str, Any],
    dataset_name: str,
) -> gpd.GeoDataFrame:
    """
    Produce one output row per (tweet, geographic match) pair.

    Each output row is a copy of the tweet with five extra fields:
    "scale_level", "matched_name", "matched_geom", "match_score" and
    "original_index" (the tweet's index label in ``gdf``).

    Progress is printed whenever a tweet's index label is a multiple of 100,
    followed by the row counts and the per-level distribution.
    """
    print(f"\nExpanding {dataset_name} tweets by geographic matches...")

    expanded_rows: List[pd.Series] = []

    for idx, row in gdf.iterrows():
        if idx % 100 == 0:
            print(f"  Processing tweet {idx}...")

        for scale, name, geom, score in multi_level_assign_scale_levels(row, lookups):
            record = row.copy()
            extra_fields = (
                ("scale_level", scale),
                ("matched_name", name),
                ("matched_geom", geom),
                ("match_score", score),
                ("original_index", idx),
            )
            for field, value in extra_fields:
                record[field] = value
            expanded_rows.append(record)

    expanded_gdf = gpd.GeoDataFrame(expanded_rows, crs=gdf.crs)

    print(f"  Expanded from {len(gdf)} to {len(expanded_gdf)} rows")

    print(f"\n  Scale level distribution:")
    for scale, count in expanded_gdf["scale_level"].value_counts().items():
        print(f"    {scale}: {count}")

    return expanded_gdf


def create_interval_counts(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Aggregate expanded tweets per (time bin, scale level, matched name).

    The result has one row per group with the group's first "matched_geom",
    its row "count", and a "cumulative_count" that accumulates "count" over
    time bins within each (scale_level, matched_name) pair. Rows are ordered
    by "unix_timestamp".
    """
    group_keys = ["unix_timestamp", "scale_level", "matched_name"]
    by_bin_and_place = gdf.groupby(group_keys)

    group_sizes = by_bin_and_place.size().values
    summary = by_bin_and_place.agg({"matched_geom": "first"}).reset_index()
    summary["count"] = group_sizes

    summary = summary.sort_values("unix_timestamp")

    per_place = summary.groupby(["scale_level", "matched_name"])
    summary["cumulative_count"] = per_place["count"].cumsum()

    return summary


# Progress marker for this cell.
# NOTE(review): the "/3" total looks stale -- no third matching cell is visible in this notebook.
print("Geographic matching functions defined (2/3)")

---
## Cell 6: Rasterization Functions (Part 1)

In [None]:
def create_master_grid(
    francine_gdf: gpd.GeoDataFrame,
    helene_gdf: gpd.GeoDataFrame,
    states_gdf: gpd.GeoDataFrame,
    counties_gdf: gpd.GeoDataFrame,
    cities_gdf: gpd.GeoDataFrame,
) -> Dict[str, Any]:
    """
    Create the master grid canvas and projected geometry lookups.

    Everything is projected to TARGET_CRS. The grid covers the combined extent
    of both tweet datasets with square cells of CELL_SIZE_M, anchored at the
    top-left corner (minx, maxy).

    Returns a dict with the keys "crs", "cell_size", "width", "height",
    "bounds" (minx, miny, maxx, maxy), "transform", "state_lookup_proj",
    "county_lookup_proj", "cities_lookup_proj", "francine_proj" and
    "helene_proj".
    """
    print("=" * 60)
    print("CREATING MASTER GRID CANVAS")
    print("=" * 60)

    print(f"\nProjecting datasets to {TARGET_CRS}...")
    francine_proj = francine_gdf.to_crs(TARGET_CRS)
    helene_proj = helene_gdf.to_crs(TARGET_CRS)

    states_proj = states_gdf.to_crs(TARGET_CRS)
    counties_proj = counties_gdf.to_crs(TARGET_CRS)
    cities_proj = cities_gdf.to_crs(TARGET_CRS)

    print("\nCalculating master extent...")
    francine_bounds = francine_proj.total_bounds
    helene_bounds = helene_proj.total_bounds

    minx = min(francine_bounds[0], helene_bounds[0])
    miny = min(francine_bounds[1], helene_bounds[1])
    maxx = max(francine_bounds[2], helene_bounds[2])
    maxy = max(francine_bounds[3], helene_bounds[3])

    # At least one cell in each direction, even for a degenerate extent.
    width = max(1, int(np.ceil((maxx - minx) / CELL_SIZE_M)))
    height = max(1, int(np.ceil((maxy - miny) / CELL_SIZE_M)))

    # Snap the far edges outward so the extent is an exact multiple of the cell
    # size. Without this the transform's pixel size is (extent / width), which
    # is slightly smaller than CELL_SIZE_M, while the point-to-pixel arithmetic
    # in the KDE functions divides by CELL_SIZE_M -- the two would disagree.
    maxx = minx + width * CELL_SIZE_M
    miny = maxy - height * CELL_SIZE_M

    print(f"\nGrid Configuration:")
    print(f"  Cell size: {CELL_SIZE_M:,} meters")
    print(f"  Grid dimensions: {width} x {height} cells")
    print(f"  Total cells: {width * height:,}")

    master_transform = from_bounds(minx, miny, maxx, maxy, width, height)

    area_km2 = (width * height * CELL_SIZE_M * CELL_SIZE_M) / 1_000_000
    print(f"\nCoverage area: {area_km2:,.2f} km²")

    print("\nCreating projected geometry lookups...")

    def _normalized_names(frame: Any) -> List[Optional[str]]:
        # Use the same normalization as the matching stage so that a
        # "matched_name" such as "SAINT LOUIS" finds its geometry here.
        return [preprocess_place_name(name) for name in frame["NAME"]]

    state_lookup_proj = dict(zip(_normalized_names(states_proj), states_proj.geometry))
    county_lookup_proj = dict(zip(_normalized_names(counties_proj), counties_proj.geometry))
    cities_lookup_proj = dict(zip(_normalized_names(cities_proj), cities_proj.geometry.centroid))
    # Rows whose name could not be normalized all map to the None key; drop it.
    for lookup in (state_lookup_proj, county_lookup_proj, cities_lookup_proj):
        lookup.pop(None, None)

    grid_params: Dict[str, Any] = {
        "crs": TARGET_CRS,
        "cell_size": CELL_SIZE_M,
        "width": width,
        "height": height,
        "bounds": (minx, miny, maxx, maxy),
        "transform": master_transform,
        "state_lookup_proj": state_lookup_proj,
        "county_lookup_proj": county_lookup_proj,
        "cities_lookup_proj": cities_lookup_proj,
        "francine_proj": francine_proj,
        "helene_proj": helene_proj,
    }

    print("\nMaster grid canvas ready [OK]")
    return grid_params


def create_facility_raster(
    data: gpd.GeoDataFrame,
    grid_params: Dict[str, Any],
) -> np.ndarray:
    """
    Create a Gaussian kernel density-like raster for facility points.

    Each facility (rows with scale_level == "FACILITY", grouped by
    "matched_name") contributes its summed "count" at the grid cell containing
    its point, smoothed with a Gaussian of sigma = 2 cells and scaled by
    WEIGHTS["FACILITY"]. Facilities falling outside the grid are ignored.

    Args:
        data: Aggregated rows with "scale_level", "matched_name", "count" and
            "matched_geom" columns.
        grid_params: Grid description; "height", "width", "cell_size",
            "bounds" and "crs" are read.

    Returns:
        A float32 array of shape (height, width).
    """
    grid_shape = (grid_params["height"], grid_params["width"])
    facility_grid = np.zeros(grid_shape, dtype=np.float32)

    facility_data = data[data["scale_level"] == "FACILITY"]
    if len(facility_data) == 0:
        return facility_grid

    # One point and one total per facility name; the point is taken from the
    # facility's first row.
    points = []
    totals = []
    for _, facility_rows in facility_data.groupby("matched_name"):
        facility_point = facility_rows.iloc[0]["matched_geom"]
        if hasattr(facility_point, "x") and hasattr(facility_point, "y"):
            points.append(facility_point)
            totals.append(float(facility_rows["count"].sum()))

    if not points:
        return facility_grid

    # NOTE(review): facility points are assumed to be in EPSG:4326 -- confirm
    # against the CRS of the source GeoJSON.
    projected_points = gpd.GeoSeries(points, crs="EPSG:4326").to_crs(grid_params["crs"])

    cell_size = grid_params["cell_size"]
    left_x = grid_params["bounds"][0]
    top_y = grid_params["bounds"][3]

    # Gaussian filtering is linear, so all impulses are accumulated on one grid
    # and filtered once instead of filtering a full-size grid per facility.
    impulses = np.zeros(grid_shape, dtype=np.float64)
    for point_proj, tweet_count in zip(projected_points, totals):
        px = (point_proj.x - left_x) / cell_size
        py = (top_y - point_proj.y) / cell_size
        if 0 <= px < grid_params["width"] and 0 <= py < grid_params["height"]:
            impulses[int(py), int(px)] += tweet_count

    if not impulses.any():
        return facility_grid

    sigma_meters = 2 * cell_size
    sigma_pixels = sigma_meters / cell_size

    smoothed = gaussian_filter(impulses, sigma=sigma_pixels, mode="constant", cval=0.0)
    return (smoothed * WEIGHTS["FACILITY"]).astype(np.float32)


def create_city_kde_raster(
    city_data: pd.DataFrame,
    cities_lookup_proj: Dict[str, Any],
    grid_params: Dict[str, Any]
) -> np.ndarray:
    """
    Create a raster layer for cities using Kernel Density Estimation (KDE).

    Counts are summed per "matched_name". Each city found in
    ``cities_lookup_proj`` and located inside the grid contributes an impulse
    at its cell, smoothed with a Gaussian of sigma = 3 cells.

    Args:
        city_data: Rows with "matched_name" and "count" columns.
        cities_lookup_proj: City name -> projected point (objects with x / y).
        grid_params: Grid description; "height", "width", "cell_size" and
            "bounds" are read.

    Returns:
        A float32 array of shape (height, width).
    """
    grid_shape = (grid_params["height"], grid_params["width"])
    city_grid = np.zeros(grid_shape, dtype=np.float32)

    city_counts = city_data.groupby("matched_name")["count"].sum()

    if len(city_counts) == 0:
        return city_grid

    cell_size = grid_params["cell_size"]
    left_x = grid_params["bounds"][0]
    top_y = grid_params["bounds"][3]

    sigma_meters = 3 * cell_size
    sigma_pixels = sigma_meters / cell_size

    print(f"      Creating city KDE raster with sigma={sigma_pixels:.2f} pixels...")

    # Gaussian filtering is linear, so every city's scaled impulse is placed on
    # one grid and the filter runs once, instead of once per city on a
    # full-size grid.
    impulses = np.zeros(grid_shape, dtype=np.float64)
    cities_processed = 0

    for city_name, tweet_count in city_counts.items():
        if city_name not in cities_lookup_proj:
            continue

        city_point = cities_lookup_proj[city_name]

        if not hasattr(city_point, 'x') or not hasattr(city_point, 'y'):
            continue

        px = (city_point.x - left_x) / cell_size
        py = (top_y - city_point.y) / cell_size

        if 0 <= px < grid_params["width"] and 0 <= py < grid_params["height"]:
            count = float(tweet_count)
            # NOTE(review): the count enters twice (impulse height AND log1p
            # factor), unlike the state/county layers which use log1p only.
            # Kept as in the original; confirm this weighting is intended.
            impulses[int(py), int(px)] += count * np.log1p(count) * WEIGHTS["CITY"]
            cities_processed += 1

    if cities_processed:
        smoothed = gaussian_filter(impulses, sigma=sigma_pixels, mode='constant', cval=0)
        city_grid = smoothed.astype(np.float32)

    print(f"        Processed {cities_processed}/{len(city_counts)} cities")
    if np.max(city_grid) > 0:
        print(f"        City KDE max: {np.max(city_grid):.2f}")

    return city_grid


# Progress marker: confirms this cell's function definitions were executed.
print("Rasterization functions defined (1/2)")

---
## Cell 7: Rasterization Functions (Part 2)

In [None]:
def create_hierarchical_rasters(
    data: gpd.GeoDataFrame,
    grid_params: Dict[str, Any],
    time_bin: int,
) -> np.ndarray:
    """
    Build one weighted raster from the state, county, city and facility rows.

    States are burned in for every directly matched state and for every state
    whose polygon contains the centroid of a matched county or city; a state
    pulled in only as a parent is given a count of 1. State and county
    polygons add log1p(count) * weight, cities add the KDE layer from
    create_city_kde_raster, and facilities add create_facility_raster.

    ``time_bin`` is accepted for interface compatibility and is not read.
    """
    grid_shape = (grid_params["height"], grid_params["width"])
    output_grid = np.zeros(grid_shape, dtype=np.float32)

    state_lookup_proj = grid_params["state_lookup_proj"]
    county_lookup_proj = grid_params["county_lookup_proj"]
    cities_lookup_proj = grid_params["cities_lookup_proj"]

    def _rows_at(level: str) -> Any:
        # Subset of ``data`` for a single scale level.
        return data[data["scale_level"] == level]

    def _parent_state(point: Any) -> Optional[str]:
        # First state polygon containing the point, if any.
        for candidate_name, candidate_geom in state_lookup_proj.items():
            if candidate_geom.contains(point):
                return candidate_name
        return None

    def _burn(geometry: Any) -> np.ndarray:
        # 0/1 mask of the geometry on the master grid.
        return rasterize(
            [(geometry, 1)],
            out_shape=grid_shape,
            transform=grid_params["transform"],
            fill=0,
            dtype=np.float32,
            all_touched=True,
        )

    state_data = _rows_at("STATE")
    county_data = _rows_at("COUNTY")
    city_data = _rows_at("CITY")
    facility_data = _rows_at("FACILITY")

    # Identify states: direct matches first, then parents of counties and cities.
    states_to_include: set = set()
    if len(state_data) > 0:
        states_to_include.update(state_data["matched_name"].unique())

    for child_data, child_lookup in (
        (county_data, county_lookup_proj),
        (city_data, cities_lookup_proj),
    ):
        for child_name in child_data["matched_name"].unique():
            if child_name not in child_lookup:
                continue
            parent = _parent_state(child_lookup[child_name].centroid)
            if parent is not None:
                states_to_include.add(parent)

    # Rasterize STATES
    directly_matched = state_data["matched_name"].values
    for state_name in states_to_include:
        if state_name not in state_lookup_proj:
            continue

        mask = _burn(state_lookup_proj[state_name])

        if state_name in directly_matched:
            tweet_count = state_data[state_data["matched_name"] == state_name]["count"].sum()
        else:
            tweet_count = 1

        base_value = np.log1p(float(tweet_count)) * WEIGHTS["STATE"]
        output_grid += mask * base_value

    # Rasterize COUNTIES
    if len(county_data) > 0:
        county_totals = county_data.groupby("matched_name")["count"].sum()
        for county_name, tweet_count in county_totals.items():
            if county_name not in county_lookup_proj:
                continue
            mask = _burn(county_lookup_proj[county_name])
            output_grid += mask * np.log1p(float(tweet_count)) * WEIGHTS["COUNTY"]

    # Rasterize CITIES using KDE
    if len(city_data) > 0:
        output_grid += create_city_kde_raster(city_data, cities_lookup_proj, grid_params)

    # Add FACILITIES
    if len(facility_data) > 0:
        output_grid += create_facility_raster(data, grid_params)

    return output_grid


def save_raster(
    grid: np.ndarray,
    output_dir: str,
    hurricane_name: str,
    time_bin: int,
    raster_type: str,
    timestamp_dict: Dict[int, pd.Timestamp],
    grid_params: Dict[str, Any],
) -> None:
    """
    Write ``grid`` as a single-band, LZW-compressed GeoTIFF.

    The file goes to ``<output_dir>/<raster_type>/`` (created if needed) and is
    named ``<hurricane_name>_tweets_<YYYYMMDD_HHMMSS>.tif``, where the time
    comes from ``timestamp_dict[time_bin]``.
    """
    type_dir = os.path.join(output_dir, raster_type)
    os.makedirs(type_dir, exist_ok=True)

    bin_start = timestamp_dict[time_bin]
    time_str = bin_start.strftime("%Y%m%d_%H%M%S")
    filename = f"{hurricane_name}_tweets_{time_str}.tif"
    filepath = os.path.join(type_dir, filename)

    profile = {
        "driver": "GTiff",
        "height": grid_params["height"],
        "width": grid_params["width"],
        "count": 1,
        "dtype": grid.dtype,
        "crs": grid_params["crs"],
        "transform": grid_params["transform"],
        "compress": "lzw",
    }

    with rasterio.open(filepath, "w", **profile) as dst:
        dst.write(grid, 1)

    print(f"    Saved: {raster_type}/{filename}")


def process_hurricane(
    hurricane_name: str,
    gdf_proj: gpd.GeoDataFrame,
    interval_counts: pd.DataFrame,
    time_bins: list,
    timestamp_dict: Dict[int, pd.Timestamp],
    grid_params: Dict[str, Any],
) -> str:
    """
    Process a single hurricane over all time bins and write rasters.

    For every time bin an incremental raster is built from that bin's rows of
    ``interval_counts``, added to a running cumulative raster, and both are
    saved under ``OUTPUT_DIR/<hurricane_name lower-cased>/`` in the
    "increment" and "cumulative" subfolders.

    Args:
        hurricane_name: Used for the output folder and file names.
        gdf_proj: Not read by this function; kept for interface compatibility.
        interval_counts: Aggregated rows with "unix_timestamp", "scale_level",
            "matched_name", "count" and "matched_geom" columns.
        time_bins: Bin keys to process, in output order.
        timestamp_dict: Bin key -> bin start time, used for file names.
        grid_params: Grid description from create_master_grid.

    Returns:
        The hurricane's output directory.
    """
    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {hurricane_name.upper()}")
    print(f"{'=' * 60}")

    hurricane_dir = os.path.join(OUTPUT_DIR, hurricane_name.lower())
    os.makedirs(hurricane_dir, exist_ok=True)

    cumulative_grid = np.zeros((grid_params["height"], grid_params["width"]), dtype=np.float32)

    for idx, time_bin in enumerate(time_bins):
        print(f"\nTime Bin {idx + 1}/{len(time_bins)}")

        current_data = interval_counts[interval_counts["unix_timestamp"] == time_bin]

        # Each row of interval_counts is one (scale level, place) group, so the
        # row count is the number of groups; the "count" column holds how many
        # tweet-to-place matches each group aggregates.
        location_groups = len(current_data)
        match_total = int(current_data["count"].sum())
        print(
            f"  Tweet-location matches in this bin: {match_total} "
            f"({location_groups} location groups)"
        )

        incremental_grid = create_hierarchical_rasters(current_data, grid_params, time_bin)
        cumulative_grid += incremental_grid

        for raster_type, grid in (
            ("increment", incremental_grid),
            ("cumulative", cumulative_grid),
        ):
            save_raster(
                grid,
                hurricane_dir,
                hurricane_name,
                time_bin,
                raster_type,
                timestamp_dict,
                grid_params,
            )

        print(f"  Incremental max value: {np.max(incremental_grid):.2f}")
        print(f"  Cumulative  max value: {np.max(cumulative_grid):.2f}")

    print(f"\n{hurricane_name.upper()} processing complete!")
    return hurricane_dir


# Progress marker: confirms this cell's function definitions were executed.
print("Rasterization functions defined (2/2)")

---
## PIPELINE EXECUTION

Run the following cells sequentially to execute the full pipeline:

### Step 1: Load Hurricane Data

In [None]:
# Read both tweet datasets and add the timestamp / time-bin columns.
francine_gdf, helene_gdf = load_hurricane_data()

### Step 2: Build Time Lookups

In [None]:
# unix_timestamp -> time_bin lookups; save_raster uses them to name output files.
francine_dict, helene_dict = create_timestamp_dictionaries(francine_gdf, helene_gdf)
print("Timestamp dictionaries created")

### Step 3: Load Reference Shapefiles

In [None]:
# Reference layers used for name matching and for rasterizing polygons.
states_gdf, counties_gdf, cities_gdf = load_reference_shapefiles()

### Step 4: Create Hierarchical Lookups

In [None]:
# Name -> geometry dictionaries (global and per state) consumed by the matching step.
lookups = create_hierarchical_lookups(states_gdf, counties_gdf, cities_gdf)

### Step 5: Expand Tweets by Matches

In [None]:
# expand_tweets_by_matches adds a "scale_level" column to its output. Guarding
# on that column keeps this cell idempotent: re-running it would otherwise
# expand the already-expanded rows a second time.
if "scale_level" not in francine_gdf.columns:
    francine_gdf = expand_tweets_by_matches(francine_gdf, lookups, "FRANCINE")
if "scale_level" not in helene_gdf.columns:
    helene_gdf = expand_tweets_by_matches(helene_gdf, lookups, "HELENE")

### Step 6: Create Interval Counts

In [None]:
# One row per (time bin, scale level, matched name) with its count.
francine_interval_counts = create_interval_counts(francine_gdf)
helene_interval_counts = create_interval_counts(helene_gdf)
print("Interval counts created")

### Step 7: Collect Time Bins

In [None]:
# Sorted distinct bin keys; process_hurricane writes rasters in this order.
francine_time_bins = get_time_bins(francine_gdf)
helene_time_bins = get_time_bins(helene_gdf)
print(f"Francine time bins: {len(francine_time_bins)}")
print(f"Helene time bins: {len(helene_time_bins)}")

### Step 8: Build Master Grid

In [None]:
# Shared raster canvas: its extent is taken from both tweet datasets, so the
# Francine and Helene outputs use the same grid.
grid_params = create_master_grid(
    francine_gdf, helene_gdf, states_gdf, counties_gdf, cities_gdf
)

### Step 9: Ensure Output Directory

In [None]:
# exist_ok=True makes this safe to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory ready: {OUTPUT_DIR}")

### Step 10: Process Hurricane Francine

In [None]:
# Writes one incremental and one cumulative GeoTIFF per Francine time bin.
# gdf_proj is passed for interface completeness; process_hurricane does not read it.
francine_output = process_hurricane(
    hurricane_name='francine',
    gdf_proj=grid_params['francine_proj'],
    interval_counts=francine_interval_counts,
    time_bins=francine_time_bins,
    timestamp_dict=francine_dict,
    grid_params=grid_params,
)
print(f"\nFrancine output: {francine_output}")

### Step 11: Process Hurricane Helene

In [None]:
# Writes one incremental and one cumulative GeoTIFF per Helene time bin.
# gdf_proj is passed for interface completeness; process_hurricane does not read it.
helene_output = process_hurricane(
    hurricane_name='helene',
    gdf_proj=grid_params['helene_proj'],
    interval_counts=helene_interval_counts,
    time_bins=helene_time_bins,
    timestamp_dict=helene_dict,
    grid_params=grid_params,
)
print(f"\nHelene output: {helene_output}")

### Final Summary

In [None]:
import glob


def _count_geotiffs(hurricane_dir, raster_type):
    """Number of .tif files in the given raster-type subfolder of a hurricane directory."""
    return len(glob.glob(os.path.join(hurricane_dir, raster_type, '*.tif')))


banner = "=" * 80
print("\n" + banner)
print("PIPELINE COMPLETE")
print(banner)

print(f"\nOutput Directories:")
print(f"  Francine: {francine_output}")
print(f"  Helene:   {helene_output}")

# Count files
francine_inc = _count_geotiffs(francine_output, 'increment')
francine_cum = _count_geotiffs(francine_output, 'cumulative')
helene_inc = _count_geotiffs(helene_output, 'increment')
helene_cum = _count_geotiffs(helene_output, 'cumulative')
total_rasters = francine_inc + francine_cum + helene_inc + helene_cum

print(f"\nRasters Created:")
print(f"  Francine: {francine_inc} incremental + {francine_cum} cumulative")
print(f"  Helene:   {helene_inc} incremental + {helene_cum} cumulative")
print(f"  Total:    {total_rasters} GeoTIFF files")

print(f"\nNext Steps:")
for step in (
    f"  1. Add raster datasets to ArcGIS Pro map",
    f"  2. Configure symbology (stretch, color ramp)",
    f"  3. Enable time slider for animation",
    f"  4. Export animations as needed",
):
    print(step)

---
## Optional: Quick Visualization

View a sample raster:

In [None]:
import matplotlib.pyplot as plt
import glob

# Sorted so the chosen sample is deterministic (glob order is arbitrary).
cumulative_rasters = sorted(glob.glob(os.path.join(francine_output, 'cumulative', '*.tif')))

if not cumulative_rasters:
    # Indexing an empty glob result would raise IndexError.
    print(f"No cumulative rasters found under {francine_output}; run the pipeline cells first.")
else:
    sample_raster = cumulative_rasters[0]

    with rasterio.open(sample_raster) as src:
        data = src.read(1)

    # Hide empty cells so only cells with activity are coloured.
    data_masked = np.ma.masked_equal(data, 0)

    fig, ax = plt.subplots(figsize=(12, 8))
    image = ax.imshow(data_masked, cmap='YlOrRd', interpolation='nearest')
    fig.colorbar(image, ax=ax, label='Tweet Activity')
    ax.set_title(f'Sample: {os.path.basename(sample_raster)}')
    fig.tight_layout()
    plt.show()

    print(f"\nStatistics:")
    positive_cells = data[data > 0]
    if positive_cells.size:
        print(f"  Min (non-zero): {positive_cells.min():.2f}")
        print(f"  Max: {data.max():.2f}")
        print(f"  Mean (non-zero): {positive_cells.mean():.2f}")
    else:
        # min()/mean() of an empty selection would raise or return NaN.
        print("  Raster contains no positive cells")
    print(f"  Non-zero cells: {np.count_nonzero(data):,}")