In [1]:
# 1.1 Initial Data Loading and Validation

import os
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import datetime

# Setup paths
print(os.getcwd())
local_path = os.path.dirname(os.getcwd())
francine_path = os.path.join(local_path, r'data\geojson\francine.geojson')
helene_path = os.path.join(local_path, r'data\geojson\helene.geojson')
states_path = os.path.join(local_path, r'data\shape_files\cb_2023_us_state_20m.shp')
counties_path = os.path.join(local_path, r'data\shape_files\cb_2023_us_county_20m.shp')
cities_path = os.path.join(local_path, r'data\shape_files\US_Cities.shp')
output_dir = os.path.join(local_path, r'rasters_output')

francine_gdf = gpd.read_file(francine_path)
helene_gdf = gpd.read_file(helene_path)

# Standardize timestamps to UTC
francine_gdf['timestamp'] = pd.to_datetime(francine_gdf['time'], utc=True)
# print(francine_gdf['time'])
helene_gdf['timestamp'] = pd.to_datetime(helene_gdf['time'], utc=True)


# Floor to 4-hour bins
francine_gdf['time_bin'] = francine_gdf['timestamp'].dt.floor('4h')
helene_gdf['time_bin'] = helene_gdf['timestamp'].dt.floor('4h')
all_data = francine_gdf['time_bin'].unique()
francine_gdf['unix_timestamp'] = francine_gdf['time_bin'].astype('int64') // 1000
helene_gdf['unix_timestamp'] = helene_gdf['time_bin'].astype('int64') // 1000
# Create readable bin labels for file naming
francine_gdf['bin_label'] = francine_gdf['time_bin'].dt.strftime('%Y%m%d_%H%M')
helene_gdf['bin_label'] = helene_gdf['time_bin'].dt.strftime('%Y%m%d_%H%M')


# Load reference data
print("Loading reference data...")
states_gdf = gpd.read_file(states_path)
counties_gdf = gpd.read_file(counties_path)
cities_gdf = gpd.read_file(cities_path)

# Ensure consistent CRS (EPSG:4326)
francine_gdf = francine_gdf.to_crs('EPSG:4326')
helene_gdf = helene_gdf.to_crs('EPSG:4326')
states_gdf = states_gdf.to_crs('EPSG:4326')
counties_gdf = counties_gdf.to_crs('EPSG:4326')
cities_gdf = cities_gdf.to_crs('EPSG:4326')

# Summary statistics

print(f"\nReference data loaded:")
print(f"  States: {len(states_gdf)}")
print(f"  Counties: {len(counties_gdf)}")
print(f"  Cities: {len(cities_gdf)}")

C:\Users\colto\Documents\GitHub\Tweet_project\src
Loading reference data...

Reference data loaded:
  States: 52
  Counties: 3222
  Cities: 31615


In [2]:
states_gdf = gpd.read_file(states_path)
counties_gdf = gpd.read_file(counties_path)
cities_gdf = gpd.read_file(cities_path)

# PLACE THIS CODE AFTER LOADING SHAPEFILES BUT BEFORE CREATING SIMPLE LOOKUPS
# =============================================================================
# MULTI-LEVEL GEOGRAPHIC MATCHING SYSTEM (ALL LEVELS)
# =============================================================================

from fuzzywuzzy import fuzz, process
import re

def preprocess_place_name(name):
    """Standardize place names for better matching"""
    if pd.isna(name) or name == 'NAN':
        return None

    name = str(name).upper().strip()

    # Common abbreviation standardizations
    name = re.sub(r'\bST\.?\b', 'SAINT', name)  # St. -> Saint
    name = re.sub(r'\bMT\.?\b', 'MOUNT', name)  # Mt. -> Mount
    name = re.sub(r'\bFT\.?\b', 'FORT', name)   # Ft. -> Fort
    name = re.sub(r'\bN\.?\b', 'NORTH', name)   # N. -> North
    name = re.sub(r'\bS\.?\b', 'SOUTH', name)   # S. -> South
    name = re.sub(r'\bE\.?\b', 'EAST', name)    # E. -> East
    name = re.sub(r'\bW\.?\b', 'WEST', name)    # W. -> West

    # Remove extra spaces and punctuation
    name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name)     # Normalize spaces

    return name.strip()

def parse_gpe_entities(gpe_string):
    """Parse GPE string into multiple potential geographic entities"""
    if not gpe_string or pd.isna(gpe_string) or str(gpe_string).strip() == '':
        return []

    gpe_string = str(gpe_string).strip()

    # Split by common separators
    entities = []

    # Primary split by comma
    parts = [part.strip() for part in gpe_string.split(',')]

    for part in parts:
        if part:
            # Further split by other separators
            sub_parts = re.split(r'[;&|]', part)
            for sub_part in sub_parts:
                sub_part = sub_part.strip()
                if sub_part and len(sub_part) > 1:  # Ignore single characters
                    entities.append(preprocess_place_name(sub_part))

    # Remove None values and duplicates while preserving order
    clean_entities = []
    seen = set()
    for entity in entities:
        if entity and entity not in seen:
            clean_entities.append(entity)
            seen.add(entity)

    return clean_entities

def create_hierarchical_lookups(states_gdf, counties_gdf, cities_gdf):
    """Create hierarchical lookup dictionaries for fuzzy matching"""
    print("\nCreating hierarchical lookup dictionaries...")

    # 1. States - simple lookup with preprocessed names + abbreviations
    state_lookup = {}
    state_abbrev_to_name = {}  # Abbreviation to full name
    state_name_to_abbrev = {}  # Full name to abbreviation

    for idx, row in states_gdf.iterrows():
        state_name = preprocess_place_name(row['NAME'])
        if state_name:
            state_lookup[state_name] = row.geometry
            # Handle abbreviations if available
            if 'STUSPS' in row:
                abbrev = row['STUSPS'].upper()
                state_abbrev_to_name[abbrev] = state_name
                state_name_to_abbrev[state_name] = abbrev
                # Also add abbreviation as a lookup option
                state_lookup[abbrev] = row.geometry

    # 2. Counties - organized by state
    county_by_state = {}
    county_lookup = {}

    for idx, row in counties_gdf.iterrows():
        county_name = preprocess_place_name(row['NAME'])
        state_fips = row.get('STATEFP', '')

        if county_name:
            county_lookup[county_name] = row.geometry

            # Try to get state name from STATEFP or other fields
            state_name = None
            if 'STATE_NAME' in row:
                state_name = preprocess_place_name(row['STATE_NAME'])
            else:
                # Try to find state by FIPS code
                for s_idx, s_row in states_gdf.iterrows():
                    if s_row.get('STATEFP', '') == state_fips:
                        state_name = preprocess_place_name(s_row['NAME'])
                        break

            if state_name:
                if state_name not in county_by_state:
                    county_by_state[state_name] = {}
                county_by_state[state_name][county_name] = row.geometry

    # 3. Cities - organized by state
    city_by_state = {}
    city_lookup = {}

    for idx, row in cities_gdf.iterrows():
        city_name = preprocess_place_name(row['NAME'])
        state_abbrev = row.get('ST', '').upper()

        if city_name:
            city_lookup[city_name] = row.geometry

            # Convert state abbreviation to full name
            if state_abbrev in state_abbrev_to_name:
                state_full = state_abbrev_to_name[state_abbrev]
                if state_full not in city_by_state:
                    city_by_state[state_full] = {}
                city_by_state[state_full][city_name] = row.geometry
    #
    # print(f"  States: {len(state_lookup)} (including abbreviations)")
    # print(f"  Counties: {len(county_lookup)} (organized by {len(county_by_state)} states)")
    # print(f"  Cities: {len(city_lookup)} (organized by {len(city_by_state)} states)")

    return {
        'state_lookup': state_lookup,
        'county_lookup': county_lookup,
        'city_lookup': city_lookup,
        'county_by_state': county_by_state,
        'city_by_state': city_by_state,
        'state_abbrev_to_name': state_abbrev_to_name,
        'state_name_to_abbrev': state_name_to_abbrev
    }

def fuzzy_match_entity(entity, candidates, threshold=75):
    """Fuzzy match an entity against candidates"""
    if not entity or not candidates:
        return None, 0

    # Try exact match first
    if entity in candidates:
        return entity, 100

    # Use fuzzy matching
    match = process.extractOne(entity, candidates.keys(), scorer=fuzz.ratio)

    if match and match[1] >= threshold:
        return match[0], match[1]

    return None, 0

def find_all_geographic_matches(entities, lookups):
    """Find ALL geographic matches (state, county, city) for the entities"""
    if not entities:
        return []

    state_lookup = lookups['state_lookup']
    county_lookup = lookups['county_lookup']
    city_lookup = lookups['city_lookup']
    county_by_state = lookups['county_by_state']
    city_by_state = lookups['city_by_state']

    # Store all successful matches
    all_matches = []

    # Context tracking for better matching
    found_states = set()

    # STEP 1: Find all state matches first
    for entity in entities:
        state_match, state_score = fuzzy_match_entity(entity, state_lookup, threshold=75)
        if state_match:
            all_matches.append(('STATE', state_match, state_lookup[state_match], state_score))
            found_states.add(state_match)

    # STEP 2: Find county matches (global first, then state-specific)
    for entity in entities:
        # Global county search
        county_match, county_score = fuzzy_match_entity(entity, county_lookup, threshold=75)
        if county_match:
            all_matches.append(('COUNTY', county_match, county_lookup[county_match], county_score))

        # State-specific county search (higher accuracy)
        for state_name in found_states:
            if state_name in county_by_state:
                state_counties = county_by_state[state_name]
                state_county_match, state_county_score = fuzzy_match_entity(entity, state_counties, threshold=70)
                if state_county_match and state_county_score > county_score:
                    # Replace with better state-specific match
                    # Remove the global match if it exists
                    all_matches = [m for m in all_matches if not (m[0] == 'COUNTY' and m[1] == county_match)]
                    all_matches.append(('COUNTY', state_county_match, state_counties[state_county_match], state_county_score))

    # STEP 3: Find city matches (global first, then state-specific)
    for entity in entities:
        # Global city search
        city_match, city_score = fuzzy_match_entity(entity, city_lookup, threshold=75)
        if city_match:
            all_matches.append(('CITY', city_match, city_lookup[city_match], city_score))

        # State-specific city search (higher accuracy)
        for state_name in found_states:
            if state_name in city_by_state:
                state_cities = city_by_state[state_name]
                state_city_match, state_city_score = fuzzy_match_entity(entity, state_cities, threshold=70)
                if state_city_match and state_city_score > city_score:
                    # Replace with better state-specific match
                    # Remove the global match if it exists
                    all_matches = [m for m in all_matches if not (m[0] == 'CITY' and m[1] == city_match)]
                    all_matches.append(('CITY', state_city_match, state_cities[state_city_match], state_city_score))

    # Remove duplicates (same scale + name)
    unique_matches = []
    seen_combinations = set()
    for match in all_matches:
        combo = (match[0], match[1])  # (scale, name)
        if combo not in seen_combinations:
            unique_matches.append(match)
            seen_combinations.add(combo)

    return unique_matches

def multi_level_assign_scale_levels(row, lookups):
    """
    Return ALL geographic scale levels that match this tweet
    Returns a list of matches: [(scale, name, geom, score), ...]
    """
    gpe = str(row.get('GPE', '')).strip()
    fac = str(row.get('FAC', '')).strip()

    matches = []

    # Parse GPE into multiple entities
    entities = parse_gpe_entities(gpe)

    if entities:
        # Find all geographic matches
        geo_matches = find_all_geographic_matches(entities, lookups)
        matches.extend(geo_matches)

    # Add facility as separate match if available
    if fac and fac not in ['nan', 'NAN', '']:
        matches.append(('FACILITY', fac, row.geometry, 100))

    # If no matches found, return unmatched
    if not matches:
        matches.append(('UNMATCHED', None, row.geometry, 0))

    return matches

def expand_tweets_by_matches(gdf, lookups, dataset_name):
    """
    Expand the GeoDataFrame so each tweet creates multiple rows (one per geographic match)
    """
    # print(f"\nExpanding {dataset_name} tweets by geographic matches...")

    expanded_rows = []

    for idx, row in gdf.iterrows():
        matches = multi_level_assign_scale_levels(row, lookups)

        # Create one row per match
        for scale, name, geom, score in matches:
            new_row = row.copy()
            new_row['scale_level'] = scale
            new_row['matched_name'] = name
            new_row['matched_geom'] = geom
            new_row['match_score'] = score
            new_row['original_index'] = idx  # Track original tweet
            expanded_rows.append(new_row)

    # Create new GeoDataFrame and preserve the original CRS
    expanded_gdf = gpd.GeoDataFrame(expanded_rows, crs=gdf.crs)

    # Print statistics
    original_count = len(gdf)
    expanded_count = len(expanded_gdf)
    expansion_ratio = expanded_count / original_count

    # print(f"  Original tweets: {original_count}")
    # print(f"  Expanded rows: {expanded_count}")
    # print(f"  Expansion ratio: {expansion_ratio:.2f}x")

    # Print scale distribution
    # scale_counts = expanded_gdf['scale_level'].value_counts()
    # print(f"  {dataset_name} scale distribution:")
    # for scale, count in scale_counts.items():
    #     print(f"    {scale}: {count}")

    # Print average match scores by scale level
    # print(f"  Average match scores:")
    # for scale in ['STATE', 'COUNTY', 'CITY', 'FACILITY']:
    #     if scale in expanded_gdf['scale_level'].values:
    #         avg_score = expanded_gdf[expanded_gdf['scale_level'] == scale]['match_score'].mean()
            # print(f"    {scale}: {avg_score:.1f}%")

    # Show some examples of multi-level matches
    print(f"  Sample multi-level matches:")
    # Group by original tweet and show ones with multiple matches
    multi_matches = expanded_gdf.groupby('original_index').size()
    multi_match_indices = multi_matches[multi_matches > 1].head(5).index

    for orig_idx in multi_match_indices:
        tweet_matches = expanded_gdf[expanded_gdf['original_index'] == orig_idx]
        original_gpe = tweet_matches.iloc[0]['GPE']
        match_summary = ', '.join([f"{row['scale_level']}:{row['matched_name']}" for _, row in tweet_matches.iterrows()])
        # print(f"    '{original_gpe}' → {match_summary}")

    return expanded_gdf

# =============================================================================
# EXECUTE MULTI-LEVEL FUZZY MATCHING
# =============================================================================

print("\n" + "="*60)
print("MULTI-LEVEL GEOGRAPHIC MATCHING (ALL LEVELS)")
print("="*60)

# Create hierarchical lookups
lookups = create_hierarchical_lookups(states_gdf, counties_gdf, cities_gdf)

# Apply to both datasets (this will expand the datasets)
# francine_gdf = expand_tweets_by_matches(francine_gdf, lookups, "FRANCINE")
helene_gdf = expand_tweets_by_matches(helene_gdf, lookups, "HELENE")

print("\n" + "="*60)
print("MULTI-LEVEL FUZZY MATCHING COMPLETE ✓")
print("="*60)
print("\nNote: Datasets are now expanded - each original tweet may have multiple rows")
print("representing different geographic scales (STATE, COUNTY, CITY, etc.)")


MULTI-LEVEL GEOGRAPHIC MATCHING (ALL LEVELS)

Creating hierarchical lookup dictionaries...
  Sample multi-level matches:

MULTI-LEVEL FUZZY MATCHING COMPLETE ✓

Note: Datasets are now expanded - each original tweet may have multiple rows
representing different geographic scales (STATE, COUNTY, CITY, etc.)


In [3]:
# =============================================================================
# STEP 2: SPATIAL PROCESSING FRAMEWORK
# =============================================================================

import numpy as np
import rasterio
from rasterio.transform import from_bounds
from rasterio.features import rasterize
from scipy.ndimage import gaussian_filter

# =============================================================================
# 2.1 GRID DEFINITION
# =============================================================================

def create_master_grid(francine_gdf, helene_gdf, buffer_percent=10):
    """Create consistent raster grid parameters for both hurricanes"""
    print("\n" + "="*60)
    print("CREATING MASTER GRID")
    print("="*60)

    # Configuration
    TARGET_CRS = 'EPSG:3857'  # Web Mercator
    CELL_SIZE_M = 5000  # 5km cells

    # Project both datasets
    print(f"\nProjecting to {TARGET_CRS}...")
    francine_proj = francine_gdf.to_crs(TARGET_CRS)
    helene_proj = helene_gdf.to_crs(TARGET_CRS)

    # Calculate combined extent
    f_bounds = francine_proj.total_bounds
    h_bounds = helene_proj.total_bounds

    minx = min(f_bounds[0], h_bounds[0])
    miny = min(f_bounds[1], h_bounds[1])
    maxx = max(f_bounds[2], h_bounds[2])
    maxy = max(f_bounds[3], h_bounds[3])

    # Add buffer
    width_buffer = (maxx - minx) * buffer_percent / 100
    height_buffer = (maxy - miny) * buffer_percent / 100

    minx -= width_buffer
    maxx += width_buffer
    miny -= height_buffer
    maxy += height_buffer

    # Calculate grid dimensions
    width = int(np.ceil((maxx - minx) / CELL_SIZE_M))
    height = int(np.ceil((maxy - miny) / CELL_SIZE_M))

    # Create transform
    transform = from_bounds(minx, miny, maxx, maxy, width, height)

    grid_params = {
        'crs': TARGET_CRS,
        'cell_size': CELL_SIZE_M,
        'width': width,
        'height': height,
        'bounds': (minx, miny, maxx, maxy),
        'transform': transform,
        'shape': (height, width)
    }

    print(f"\nGrid Configuration:")
    print(f"  Cell size: {CELL_SIZE_M:,} meters")
    print(f"  Grid dimensions: {width} x {height} cells")
    print(f"  Total cells: {width * height:,}")
    print(f"  Coverage area: {(width * height * CELL_SIZE_M * CELL_SIZE_M) / 1e6:,.0f} km²")

    return grid_params, francine_proj, helene_proj

# =============================================================================
# 2.2 HIERARCHICAL RASTERIZATION
# =============================================================================

def rasterize_by_scale(gdf_time_bin, grid_params, reference_gdfs):
    """Rasterize all geographic scales for a single time bin"""

    # Initialize grids for each scale
    state_grid = np.zeros(grid_params['shape'], dtype=np.float32)
    county_grid = np.zeros(grid_params['shape'], dtype=np.float32)
    city_grid = np.zeros(grid_params['shape'], dtype=np.float32)
    facility_grid = np.zeros(grid_params['shape'], dtype=np.float32)

    if len(gdf_time_bin) == 0:
        return state_grid, county_grid, city_grid, facility_grid

    # Project reference data to match grid CRS
    states_proj = reference_gdfs['states'].to_crs(grid_params['crs'])
    counties_proj = reference_gdfs['counties'].to_crs(grid_params['crs'])
    cities_proj = reference_gdfs['cities'].to_crs(grid_params['crs'])

    # STATES
    state_data = gdf_time_bin[gdf_time_bin['scale_level'] == 'STATE']
    if len(state_data) > 0:
        state_counts = state_data.groupby('matched_name')['weight'].sum()
        for state_name, count in state_counts.items():
            state_geom = states_proj[states_proj['NAME'].str.upper() == state_name]
            if not state_geom.empty:
                shapes = [(geom, count) for geom in state_geom.geometry]
                temp = rasterize(shapes, out_shape=grid_params['shape'],
                               transform=grid_params['transform'], fill=0, all_touched=True)
                state_grid += temp

    # COUNTIES
    county_data = gdf_time_bin[gdf_time_bin['scale_level'] == 'COUNTY']
    if len(county_data) > 0:
        county_counts = county_data.groupby('matched_name')['weight'].sum()
        for county_name, count in county_counts.items():
            county_geom = counties_proj[counties_proj['NAME'].str.upper() == county_name]
            if not county_geom.empty:
                shapes = [(geom, count) for geom in county_geom.geometry]
                temp = rasterize(shapes, out_shape=grid_params['shape'],
                               transform=grid_params['transform'], fill=0, all_touched=True)
                county_grid += temp

    # CITIES (with 5km buffers)
    city_data = gdf_time_bin[gdf_time_bin['scale_level'] == 'CITY']
    if len(city_data) > 0:
        city_counts = city_data.groupby('matched_name')['weight'].sum()
        for city_name, count in city_counts.items():
            city_geom = cities_proj[cities_proj['NAME'].str.upper() == city_name]
            if not city_geom.empty:
                # Create 5km buffer around city points
                buffered = city_geom.geometry.buffer(grid_params['cell_size'])
                shapes = [(geom, count) for geom in buffered]
                temp = rasterize(shapes, out_shape=grid_params['shape'],
                               transform=grid_params['transform'], fill=0, all_touched=True)
                city_grid += temp

    # FACILITIES (kernel density)
    facility_data = gdf_time_bin[gdf_time_bin['scale_level'] == 'FACILITY']
    if len(facility_data) > 0:
        sigma_pixels = 2  # 10km kernel (2 * 5km cells)
        for _, row in facility_data.iterrows():
            # Project facility point
            point_proj = row['matched_geom']
            # Convert to pixel coordinates
            px = int((point_proj.x - grid_params['bounds'][0]) / grid_params['cell_size'])
            py = int((grid_params['bounds'][3] - point_proj.y) / grid_params['cell_size'])

            if 0 <= px < grid_params['width'] and 0 <= py < grid_params['height']:
                # Create point source
                point_grid = np.zeros(grid_params['shape'], dtype=np.float32)
                point_grid[py, px] = row['weight']
                # Apply Gaussian kernel
                kernel = gaussian_filter(point_grid, sigma=sigma_pixels, mode='constant')
                facility_grid += kernel

    return state_grid, county_grid, city_grid, facility_grid

# =============================================================================
# 2.3 GRID INTEGRATION
# =============================================================================

def integrate_grids(state_grid, county_grid, city_grid, facility_grid, smooth=True):
    """Combine all scale grids into unified heat map"""

    # Simple additive model
    unified_grid = state_grid + county_grid + city_grid + facility_grid

    # Optional smoothing
    if smooth:
        unified_grid = gaussian_filter(unified_grid, sigma=1, mode='constant')

    return unified_grid

# =============================================================================
# EXECUTE GRID CREATION
# =============================================================================

# Create master grid
grid_params, francine_proj, helene_proj = create_master_grid(francine_gdf, helene_gdf)

# Store reference GeoDataFrames for rasterization
reference_gdfs = {
    'states': states_gdf,
    'counties': counties_gdf,
    'cities': cities_gdf
}

print("\n" + "="*60)
print("SPATIAL PROCESSING FRAMEWORK READY ✓")
print("="*60)


CREATING MASTER GRID

Projecting to EPSG:3857...

Grid Configuration:
  Cell size: 5,000 meters
  Grid dimensions: 808 x 527 cells
  Total cells: 425,816
  Coverage area: 10,645,400 km²

SPATIAL PROCESSING FRAMEWORK READY ✓


In [4]:
# =============================================================================
# STEP 3: TEMPORAL PROCESSING
# =============================================================================

import os
from datetime import datetime

# =============================================================================
# 3.1 TIME SERIES GENERATION
# =============================================================================

def process_hurricane_temporal(hurricane_name, gdf_proj, grid_params, reference_gdfs, output_dir):
    """Process a hurricane through all time bins to create temporal series"""

    print(f"\n" + "="*60)
    print(f"PROCESSING {hurricane_name.upper()} TEMPORAL SERIES")
    print(f"="*60)

    # Create output directory
    hurricane_dir = os.path.join(output_dir, hurricane_name.lower())
    os.makedirs(hurricane_dir, exist_ok=True)

    # Get unique time bins (sorted)
    time_bins = sorted(gdf_proj['unix_timestamp'].unique())
    print(f"\nTime bins to process: {len(time_bins)}")

    # Initialize cumulative grid
    cumulative_grid = np.zeros(grid_params['shape'], dtype=np.float32)

    # Store metadata for each time step
    metadata = []

    # Process each time bin
    for idx, time_bin in enumerate(time_bins):
        print(f"\nProcessing time bin {idx+1}/{len(time_bins)}: {datetime.fromtimestamp(time_bin/1000)}")

        # Filter data for current time bin
        gdf_time = gdf_proj[gdf_proj['unix_timestamp'] == time_bin]
        tweet_count = len(gdf_time[gdf_time['scale_level'] != 'UNMATCHED'])

        # Rasterize all scales
        state_grid, county_grid, city_grid, facility_grid = rasterize_by_scale(
            gdf_time, grid_params, reference_gdfs
        )

        # Create incremental grid (current time bin only)
        incremental_grid = integrate_grids(state_grid, county_grid, city_grid, facility_grid)

        # Update cumulative grid
        cumulative_grid += incremental_grid

        # Save rasters
        save_raster(incremental_grid, hurricane_dir,
                   f"{hurricane_name}_increment_{time_bin}.tif",
                   grid_params)
        save_raster(cumulative_grid, hurricane_dir,
                   f"{hurricane_name}_cumulative_{time_bin}.tif",
                   grid_params)

        # Collect metadata
        metadata.append({
            'time_bin': time_bin,
            'datetime': datetime.fromtimestamp(time_bin/1000),
            'tweet_count': tweet_count,
            'incremental_max': np.max(incremental_grid),
            'incremental_sum': np.sum(incremental_grid),
            'cumulative_max': np.max(cumulative_grid),
            'cumulative_sum': np.sum(cumulative_grid),
            'active_pixels': np.count_nonzero(incremental_grid)
        })

        print(f"  Tweets: {tweet_count} | Max value: {np.max(incremental_grid):.1f} | Active pixels: {np.count_nonzero(incremental_grid)}")

    # Save metadata
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(os.path.join(hurricane_dir, f"{hurricane_name}_metadata.csv"), index=False)

    print(f"\n{hurricane_name.upper()} processing complete!")
    print(f"Output saved to: {hurricane_dir}")

    return metadata_df

def save_raster(grid, output_dir, filename, grid_params):
    """Save a grid as GeoTIFF"""
    filepath = os.path.join(output_dir, filename)

    with rasterio.open(
        filepath, 'w',
        driver='GTiff',
        height=grid_params['height'],
        width=grid_params['width'],
        count=1,
        dtype=grid.dtype,
        crs=grid_params['crs'],
        transform=grid_params['transform'],
        compress='lzw'
    ) as dst:
        dst.write(grid, 1)

# =============================================================================
# 3.2 TEMPORAL INTERPOLATION (OPTIONAL)
# =============================================================================

def create_interpolated_frames(hurricane_dir, metadata_df, interpolation_steps=3):
    """Create interpolated frames between time bins for smoother animation"""

    print("\nCreating interpolated frames...")

    time_bins = sorted(metadata_df['time_bin'].values)

    for i in range(len(time_bins) - 1):
        current_bin = time_bins[i]
        next_bin = time_bins[i + 1]

        # Load current and next incremental grids
        current_file = os.path.join(hurricane_dir, f"*_increment_{current_bin}.tif")
        next_file = os.path.join(hurricane_dir, f"*_increment_{next_bin}.tif")

        # Skip actual interpolation for now (would need to load rasters)
        # This is a placeholder for the interpolation logic

    print("  Interpolation complete (optional step)")

# =============================================================================
# EXECUTE TEMPORAL PROCESSING
# =============================================================================

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Process Francine
francine_metadata = process_hurricane_temporal(
    'francine', francine_proj, grid_params, reference_gdfs, output_dir
)

# Process Helene
helene_metadata = process_hurricane_temporal(
    'helene', helene_proj, grid_params, reference_gdfs, output_dir
)

# Optional: Create interpolated frames (commented out for performance)
# create_interpolated_frames(os.path.join(output_dir, 'francine'), francine_metadata)
# create_interpolated_frames(os.path.join(output_dir, 'helene'), helene_metadata)

print("\n" + "="*60)
print("TEMPORAL PROCESSING COMPLETE ✓")
print("="*60)
print(f"\nOutputs saved to: {output_dir}")
print("  - Incremental rasters (per time bin)")
print("  - Cumulative rasters (running total)")
print("  - Metadata CSV files")


PROCESSING FRANCINE TEMPORAL SERIES

Time bins to process: 42

Processing time bin 1/42: 1970-01-20 17:24:28.800000


KeyError: 'scale_level'