In [87]:
import geopandas as gpd
import pandas as pd
from datetime import datetime, timezone

# Load GeoJSON files
francine_path = r"C:\Users\colto\Documents\GitHub\Tweet_project\data\geojson\francine.geojson"
helene_path = r"C:\Users\colto\Documents\GitHub\Tweet_project\data\geojson\helene.geojson"

# Load into GeoDataFrames
francine_gdf = gpd.read_file(francine_path)
helene_gdf = gpd.read_file(helene_path)

# Standardize timestamps to UTC
francine_gdf['timestamp'] = pd.to_datetime(francine_gdf['time'], utc=True)
# print(francine_gdf['time'])
helene_gdf['timestamp'] = pd.to_datetime(helene_gdf['time'], utc=True)


# Floor to 4-hour bins
francine_gdf['time_bin'] = francine_gdf['timestamp'].dt.floor('4h')
helene_gdf['time_bin'] = helene_gdf['timestamp'].dt.floor('4h')
all_data = francine_gdf['time_bin'].unique()
francine_gdf['unix_timestamp'] = francine_gdf['time_bin'].astype('int64') // 1000
helene_gdf['unix_timestamp'] = helene_gdf['time_bin'].astype('int64') // 1000
# Create readable bin labels for file naming
francine_gdf['bin_label'] = francine_gdf['time_bin'].dt.strftime('%Y%m%d_%H%M')
helene_gdf['bin_label'] = helene_gdf['time_bin'].dt.strftime('%Y%m%d_%H%M')
# Display summary
print("FRANCINE Dataset:")
print(f"  Total tweets: {len(francine_gdf)}")
print(f"  Time range: {francine_gdf['time_bin'].min()} to {francine_gdf['time_bin'].max()}")
print(f"  Number of 4-hour bins: {francine_gdf['time_bin'].nunique()}")
print(f"\nHELENE Dataset:")
print(f"  Total tweets: {len(helene_gdf)}")
print(f"  Time range: {helene_gdf['time_bin'].min()} to {helene_gdf['time_bin'].max()}")
print(f"  Number of 4-hour bins: {helene_gdf['time_bin'].nunique()}")

FRANCINE Dataset:
  Total tweets: 2303
  Time range: 2024-09-09 08:00:00+00:00 to 2024-09-16 12:00:00+00:00
  Number of 4-hour bins: 42

HELENE Dataset:
  Total tweets: 3007
  Time range: 2024-09-26 00:00:00+00:00 to 2024-09-27 16:00:00+00:00
  Number of 4-hour bins: 11


In [88]:
# Load reference shapefiles
states_path = r"C:\Users\colto\Documents\GitHub\Tweet_project\data\shape_files\cb_2023_us_state_20m.shp"
counties_path = r"C:\Users\colto\Documents\GitHub\Tweet_project\data\shape_files\cb_2023_us_county_20m.shp"
cities_path = r"C:\Users\colto\Documents\GitHub\Tweet_project\data\tables\cities1000.csv"

# Load spatial reference data
states_gdf = gpd.read_file(states_path)
counties_gdf = gpd.read_file(counties_path)
cities_df = pd.read_csv(cities_path)

# Create lookup dictionaries for matching
# States: NAME field maps to geometry
state_lookup = dict(zip(states_gdf['NAME'].str.upper(), states_gdf.geometry))

# Counties: NAME field maps to geometry
county_lookup = dict(zip(counties_gdf['NAME'].str.upper(), counties_gdf.geometry))

# Cities: name field with coordinates for point creation
cities_lookup = dict(zip(cities_df['name'].str.upper(),
                        gpd.points_from_xy(cities_df['longitude'], cities_df['latitude'])))

# Function to assign scale level and matched geometry
def assign_scale_level(row):
    """Determine geographic scale and match to geometry"""
    gpe = str(row.get('GPE', '')).upper().strip()
    fac = str(row.get('FAC', '')).upper().strip()

    # Priority: State > County > City > Facility
    if gpe in state_lookup:
        return 'STATE', gpe, state_lookup[gpe]
    elif gpe in county_lookup:
        return 'COUNTY', gpe, county_lookup[gpe]
    elif gpe in cities_lookup:
        return 'CITY', gpe, cities_lookup[gpe]
    elif fac and fac != 'NAN':
        return 'FACILITY', fac, row.geometry  # Use tweet's geocoded point
    else:
        return 'UNMATCHED', None, row.geometry

# Apply to both datasets
francine_gdf[['scale_level', 'matched_name', 'matched_geom']] = francine_gdf.apply(
    assign_scale_level, axis=1, result_type='expand')

helene_gdf[['scale_level', 'matched_name', 'matched_geom']] = helene_gdf.apply(
    assign_scale_level, axis=1, result_type='expand')

# Display scale distribution
print("FRANCINE Scale Distribution:")
print(francine_gdf['scale_level'].value_counts())
print(f"\nHELENE Scale Distribution:")
print(helene_gdf['scale_level'].value_counts())

  cities_df = pd.read_csv(cities_path)


FRANCINE Scale Distribution:
scale_level
STATE        1270
UNMATCHED     698
CITY          276
COUNTY         32
FACILITY       27
Name: count, dtype: int64

HELENE Scale Distribution:
scale_level
STATE        1673
UNMATCHED     962
CITY          248
FACILITY       76
COUNTY         48
Name: count, dtype: int64


In [89]:
# Group tweets by 4-hour intervals and scale level
# Using unix_timestamp for unambiguous temporal grouping

# Alternative approach:
francine_interval_counts = francine_gdf.groupby(['unix_timestamp', 'scale_level', 'matched_name']).agg({
    'matched_geom': 'first'
}).reset_index()

# Add count column separately
count_series = francine_gdf.groupby(['unix_timestamp', 'scale_level', 'matched_name']).size()
francine_interval_counts['count'] = count_series.values

# Same for Helene
helene_interval_counts = helene_gdf.groupby(['unix_timestamp', 'scale_level', 'matched_name']).agg({
    'matched_geom': 'first'
}).reset_index()
count_series = helene_gdf.groupby(['unix_timestamp', 'scale_level', 'matched_name']).size()
helene_interval_counts['count'] = count_series.values

# Sort by timestamp to ensure chronological order
francine_interval_counts = francine_interval_counts.sort_values('unix_timestamp')
helene_interval_counts = helene_interval_counts.sort_values('unix_timestamp')

# Calculate cumulative counts
francine_interval_counts['cumulative_count'] = francine_interval_counts.groupby(['scale_level', 'matched_name'])['count'].cumsum()
helene_interval_counts['cumulative_count'] = helene_interval_counts.groupby(['scale_level', 'matched_name'])['count'].cumsum()

# Get unique time bins for iteration
francine_time_bins = sorted(francine_gdf['unix_timestamp'].unique())
helene_time_bins = sorted(helene_gdf['unix_timestamp'].unique())

# Display summary statistics
print("FRANCINE Time Binning Summary:")
print(f"  Total time bins: {len(francine_time_bins)}")
print(f"  Total location-time combinations: {len(francine_interval_counts)}")
print(f"\nSample interval counts:")
print(francine_interval_counts.head(10))

print(f"\nHELENE Time Binning Summary:")
print(f"  Total time bins: {len(helene_time_bins)}")
print(f"  Total location-time combinations: {len(helene_interval_counts)}")
print(f"\nSample interval counts:")
print(helene_interval_counts.head(10))

FRANCINE Time Binning Summary:
  Total time bins: 42
  Total location-time combinations: 250

Sample interval counts:
   unix_timestamp scale_level matched_name  \
0      1725868800       STATE    LOUISIANA   
1      1725868800       STATE        TEXAS   
2      1725883200      COUNTY       DALLAS   
3      1725883200       STATE     ARKANSAS   
4      1725883200       STATE    LOUISIANA   
5      1725883200       STATE        TEXAS   
6      1725897600    FACILITY         I-10   
7      1725897600       STATE    LOUISIANA   
8      1725897600       STATE  MISSISSIPPI   
9      1725912000      COUNTY    LAFAYETTE   

                                        matched_geom  count  cumulative_count  
0  POLYGON ((-94.0430515276176 32.6930299766656, ...      1                 1  
1  POLYGON ((-106.623445 31.914034, -106.630114 3...      1                 1  
2  POLYGON ((-97.036295 32.693227, -97.035996 32....      1                 1  
3  POLYGON ((-94.617919 36.499414, -94.361203 36....   

In [90]:
import numpy as np
import rasterio
from rasterio.transform import from_bounds

# ==============================================================================
# STEP 1: DEFINE MASTER GRID CANVAS
# ==============================================================================

# Configuration
TARGET_CRS = 'EPSG:3857'  # Web Mercator
CELL_SIZE_M = 5000  # 5 km in meters

print("=" * 60)
print("STEP 1: CREATING MASTER GRID CANVAS")
print("=" * 60)

# Project both datasets to target CRS
print(f"\nProjecting datasets to {TARGET_CRS}...")
francine_proj = francine_gdf.to_crs(TARGET_CRS)
helene_proj = helene_gdf.to_crs(TARGET_CRS)

# Also project reference geometries
print("Projecting reference geometries...")
states_proj = states_gdf.to_crs(TARGET_CRS)
counties_proj = counties_gdf.to_crs(TARGET_CRS)

# Calculate combined extent from both hurricanes
print("\nCalculating master extent...")
francine_bounds = francine_proj.total_bounds
helene_bounds = helene_proj.total_bounds

# Get union of both bounding boxes
minx = min(francine_bounds[0], helene_bounds[0])
miny = min(francine_bounds[1], helene_bounds[1])
maxx = max(francine_bounds[2], helene_bounds[2])
maxy = max(francine_bounds[3], helene_bounds[3])

print(f"  Master bounds (EPSG:3857):")
print(f"    minx: {minx:,.2f}")
print(f"    miny: {miny:,.2f}")
print(f"    maxx: {maxx:,.2f}")
print(f"    maxy: {maxy:,.2f}")

# Calculate grid dimensions
width = int(np.ceil((maxx - minx) / CELL_SIZE_M))
height = int(np.ceil((maxy - miny) / CELL_SIZE_M))

print(f"\nGrid Configuration:")
print(f"  Cell size: {CELL_SIZE_M:,} meters ({CELL_SIZE_M/1000} km)")
print(f"  Grid dimensions: {width} x {height} cells")
print(f"  Total cells: {width * height:,}")

# Create master transform
master_transform = from_bounds(minx, miny, maxx, maxy, width, height)

print(f"\nMaster Transform:")
print(f"  {master_transform}")

# Calculate actual coverage area
area_km2 = (width * height * CELL_SIZE_M * CELL_SIZE_M) / 1_000_000
print(f"\nCoverage area: {area_km2:,.2f} km²")

# Store grid parameters for later use
grid_params = {
    'crs': TARGET_CRS,
    'cell_size': CELL_SIZE_M,
    'width': width,
    'height': height,
    'bounds': (minx, miny, maxx, maxy),
    'transform': master_transform
}

print(f"\n{'=' * 60}")
print("MASTER GRID CANVAS READY ✓")
print(f"{'=' * 60}")

# Update lookup dictionaries with projected geometries
print("\nUpdating geometry lookups with projected coordinates...")
state_lookup_proj = dict(zip(states_proj['NAME'].str.upper(), states_proj.geometry))
county_lookup_proj = dict(zip(counties_proj['NAME'].str.upper(), counties_proj.geometry))

print("Lookup dictionaries updated with projected geometries ✓")

STEP 1: CREATING MASTER GRID CANVAS

Projecting datasets to EPSG:3857...
Projecting reference geometries...

Calculating master extent...
  Master bounds (EPSG:3857):
    minx: -11,854,083.11
    miny: 2,947,395.71
    maxx: -8,490,833.94
    maxy: 5,142,357.36

Grid Configuration:
  Cell size: 5,000 meters (5.0 km)
  Grid dimensions: 673 x 439 cells
  Total cells: 295,447

Master Transform:
  | 4997.40, 0.00,-11854083.11|
| 0.00,-4999.91, 5142357.36|
| 0.00, 0.00, 1.00|

Coverage area: 7,386,175.00 km²

MASTER GRID CANVAS READY ✓

Updating geometry lookups with projected coordinates...
Lookup dictionaries updated with projected geometries ✓


In [91]:
import os
from scipy.ndimage import gaussian_filter
from rasterio.features import rasterize
from rasterio.features import geometry_mask
# ==============================================================================
# STEP 2: MAIN RASTERIZATION LOOP - TIME ITERATION
# ==============================================================================

# Create output directories
output_dir = r"C:\Users\colto\Documents\GitHub\Tweet_project\rasters_output"
os.makedirs(output_dir, exist_ok=True)

def process_hurricane(hurricane_name, gdf_proj, interval_counts, time_bins):
    """
    Process a single hurricane through all time bins
    """
    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {hurricane_name.upper()}")
    print(f"{'=' * 60}")
    # Create hurricane-specific output directory
    hurricane_dir = os.path.join(output_dir, hurricane_name.lower())
    os.makedirs(hurricane_dir, exist_ok=True)

    # Initialize cumulative grid (persists across time bins)
    cumulative_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

    # Loop through each time bin chronologically
    for idx, time_bin in enumerate(time_bins):
        print(f"\n--- Time Bin {idx+1}/{len(time_bins)}: {time_bin} ---")

        # Filter data for current time bin
        current_data = interval_counts[interval_counts['unix_timestamp'] == time_bin]
        tweet_count = len(current_data)
        print(f"  Tweets in this bin: {tweet_count}")

        # Initialize incremental grid for this time bin
        incremental_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

        # === PLACEHOLDER FUNCTIONS ===

        # 1. Create State Raster
        state_raster = create_state_raster(current_data, grid_params)
        incremental_grid += state_raster

        # 2. Create County Raster
        county_raster = create_county_raster(current_data, grid_params)
        incremental_grid += county_raster

        # 3. Create City/Point Raster
        city_raster = create_city_raster(current_data, grid_params)
        incremental_grid += city_raster

        # 4. Create Facility/KDE Raster
        facility_raster = create_facility_raster(current_data, grid_params)
        incremental_grid += facility_raster

        # === END PLACEHOLDERS ===

        # Update cumulative grid
        cumulative_grid += incremental_grid

        # Save rasters
        save_raster(incremental_grid, hurricane_dir, hurricane_name, time_bin, 'increment')
        save_raster(cumulative_grid, hurricane_dir, hurricane_name, time_bin, 'cumulative')

        print(f"  Incremental max value: {np.max(incremental_grid):.2f}")
        print(f"  Cumulative max value: {np.max(cumulative_grid):.2f}")

    print(f"\n{hurricane_name.upper()} processing complete!")
    print(f"Output saved to: {hurricane_dir}")
    return

# ==============================================================================
# PLACEHOLDER FUNCTIONS (TO BE IMPLEMENTED)
# ==============================================================================

def create_state_raster(data, grid_params):
    """Rasterize state-level tweets"""
    print("    [STATE] Creating state raster...")

    # Initialize empty raster
    state_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

    # Filter for STATE-level tweets only
    state_data = data[data['scale_level'] == 'STATE']

    if len(state_data) == 0:
        print("      No state-level tweets in this time bin")
        return state_grid

    # Group by state name and sum counts
    state_counts = state_data.groupby('matched_name')['count'].sum()

    print(f"      Processing {len(state_counts)} unique states")

    # Process each state
    for state_name, tweet_count in state_counts.items():
        if state_name in state_lookup_proj:
            # Get the state geometry
            state_geom = state_lookup_proj[state_name]

            # Rasterize the polygon
            # Create a list of (geometry, value) tuples
            shapes = [(state_geom, 1)]

            # Rasterize to temporary grid
            temp_grid = rasterio.features.rasterize(
                shapes=shapes,
                out_shape=(grid_params['height'], grid_params['width']),
                transform=grid_params['transform'],
                fill=0,
                dtype=np.float32,
                all_touched=True  # Include all pixels touched by polygon
            )

            # Multiply by tweet count and add to state grid
            state_grid += temp_grid * tweet_count

            print(f"      - {state_name}: {tweet_count} tweets, {np.sum(temp_grid)} pixels")
        else:
            print(f"      WARNING: State '{state_name}' not found in lookup")

    total_value = np.sum(state_grid)
    max_value = np.max(state_grid)
    print(f"      Total state grid value: {total_value:.0f}, Max pixel: {max_value:.0f}")

    return state_grid

def create_county_raster(data, grid_params):
    """Rasterize county-level tweets with hotspot multiplier"""
    print("    [COUNTY] Creating county raster...")

    # Initialize empty raster
    county_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

    # Filter for COUNTY-level tweets only
    county_data = data[data['scale_level'] == 'COUNTY']

    if len(county_data) == 0:
        print("      No county-level tweets in this time bin")
        return county_grid

    # Group by county name and sum counts
    county_counts = county_data.groupby('matched_name')['count'].sum()

    print(f"      Processing {len(county_counts)} unique counties")

    # HOTSPOT MULTIPLIER for counties
    county_multiplier = 3  # Make counties 3x more prominent

    # Process each county
    for county_name, tweet_count in county_counts.items():
        if county_name in county_lookup_proj:
            # Get the county geometry
            county_geom = county_lookup_proj[county_name]

            # Rasterize the polygon
            shapes = [(county_geom, 1)]

            # Rasterize to temporary grid
            temp_grid = rasterize(
                shapes=shapes,
                out_shape=(grid_params['height'], grid_params['width']),
                transform=grid_params['transform'],
                fill=0,
                dtype=np.float32,
                all_touched=True  # Include all pixels touched by polygon
            )

            # Multiply by tweet count AND multiplier for hotspot effect
            county_grid += temp_grid * tweet_count * county_multiplier

            print(f"      - {county_name}: {tweet_count} tweets × {county_multiplier} = {tweet_count * county_multiplier}, {np.sum(temp_grid)} pixels")
        else:
            print(f"      WARNING: County '{county_name}' not found in lookup")

    total_value = np.sum(county_grid)
    max_value = np.max(county_grid)
    print(f"      Total county grid value: {total_value:.0f}, Max pixel: {max_value:.0f}")

    return county_grid

def create_city_raster(data, grid_params):
    """Rasterize city-level tweets with smaller buffer and hotspot multiplier"""
    print("    [CITY] Creating city raster...")

    # Initialize empty raster
    city_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

    # Filter for CITY-level tweets only
    city_data = data[data['scale_level'] == 'CITY']

    if len(city_data) == 0:
        print("      No city-level tweets in this time bin")
        return city_grid

    # Group by city name and sum counts
    city_counts = city_data.groupby('matched_name')['count'].sum()

    print(f"      Processing {len(city_counts)} unique cities")

    # HOTSPOT PARAMETERS for cities
    buffer_distance = 2500  # Reduced from 5000m to 2.5km for tighter hotspots
    city_multiplier = 5     # Make cities 5x more prominent

    # Process each city
    for city_name, tweet_count in city_counts.items():
        if city_name in cities_lookup:
            # Get the city point geometry
            city_point = cities_lookup[city_name]

            # Need to project the point to match our grid CRS
            # Create a temporary GeoSeries to handle projection
            city_geoseries = gpd.GeoSeries([city_point], crs='EPSG:4326')
            city_point_proj = city_geoseries.to_crs(grid_params['crs']).iloc[0]

            # Create buffer around the point (2.5km)
            city_buffer = city_point_proj.buffer(buffer_distance)

            # Rasterize the buffered polygon
            shapes = [(city_buffer, 1)]

            # Rasterize to temporary grid
            temp_grid = rasterize(
                shapes=shapes,
                out_shape=(grid_params['height'], grid_params['width']),
                transform=grid_params['transform'],
                fill=0,
                dtype=np.float32,
                all_touched=True  # Include all pixels touched by buffer
            )

            # Multiply by tweet count AND multiplier for hotspot effect
            city_grid += temp_grid * tweet_count * city_multiplier

            print(f"      - {city_name}: {tweet_count} tweets × {city_multiplier} = {tweet_count * city_multiplier}, {np.sum(temp_grid)} pixels")
        else:
            print(f"      WARNING: City '{city_name}' not found in lookup")

    total_value = np.sum(city_grid)
    max_value = np.max(city_grid)
    print(f"      Total city grid value: {total_value:.0f}, Max pixel: {max_value:.0f}")

    return city_grid

def create_facility_raster(data, grid_params):
    """Create KDE raster for facility points with strong hotspot multiplier"""
    print("    [FACILITY] Creating facility raster...")

    # Initialize empty raster
    facility_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)

    # Filter for FACILITY-level tweets only
    facility_data = data[data['scale_level'] == 'FACILITY']

    if len(facility_data) == 0:
        print("      No facility-level tweets in this time bin")
        return facility_grid

    # Group by facility coordinates (using matched_name as proxy) and sum counts
    facility_counts = facility_data.groupby('matched_name')['count'].sum()

    print(f"      Processing {len(facility_counts)} unique facilities")

    # HOTSPOT PARAMETERS for facilities
    sigma_meters = 2 * grid_params['cell_size']  # 10 km for 5km cells
    sigma_pixels = sigma_meters / grid_params['cell_size']  # Convert to pixel units
    facility_multiplier = 10  # Make facilities 10x more prominent (strongest hotspots)

    # Process each facility
    facilities_processed = 0
    for facility_name, tweet_count in facility_counts.items():
        # Get facility data to extract geometry
        facility_rows = facility_data[facility_data['matched_name'] == facility_name]

        if len(facility_rows) > 0:
            # Get the point geometry (should be from the tweet's geocoded location)
            facility_point = facility_rows.iloc[0]['matched_geom']

            # Project point to grid CRS if needed
            if hasattr(facility_point, 'x') and hasattr(facility_point, 'y'):
                # Create GeoSeries to handle projection
                point_geoseries = gpd.GeoSeries([facility_point], crs='EPSG:4326')
                point_proj = point_geoseries.to_crs(grid_params['crs']).iloc[0]

                # Convert point coordinates to pixel indices
                px = (point_proj.x - grid_params['bounds'][0]) / grid_params['cell_size']
                py = (grid_params['bounds'][3] - point_proj.y) / grid_params['cell_size']

                # Check if point is within grid bounds
                if 0 <= px < grid_params['width'] and 0 <= py < grid_params['height']:
                    # Create point raster with tweet count at location
                    point_grid = np.zeros((grid_params['height'], grid_params['width']), dtype=np.float32)
                    point_grid[int(py), int(px)] = tweet_count

                    # Apply Gaussian filter to create kernel density
                    kernel_grid = gaussian_filter(point_grid, sigma=sigma_pixels, mode='constant', cval=0)

                    # FIXED: Only add once with proper multiplier
                    facility_grid += kernel_grid * facility_multiplier

                    facilities_processed += 1
                    effective_value = tweet_count * facility_multiplier
                    print(f"      - {facility_name}: {tweet_count} tweets × {facility_multiplier} = {effective_value}, KDE at ({point_proj.x:.0f}, {point_proj.y:.0f})")
                else:
                    print(f"      WARNING: Facility '{facility_name}' outside grid bounds")
            else:
                print(f"      WARNING: Invalid geometry for facility '{facility_name}'")

    print(f"      Processed {facilities_processed} facilities with sigma={sigma_pixels:.2f} pixels")

    total_value = np.sum(facility_grid)
    max_value = np.max(facility_grid)
    print(f"      Total facility grid value: {total_value:.2f}, Max pixel: {max_value:.2f}")

    return facility_grid

def save_raster(grid, output_dir, hurricane_name, time_bin, raster_type):
    """Save raster as GeoTIFF"""
    # Convert unix timestamp to readable format for filename
    print('binny', time_bin)
    # time_str = pd.Timestamp(time_bin, unit='ns').strftime('%Y%m%d_%H%M')
    filename = f"{hurricane_name}_{raster_type}_{time_bin}.tif"
    filepath = os.path.join(output_dir, filename)

    with rasterio.open(
        filepath, 'w',
        driver='GTiff',
        height=grid_params['height'],
        width=grid_params['width'],
        count=1,
        dtype=grid.dtype,
        crs=grid_params['crs'],
        transform=grid_params['transform'],
        compress='lzw'
    ) as dst:
        dst.write(grid, 1)

    print(f"    Saved: {filename}")

# ==============================================================================
# EXECUTE PROCESSING FOR BOTH HURRICANES
# ==============================================================================

print("\n" + "=" * 60)
print("STARTING RASTERIZATION PROCESS")
print("=" * 60)

# Process Francine
# process_hurricane('francine', francine_proj, francine_interval_counts, francine_time_bins)

# Process Helene
process_hurricane('helene', helene_proj, helene_interval_counts, helene_time_bins)

print("\n" + "=" * 60)
print("ALL PROCESSING COMPLETE! ✓")
print("=" * 60)


STARTING RASTERIZATION PROCESS

PROCESSING: HELENE

--- Time Bin 1/11: 1727308800 ---
  Tweets in this bin: 11
    [STATE] Creating state raster...
      Processing 2 unique states
      - FLORIDA: 44 tweets, 8114.0 pixels
      - GEORGIA: 1 tweets, 8879.0 pixels
      Total state grid value: 365895, Max pixel: 45
    [COUNTY] Creating county raster...
      Processing 2 unique counties
      - CHARLOTTE: 1 tweets × 3 = 3, 110.0 pixels
      - RALEIGH: 1 tweets × 3 = 3, 130.0 pixels
      Total county grid value: 720, Max pixel: 3
    [CITY] Creating city raster...
      Processing 5 unique cities
      - BLOWING ROCK: 1 tweets × 5 = 5, 3.0 pixels
      - LAWRENCEVILLE: 1 tweets × 5 = 5, 0.0 pixels
      - OCALA: 1 tweets × 5 = 5, 3.0 pixels
      - ORLANDO: 1 tweets × 5 = 5, 4.0 pixels
      - TALLAHASSEE: 1 tweets × 5 = 5, 3.0 pixels
      Total city grid value: 65, Max pixel: 5
    [FACILITY] Creating facility raster...
      Processing 2 unique facilities
      - PORT CANAVERAL, P

In [92]:
# ==============================================================================
# STEP 5: POST-PROCESSING & ASSEMBLY
# ==============================================================================
import glob
# from osgeo import gdal
from rasterio.vrt import WarpedVRT
def create_metadata_index(hurricane_name, hurricane_dir):
    """Create CSV index of all rasters with metadata"""
    print(f"\nCreating metadata index for {hurricane_name}...")

    # Get all increment and cumulative TIFFs
    increment_files = sorted(glob.glob(os.path.join(hurricane_dir, f"{hurricane_name}_increment_*.tif")))
    cumulative_files = sorted(glob.glob(os.path.join(hurricane_dir, f"{hurricane_name}_cumulative_*.tif")))

    metadata_rows = []

    for tif_path in increment_files + cumulative_files:
        filename = os.path.basename(tif_path)

        # Extract time and type from filename
        parts = filename.replace('.tif', '').split('_')
        raster_type = parts[-2]  # 'increment' or 'cumulative'
        time_str = parts[-1]     # e.g., '20240910_0000'

        # Open raster to get stats
        with rasterio.open(tif_path) as src:
            data = src.read(1)

            metadata_rows.append({
                'filename': filename,
                'type': raster_type,
                'time_str': time_str,
                'min_value': np.min(data),
                'max_value': np.max(data),
                'mean_value': np.mean(data),
                'total_value': np.sum(data),
                'non_zero_pixels': np.count_nonzero(data)
            })

    # Create DataFrame and save
    metadata_df = pd.DataFrame(metadata_rows)
    index_path = os.path.join(hurricane_dir, f"{hurricane_name}_index.csv")
    metadata_df.to_csv(index_path, index=False)

    print(f"  Index saved: {index_path}")
    print(f"  Total rasters cataloged: {len(metadata_rows)}")

    return metadata_df


def create_vrt_stacks(hurricane_name, hurricane_dir):
    """Create VRT files using rasterio (no GDAL needed)"""
    print(f"\nCreating VRT stacks for {hurricane_name}...")

    # Simply skip VRT creation or create a text-based reference file
    increment_files = sorted(glob.glob(os.path.join(hurricane_dir, f"{hurricane_name}_increment_*.tif")))

    # Create a simple text list file instead
    list_file = os.path.join(hurricane_dir, f"{hurricane_name}_increment_files.txt")
    with open(list_file, 'w') as f:
        for file in increment_files:
            f.write(file + '\n')

    print(f"  Created file list: {hurricane_name}_increment_files.txt")
    print(f"  Import these files directly in ArcGIS Pro")

def print_summary_stats(hurricane_name, hurricane_dir):
    """Print summary statistics for the hurricane dataset"""
    print(f"\n{'='*60}")
    print(f"SUMMARY: {hurricane_name.upper()}")
    print(f"{'='*60}")

    increment_files = glob.glob(os.path.join(hurricane_dir, f"{hurricane_name}_increment_*.tif"))
    cumulative_files = glob.glob(os.path.join(hurricane_dir, f"{hurricane_name}_cumulative_*.tif"))

    print(f"  Total time slices: {len(increment_files)}")
    print(f"  Increment rasters: {len(increment_files)}")
    print(f"  Cumulative rasters: {len(cumulative_files)}")
    print(f"  Output directory: {hurricane_dir}")

    # Get final cumulative stats
    if cumulative_files:
        final_cumulative = sorted(cumulative_files)[-1]
        with rasterio.open(final_cumulative) as src:
            final_data = src.read(1)
            print(f"\n  Final Cumulative Statistics:")
            print(f"    Total value: {np.sum(final_data):,.0f}")
            print(f"    Max pixel value: {np.max(final_data):,.2f}")
            print(f"    Active pixels: {np.count_nonzero(final_data):,}")

# ==============================================================================
# RUN POST-PROCESSING
# ==============================================================================

print("\n" + "="*60)
print("STEP 5: POST-PROCESSING & ASSEMBLY")
print("="*60)

# Process Francine
francine_dir = os.path.join(output_dir, 'francine')
if os.path.exists(francine_dir):
    francine_metadata = create_metadata_index('francine', francine_dir)
    create_vrt_stacks('francine', francine_dir)
    print_summary_stats('francine', francine_dir)

# Process Helene
helene_dir = os.path.join(output_dir, 'helene')
if os.path.exists(helene_dir):
    helene_metadata = create_metadata_index('helene', helene_dir)
    create_vrt_stacks('helene', helene_dir)
    print_summary_stats('helene', helene_dir)

print("\n" + "="*60)
print("POST-PROCESSING COMPLETE! ✓")
print("="*60)
print("\nNext Steps:")
print("1. Open ArcGIS Pro")
print("2. Add Multidimensional Raster Layer")
print("3. Point to the output folders")
print("4. Enable time slider for animation")


STEP 5: POST-PROCESSING & ASSEMBLY

Creating metadata index for francine...
  Index saved: C:\Users\colto\Documents\GitHub\Tweet_project\rasters_output\francine\francine_index.csv
  Total rasters cataloged: 88

Creating VRT stacks for francine...
  Created file list: francine_increment_files.txt
  Import these files directly in ArcGIS Pro

SUMMARY: FRANCINE
  Total time slices: 44
  Increment rasters: 44
  Cumulative rasters: 44
  Output directory: C:\Users\colto\Documents\GitHub\Tweet_project\rasters_output\francine

  Final Cumulative Statistics:
    Total value: 0
    Max pixel value: 0.00
    Active pixels: 0

Creating metadata index for helene...
  Index saved: C:\Users\colto\Documents\GitHub\Tweet_project\rasters_output\helene\helene_index.csv
  Total rasters cataloged: 22

Creating VRT stacks for helene...
  Created file list: helene_increment_files.txt
  Import these files directly in ArcGIS Pro

SUMMARY: HELENE
  Total time slices: 11
  Increment rasters: 11
  Cumulative rast