# ARCGIS TWEET PROCESSOR - COMPLETE CONVERSION

## Overview
This notebook converts **ALL** functionality from `test.ipynb` to work entirely within an ArcGIS Pro/arcpy environment.

### What This Notebook Does:
1. Load tweets (GeoJSON), cities (CSV), states/counties (shapefiles)
2. Parse GPE field (comma-separated) and match place names **EXACTLY** (case-insensitive)
3. Count mentions by entity with **PRIORITY HIERARCHY**: State > County > City
4. Spatial cascade from tweet point locations
5. Create temporal bins (4-hour intervals)
6. Export incremental and cumulative shapefiles

### Matching Logic:
- **Exact matching only** - no fuzzy matching
- Matches state names (full or abbreviation), case-insensitive
- **Priority hierarchy**: If a string matches a state, it cannot also match a county/city
- Comma-separated strings are parsed individually

### Requirements:
- **ArcGIS Pro 3.x** with arcpy
- **No external dependencies** - pure Python + arcpy

---

## Cell 1: Imports and Environment Setup

In [41]:
"""
IMPORTS AND ENVIRONMENT SETUP
"""
import arcpy
import os
import re
from datetime import datetime
from collections import defaultdict

# Set overwrite output
arcpy.env.overwriteOutput = True

print("✓ Imports loaded")
print(f"ArcPy version: {arcpy.GetInstallInfo()['Version']}")
print(f"Working directory: {os.getcwd()}")
print("\nMatching method: EXACT (case-insensitive)")
print("Priority hierarchy: State > County > City")

✓ Imports loaded
ArcPy version: 3.5
Working directory: C:\WINDOWS\system32

Matching method: EXACT (case-insensitive)
Priority hierarchy: State > County > City


## Cell 2: Path Management Functions

In [42]:
"""
PATH MANAGEMENT FUNCTIONS
"""

import os

def get_project_root():
    """Get project root directory"""
    # Hardcode project path to avoid working directory issues
    return r"C:\users\colto\documents\github\tweet_project"


def get_data_file_path(*path_segments):
    """Build path to data files from project root"""
    project_root = get_project_root()
    return os.path.join(project_root, *path_segments)


print("✓ Path management functions loaded (hardcoded project path).")

✓ Path management functions loaded (hardcoded project path).


## Cell 3: Data Loading Functions (ArcPy)

In [43]:
"""
DATA LOADING FUNCTIONS
GeoJSON from local path, everything else from GDB
"""

import json

def load_tweets_geojson(workspace="in_memory"):
    """
    Load helene.geojson from LOCAL PATH by reading JSON and creating points
    Returns: Feature class path
    """
    print("Loading tweets from helene.geojson...")
    geojson_path = os.path.join(os.path.abspath(workspace), "data", "geojson", "helene.geojson")
    if not os.path.exists(geojson_path):
        raise FileNotFoundError(f"GeoJSON file not found: {geojson_path}")
    print(f"  Path: {geojson_path}")

    # Read GeoJSON file
    with open(geojson_path, 'r') as f:
        geojson_data = json.load(f)

    # Create feature class
    tweets_fc = os.path.join(workspace, "tweets_helene")

    # Create empty point feature class
    arcpy.management.CreateFeatureclass(
        os.path.dirname(tweets_fc) if workspace != "in_memory" else workspace,
        os.path.basename(tweets_fc),
        "POINT",
        spatial_reference=arcpy.SpatialReference(4326)
    )

    # Add fields for attributes
    arcpy.management.AddField(tweets_fc, "GPE", "TEXT", field_length=500)
    arcpy.management.AddField(tweets_fc, "FAC", "TEXT", field_length=500)
    arcpy.management.AddField(tweets_fc, "LOC", "TEXT", field_length=500)
    arcpy.management.AddField(tweets_fc, "time", "TEXT", field_length=50)
    arcpy.management.AddField(tweets_fc, "Latitude", "DOUBLE")
    arcpy.management.AddField(tweets_fc, "Longitude", "DOUBLE")

    # Insert features
    fields = ['SHAPE@XY', 'GPE', 'FAC', 'LOC', 'time', 'Latitude', 'Longitude']

    with arcpy.da.InsertCursor(tweets_fc, fields) as cursor:
        for feature in geojson_data['features']:
            props = feature['properties']
            coords = feature['geometry']['coordinates']

            # GeoJSON is [lon, lat]
            lon = coords[0]
            lat = coords[1]

            cursor.insertRow([
                (lon, lat),  # SHAPE@XY
                props.get('GPE', ''),
                props.get('FAC', ''),
                props.get('LOC', ''),
                props.get('time', ''),
                lat,
                lon
            ])

    count = int(arcpy.management.GetCount(tweets_fc).getOutput(0))
    print(f"  ✓ Loaded {count} tweet features")

    return tweets_fc


def load_cities_csv(workspace="in_memory"):
    """Load cities from GDB feature class"""
    print("Loading cities from GDB...")
    
    cities_fc = os.path.join(workspace, "us_cities")
    arcpy.management.CopyFeatures("cities1000", cities_fc)
    
    count = int(arcpy.management.GetCount(cities_fc).getOutput(0))
    print(f"  ✓ Loaded {count} city features")
    
    return cities_fc


def load_states_shapefile(workspace="in_memory"):
    """Load states from GDB feature class"""
    print("Loading states from GDB...")
    
    states_fc = os.path.join(workspace, "us_states")
    arcpy.management.CopyFeatures("cb_2023_us_state_20m", states_fc)
    
    count = int(arcpy.management.GetCount(states_fc).getOutput(0))
    print(f"  ✓ Loaded {count} state features")
    
    return states_fc


def load_counties_shapefile(workspace="in_memory"):
    """Load counties from GDB feature class"""
    print("Loading counties from GDB...")
    
    counties_fc = os.path.join(workspace, "us_counties")
    arcpy.management.CopyFeatures("cb_2023_us_county_20m", counties_fc)
    
    count = int(arcpy.management.GetCount(counties_fc).getOutput(0))
    print(f"  ✓ Loaded {count} county features")
    
    return counties_fc


print("✓ Data loading functions defined (GeoJSON from local, others from GDB).")

✓ Data loading functions defined (GeoJSON from local, others from GDB).


## Cell 4: Place Name Preprocessing and Parsing

In [44]:
"""
PLACE NAME PREPROCESSING AND PARSING
Exact matching only - case insensitive
"""

def normalize_place_name(name):
    """
    Normalize place names for EXACT matching (case-insensitive)
    Simply uppercase and strip whitespace
    """
    if not name or name == 'NAN' or name == '':
        return None
    
    name = str(name).upper().strip()
    
    return name if name else None


def parse_gpe_entities(gpe_string):
    """
    Split GPE field by commas into individual place mentions
    Returns list of normalized place names
    """
    if not gpe_string or str(gpe_string).strip() == '':
        return []
    
    gpe_string = str(gpe_string).strip()
    entities = []
    
    # Split by comma
    for part in gpe_string.split(','):
        part = part.strip()
        if not part:
            continue
        
        normalized = normalize_place_name(part)
        if normalized and len(normalized) > 0:
            entities.append(normalized)
    
    # Remove duplicates while preserving order
    seen, clean = set(), []
    for e in entities:
        if e not in seen:
            clean.append(e)
            seen.add(e)
    
    return clean


print("✓ Place name preprocessing functions defined.")

✓ Place name preprocessing functions defined.


## Cell 5: Create Lookup Dictionaries with Priority Hierarchy

In [45]:
"""
CREATE LOOKUP DICTIONARIES FROM FEATURE CLASSES
Priority: State > County > City
"""

def create_lookup_dictionaries(states_fc, counties_fc, cities_fc):
    """
    Build name->attributes lookup dictionaries from feature classes
    
    States: Indexed by full name (uppercase) AND abbreviation (uppercase)
    Counties: Indexed by name (uppercase)
    Cities: Indexed by name (uppercase)
    
    Returns: state_lookup, county_lookup, city_lookup
    """
    print("Building lookup dictionaries...")
    
    state_lookup = {}
    county_lookup = {}
    city_lookup = {}
    
    # Build states lookup (both full name and abbreviation)
    print("  Building state lookup...")
    with arcpy.da.SearchCursor(states_fc, ['NAME', 'STUSPS', 'STATEFP', 'SHAPE@']) as cursor:
        for row in cursor:
            state_name = normalize_place_name(row[0])
            state_abbrev = normalize_place_name(row[1])
            
            state_data = {
                'NAME': row[0],
                'STUSPS': row[1],
                'STATEFP': row[2],
                'geometry': row[3]
            }
            
            # Index by full name
            if state_name:
                state_lookup[state_name] = state_data
            
            # Index by abbreviation
            if state_abbrev:
                state_lookup[state_abbrev] = state_data
    
    # Build counties lookup
    print("  Building county lookup...")
    with arcpy.da.SearchCursor(counties_fc, ['NAME', 'GEOID', 'STATEFP', 'SHAPE@']) as cursor:
        for row in cursor:
            county_name = normalize_place_name(row[0])
            if county_name:
                county_lookup[county_name] = {
                    'NAME': row[0],
                    'GEOID': row[1],
                    'STATEFP': row[2],
                    'geometry': row[3]
                }
    
    # Build cities lookup
    print("  Building city lookup...")
    with arcpy.da.SearchCursor(cities_fc, ['name', 'geonameid', 'population', 'SHAPE@']) as cursor:
        for row in cursor:
            city_name = normalize_place_name(row[0])
            if city_name:
                city_lookup[city_name] = {
                    'name': row[0],
                    'geonameid': row[1],
                    'population': row[2],
                    'geometry': row[3]
                }
    
    print(f"  ✓ States: {len(state_lookup)} entries (names + abbreviations)")
    print(f"  ✓ Counties: {len(county_lookup)} entries")
    print(f"  ✓ Cities: {len(city_lookup)} entries")
    
    return state_lookup, county_lookup, city_lookup


print("✓ Lookup dictionary function defined.")

✓ Lookup dictionary function defined.


## Cell 6: Exact Matching with Priority Hierarchy

In [46]:
"""
EXACT MATCHING WITH PRIORITY HIERARCHY
Priority: State > County > City
"""

def match_entity_with_priority(entity, state_lookup, county_lookup, city_lookup):
    """
    Match entity to geographic level with priority hierarchy.
    
    Priority:
    1. State (full name or abbreviation)
    2. County (only if NOT a state)
    3. City (only if NOT a state or county)
    
    Returns: (match_data, match_type) where match_type is 'state', 'county', 'city', or None
    """
    # Priority 1: Check if it's a state
    if entity in state_lookup:
        return state_lookup[entity], 'state'
    
    # Priority 2: Check if it's a county (only if not a state)
    if entity in county_lookup:
        return county_lookup[entity], 'county'
    
    # Priority 3: Check if it's a city (only if not a state or county)
    if entity in city_lookup:
        return city_lookup[entity], 'city'
    
    # No match found
    return None, None


print("✓ Priority matching function defined.")

✓ Priority matching function defined.


## Cell 7: Count Mentions with Hierarchical Cascade and Temporal Binning

In [47]:
"""
COUNT MENTIONS WITH HIERARCHICAL CASCADE - TEMPORAL
This is the core logic with priority hierarchy and spatial cascade
"""

def count_mentions_with_cascade_temporal(tweets_fc, state_lookup, county_lookup, city_lookup,
                                         states_fc, counties_fc, cities_fc):
    """
    Count mentions by time bin WITH hierarchical cascade
    
    TEXT-BASED MATCHING (from GPE field):
    - Priority: State > County > City (exclusive)
    - Exact match only (case-insensitive)
    - Comma-separated parsing
    
    SPATIAL CASCADE (from tweet point):
    - Each tweet point finds its containing county → +1 to county
    - Each county cascades to its state → +1 to state
    - Each tweet point finds nearest city (within 50km) → +1 to city
    
    Returns: time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions,
             + detail dictionaries
    """
    print("\nCounting tweet mentions by time bin WITH HIERARCHICAL CASCADE...")
    
    # Get all tweets with their attributes
    tweets_data = []
    fields = ['SHAPE@', 'GPE', 'time']
    
    with arcpy.da.SearchCursor(tweets_fc, fields) as cursor:
        for row in cursor:
            tweets_data.append({
                'geometry': row[0],
                'GPE': row[1],
                'time': row[2]
            })
    
    print(f"  Processing {len(tweets_data)} tweets...")
    
    # Parse time and create 4-hour bins
    for tweet in tweets_data:
        time_str = str(tweet['time'])
        # Handle datetime parsing
        try:
            dt = datetime.fromisoformat(time_str.replace('+00:00', ''))
        except:
            try:
                dt = datetime.strptime(time_str.split('+')[0], '%Y-%m-%d %H:%M:%S')
            except:
                dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        
        # Floor to 4-hour bin
        hour_bin = (dt.hour // 4) * 4
        tweet['bin'] = dt.replace(hour=hour_bin, minute=0, second=0, microsecond=0)
        tweet['time_dt'] = dt
    
    # Get unique time bins
    time_bins = sorted(set([t['bin'] for t in tweets_data]))
    print(f"  Time bins: {len(time_bins)}")
    
    # Initialize dictionaries for each time bin
    temporal_state_mentions = {tb: {} for tb in time_bins}
    temporal_county_mentions = {tb: {} for tb in time_bins}
    temporal_city_mentions = {tb: {} for tb in time_bins}
    
    temporal_state_details = {tb: {} for tb in time_bins}
    temporal_county_details = {tb: {} for tb in time_bins}
    temporal_city_details = {tb: {} for tb in time_bins}
    
    # Create spatial index for counties
    print("  Creating county spatial index...")
    county_geoms = {}
    with arcpy.da.SearchCursor(counties_fc, ['GEOID', 'STATEFP', 'NAME', 'SHAPE@']) as cursor:
        for row in cursor:
            county_geoms[row[0]] = {
                'geoid': row[0],
                'statefp': row[1],
                'name': row[2],
                'geometry': row[3]
            }
    
    # Create spatial index for states
    print("  Creating state spatial index...")
    state_geoms = {}
    with arcpy.da.SearchCursor(states_fc, ['STUSPS', 'NAME', 'STATEFP', 'SHAPE@']) as cursor:
        for row in cursor:
            state_geoms[row[2]] = {  # Key by STATEFP
                'stusps': row[0],
                'name': row[1],
                'statefp': row[2],
                'geometry': row[3]
            }
    
    # Create spatial index for cities
    print("  Creating city spatial index...")
    city_geoms = {}
    with arcpy.da.SearchCursor(cities_fc, ['geonameid', 'name', 'SHAPE@']) as cursor:
        for row in cursor:
            city_geoms[row[0]] = {
                'geonameid': row[0],
                'name': row[1],
                'geometry': row[2]
            }
    
    # Process each tweet
    print("  Processing tweets with priority matching...")
    for idx, tweet in enumerate(tweets_data):
        if idx % 100 == 0:
            print(f"    Tweet {idx}/{len(tweets_data)}")
        
        time_bin = tweet['bin']
        entities = parse_gpe_entities(tweet['GPE'])
        original_gpe = str(tweet['GPE']) if tweet['GPE'] else ''
        tweet_time = str(tweet['time'])
        tweet_point = tweet['geometry']
        
        # === PART 1: COUNT MENTIONS (text-based with PRIORITY) ===
        for entity in entities:
            match_data, match_type = match_entity_with_priority(entity, state_lookup, county_lookup, city_lookup)
            
            if match_type == 'state':
                state_code = match_data['STUSPS']
                temporal_state_mentions[time_bin][state_code] = temporal_state_mentions[time_bin].get(state_code, 0) + 1
                
                if state_code not in temporal_state_details[time_bin]:
                    temporal_state_details[time_bin][state_code] = []
                temporal_state_details[time_bin][state_code].append({
                    'original_gpe': original_gpe,
                    'matched_entity': entity,
                    'time': tweet_time
                })
            
            elif match_type == 'county':
                county_id = match_data['GEOID']
                temporal_county_mentions[time_bin][county_id] = temporal_county_mentions[time_bin].get(county_id, 0) + 1
                
                if county_id not in temporal_county_details[time_bin]:
                    temporal_county_details[time_bin][county_id] = []
                temporal_county_details[time_bin][county_id].append({
                    'original_gpe': original_gpe,
                    'matched_entity': entity,
                    'time': tweet_time
                })
            
            elif match_type == 'city':
                city_id = match_data['geonameid']
                temporal_city_mentions[time_bin][city_id] = temporal_city_mentions[time_bin].get(city_id, 0) + 1
                
                if city_id not in temporal_city_details[time_bin]:
                    temporal_city_details[time_bin][city_id] = []
                temporal_city_details[time_bin][city_id].append({
                    'original_gpe': original_gpe,
                    'matched_entity': entity,
                    'time': tweet_time
                })
        
        # === PART 2: CASCADE FROM TWEET POINT (spatial-based) ===
        # Find containing county
        containing_county = None
        for county_id, county_data in county_geoms.items():
            if county_data['geometry'].contains(tweet_point):
                containing_county = county_data
                break
        
        if containing_county:
            county_geoid = containing_county['geoid']
            county_statefp = containing_county['statefp']
            county_name = containing_county['name']
            
            # CASCADE: Increment county count
            temporal_county_mentions[time_bin][county_geoid] = temporal_county_mentions[time_bin].get(county_geoid, 0) + 1
            
            if county_geoid not in temporal_county_details[time_bin]:
                temporal_county_details[time_bin][county_geoid] = []
            temporal_county_details[time_bin][county_geoid].append({
                'original_gpe': f'[CASCADE from point in {county_name}]',
                'matched_entity': f'{county_name} County',
                'time': tweet_time
            })
            
            # CASCADE: Find containing state
            if county_statefp in state_geoms:
                state_data = state_geoms[county_statefp]
                state_code = state_data['stusps']
                state_name = state_data['name']
                
                # CASCADE: Increment state count
                temporal_state_mentions[time_bin][state_code] = temporal_state_mentions[time_bin].get(state_code, 0) + 1
                
                if state_code not in temporal_state_details[time_bin]:
                    temporal_state_details[time_bin][state_code] = []
                temporal_state_details[time_bin][state_code].append({
                    'original_gpe': f'[CASCADE from point in {state_name}]',
                    'matched_entity': state_name,
                    'time': tweet_time
                })
        
        # CASCADE: Find nearest city (within 50km)
        # Buffer point by ~50km (0.45 degrees)
        tweet_buffer = tweet_point.buffer(0.45)
        
        nearest_city = None
        min_distance = float('inf')
        
        for city_id, city_data in city_geoms.items():
            if tweet_buffer.contains(city_data['geometry']):
                distance = tweet_point.distanceTo(city_data['geometry'])
                if distance < min_distance:
                    min_distance = distance
                    nearest_city = city_data
        
        if nearest_city:
            city_id = nearest_city['geonameid']
            city_name = nearest_city['name']
            
            # CASCADE: Increment city count
            temporal_city_mentions[time_bin][city_id] = temporal_city_mentions[time_bin].get(city_id, 0) + 1
            
            if city_id not in temporal_city_details[time_bin]:
                temporal_city_details[time_bin][city_id] = []
            temporal_city_details[time_bin][city_id].append({
                'original_gpe': '[CASCADE from nearby point]',
                'matched_entity': city_name,
                'time': tweet_time
            })
    
    print(f"\n  ✓ Found mentions across {len(time_bins)} time bins")
    
    return (time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions,
            temporal_state_details, temporal_county_details, temporal_city_details)


print("✓ Counting function defined.")

✓ Counting function defined.


## Cell 8: Export Helper Functions

In [48]:
"""
HELPER FUNCTIONS FOR EXPORTING INDIVIDUAL ENTITY TYPES
"""

def export_states_incremental(states_fc, mention_counts, mention_details, bin_label, output_shp):
    """Export incremental state counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POLYGON",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "state_name", "TEXT", field_length=100)
    arcpy.management.AddField(output_shp, "state_code", "TEXT", field_length=2)
    arcpy.management.AddField(output_shp, "tweet_cnt", "LONG")
    arcpy.management.AddField(output_shp, "smpl_gpe", "TEXT", field_length=254)
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'state_name', 'state_code', 'tweet_cnt', 'smpl_gpe', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(states_fc, ['NAME', 'STUSPS', 'SHAPE@']) as cursor:
            for row in cursor:
                state_code = row[1]
                if state_code in mention_counts:
                    sample_gpe = ' | '.join([d['original_gpe'][:100] for d in mention_details[state_code][:3]])
                    ins_cursor.insertRow([
                        row[2],
                        row[0],
                        state_code,
                        mention_counts[state_code],
                        sample_gpe[:254],
                        bin_label
                    ])


def export_states_cumulative(states_fc, cumulative_counts, bin_label, output_shp):
    """Export cumulative state counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POLYGON",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "state_name", "TEXT", field_length=100)
    arcpy.management.AddField(output_shp, "state_code", "TEXT", field_length=2)
    arcpy.management.AddField(output_shp, "cumul_cnt", "LONG")
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'state_name', 'state_code', 'cumul_cnt', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(states_fc, ['NAME', 'STUSPS', 'SHAPE@']) as cursor:
            for row in cursor:
                state_code = row[1]
                if state_code in cumulative_counts:
                    ins_cursor.insertRow([
                        row[2],
                        row[0],
                        state_code,
                        cumulative_counts[state_code],
                        bin_label
                    ])


def export_counties_incremental(counties_fc, mention_counts, mention_details, bin_label, output_shp):
    """Export incremental county counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POLYGON",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "cnty_name", "TEXT", field_length=100)
    arcpy.management.AddField(output_shp, "cnty_id", "TEXT", field_length=5)
    arcpy.management.AddField(output_shp, "state_fp", "TEXT", field_length=2)
    arcpy.management.AddField(output_shp, "tweet_cnt", "LONG")
    arcpy.management.AddField(output_shp, "smpl_gpe", "TEXT", field_length=254)
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'cnty_name', 'cnty_id', 'state_fp', 'tweet_cnt', 'smpl_gpe', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(counties_fc, ['NAME', 'GEOID', 'STATEFP', 'SHAPE@']) as cursor:
            for row in cursor:
                county_id = row[1]
                if county_id in mention_counts:
                    sample_gpe = ' | '.join([d['original_gpe'][:100] for d in mention_details[county_id][:3]])
                    ins_cursor.insertRow([
                        row[3],
                        row[0],
                        county_id,
                        row[2],
                        mention_counts[county_id],
                        sample_gpe[:254],
                        bin_label
                    ])


def export_counties_cumulative(counties_fc, cumulative_counts, bin_label, output_shp):
    """Export cumulative county counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POLYGON",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "cnty_name", "TEXT", field_length=100)
    arcpy.management.AddField(output_shp, "cnty_id", "TEXT", field_length=5)
    arcpy.management.AddField(output_shp, "state_fp", "TEXT", field_length=2)
    arcpy.management.AddField(output_shp, "cumul_cnt", "LONG")
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'cnty_name', 'cnty_id', 'state_fp', 'cumul_cnt', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(counties_fc, ['NAME', 'GEOID', 'STATEFP', 'SHAPE@']) as cursor:
            for row in cursor:
                county_id = row[1]
                if county_id in cumulative_counts:
                    ins_cursor.insertRow([
                        row[3],
                        row[0],
                        county_id,
                        row[2],
                        cumulative_counts[county_id],
                        bin_label
                    ])


def export_cities_incremental(cities_fc, mention_counts, mention_details, bin_label, output_shp):
    """Export incremental city counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POINT",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "city_name", "TEXT", field_length=200)
    arcpy.management.AddField(output_shp, "city_id", "LONG")
    arcpy.management.AddField(output_shp, "population", "LONG")
    arcpy.management.AddField(output_shp, "tweet_cnt", "LONG")
    arcpy.management.AddField(output_shp, "mtchd_ent", "TEXT", field_length=254)
    arcpy.management.AddField(output_shp, "orig_gpe", "TEXT", field_length=254)
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'city_name', 'city_id', 'population', 'tweet_cnt', 'mtchd_ent', 'orig_gpe', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(cities_fc, ['name', 'geonameid', 'population', 'SHAPE@']) as cursor:
            for row in cursor:
                city_id = row[1]
                if city_id in mention_counts:
                    matched_entities = '; '.join([d['matched_entity'] for d in mention_details[city_id]])
                    orig_gpe = ' | '.join([d['original_gpe'] for d in mention_details[city_id]])
                    
                    ins_cursor.insertRow([
                        row[3],
                        row[0],
                        city_id,
                        row[2],
                        mention_counts[city_id],
                        matched_entities[:254],
                        orig_gpe[:254],
                        bin_label
                    ])


def export_cities_cumulative(cities_fc, cumulative_counts, bin_label, output_shp):
    """Export cumulative city counts to shapefile"""
    arcpy.management.CreateFeatureclass(
        os.path.dirname(output_shp),
        os.path.basename(output_shp),
        "POINT",
        spatial_reference=arcpy.SpatialReference(4326)
    )
    
    arcpy.management.AddField(output_shp, "city_name", "TEXT", field_length=200)
    arcpy.management.AddField(output_shp, "city_id", "LONG")
    arcpy.management.AddField(output_shp, "population", "LONG")
    arcpy.management.AddField(output_shp, "cumul_cnt", "LONG")
    arcpy.management.AddField(output_shp, "time_bin", "TEXT", field_length=50)
    
    with arcpy.da.InsertCursor(output_shp, ['SHAPE@', 'city_name', 'city_id', 'population', 'cumul_cnt', 'time_bin']) as ins_cursor:
        with arcpy.da.SearchCursor(cities_fc, ['name', 'geonameid', 'population', 'SHAPE@']) as cursor:
            for row in cursor:
                city_id = row[1]
                if city_id in cumulative_counts:
                    ins_cursor.insertRow([
                        row[3],
                        row[0],
                        city_id,
                        row[2],
                        cumulative_counts[city_id],
                        bin_label
                    ])


print("✓ Export helper functions defined.")

✓ Export helper functions defined.


## Cell 9: Main Export Function

In [49]:
"""
EXPORT TEMPORAL DATA TO SHAPEFILES
"""

def export_temporal_to_shapefiles(time_bins, temporal_state_mentions, temporal_county_mentions,
                                  temporal_city_mentions, temporal_state_details, temporal_county_details,
                                  temporal_city_details, states_fc, counties_fc, cities_fc,
                                  output_dir='arcgis_outputs'):
    """
    Export temporal (4-hour binned) data for states, counties, and cities.
    Creates BOTH incremental and cumulative count shapefiles
    """
    
    temporal_dir = os.path.join(output_dir, 'temporal_4hour_bins')
    incremental_dir = os.path.join(temporal_dir, 'incremental')
    cumulative_dir = os.path.join(temporal_dir, 'cumulative')
    
    os.makedirs(incremental_dir, exist_ok=True)
    os.makedirs(cumulative_dir, exist_ok=True)
    
    print(f"\n{'='*60}")
    print("EXPORTING TEMPORAL DATA - INCREMENTAL & CUMULATIVE")
    print("="*60)
    print(f"\nTime bins: {len(time_bins)}")
    print(f"Output directory: {temporal_dir}")
    
    cumulative_state_counts = {}
    cumulative_county_counts = {}
    cumulative_city_counts = {}
    
    incremental_bin_files = {'states': [], 'counties': [], 'cities': []}
    cumulative_bin_files = {'states': [], 'counties': [], 'cities': []}
    
    for idx, bin_time in enumerate(time_bins):
        bin_str = bin_time.strftime('%Y%m%d_%H%M')
        bin_label = bin_time.strftime('%Y-%m-%d %H:%M:%S')
        
        print(f"\n  Processing time bin {idx+1}/{len(time_bins)}: {bin_label}")
        
        # Update cumulative counts
        for state_code, count in temporal_state_mentions[bin_time].items():
            cumulative_state_counts[state_code] = cumulative_state_counts.get(state_code, 0) + count
        
        for county_id, count in temporal_county_mentions[bin_time].items():
            cumulative_county_counts[county_id] = cumulative_county_counts.get(county_id, 0) + count
        
        for city_id, count in temporal_city_mentions[bin_time].items():
            cumulative_city_counts[city_id] = cumulative_city_counts.get(city_id, 0) + count
        
        # STATES
        if temporal_state_mentions[bin_time]:
            states_inc_shp = os.path.join(incremental_dir, f'states_inc_{bin_str}.shp')
            export_states_incremental(states_fc, temporal_state_mentions[bin_time],
                                     temporal_state_details[bin_time], bin_label, states_inc_shp)
            incremental_bin_files['states'].append(states_inc_shp)
            print(f"    States incremental: {int(arcpy.management.GetCount(states_inc_shp).getOutput(0))} features")
        
        states_cum_shp = os.path.join(cumulative_dir, f'states_cum_{bin_str}.shp')
        export_states_cumulative(states_fc, cumulative_state_counts, bin_label, states_cum_shp)
        cumulative_bin_files['states'].append(states_cum_shp)
        print(f"    States cumulative: {int(arcpy.management.GetCount(states_cum_shp).getOutput(0))} features")
        
        # COUNTIES
        if temporal_county_mentions[bin_time]:
            counties_inc_shp = os.path.join(incremental_dir, f'counties_inc_{bin_str}.shp')
            export_counties_incremental(counties_fc, temporal_county_mentions[bin_time],
                                       temporal_county_details[bin_time], bin_label, counties_inc_shp)
            incremental_bin_files['counties'].append(counties_inc_shp)
            print(f"    Counties incremental: {int(arcpy.management.GetCount(counties_inc_shp).getOutput(0))} features")
        
        counties_cum_shp = os.path.join(cumulative_dir, f'counties_cum_{bin_str}.shp')
        export_counties_cumulative(counties_fc, cumulative_county_counts, bin_label, counties_cum_shp)
        cumulative_bin_files['counties'].append(counties_cum_shp)
        print(f"    Counties cumulative: {int(arcpy.management.GetCount(counties_cum_shp).getOutput(0))} features")
        
        # CITIES
        if temporal_city_mentions[bin_time]:
            cities_inc_shp = os.path.join(incremental_dir, f'cities_inc_{bin_str}.shp')
            export_cities_incremental(cities_fc, temporal_city_mentions[bin_time],
                                     temporal_city_details[bin_time], bin_label, cities_inc_shp)
            incremental_bin_files['cities'].append(cities_inc_shp)
            print(f"    Cities incremental: {int(arcpy.management.GetCount(cities_inc_shp).getOutput(0))} features")
        
        cities_cum_shp = os.path.join(cumulative_dir, f'cities_cum_{bin_str}.shp')
        export_cities_cumulative(cities_fc, cumulative_city_counts, bin_label, cities_cum_shp)
        cumulative_bin_files['cities'].append(cities_cum_shp)
        print(f"    Cities cumulative: {int(arcpy.management.GetCount(cities_cum_shp).getOutput(0))} features")
    
    # Merge files
    print(f"\n  Creating master files by merging shapefiles...")
    
    if incremental_bin_files['states']:
        arcpy.management.Merge(incremental_bin_files['states'], os.path.join(incremental_dir, 'states_INCREMENTAL_ALL.shp'))
        print(f"    ✓ States incremental master")
    
    if cumulative_bin_files['states']:
        arcpy.management.Merge(cumulative_bin_files['states'], os.path.join(cumulative_dir, 'states_CUMULATIVE_ALL.shp'))
        print(f"    ✓ States cumulative master")
    
    if incremental_bin_files['counties']:
        arcpy.management.Merge(incremental_bin_files['counties'], os.path.join(incremental_dir, 'counties_INCREMENTAL_ALL.shp'))
        print(f"    ✓ Counties incremental master")
    
    if cumulative_bin_files['counties']:
        arcpy.management.Merge(cumulative_bin_files['counties'], os.path.join(cumulative_dir, 'counties_CUMULATIVE_ALL.shp'))
        print(f"    ✓ Counties cumulative master")
    
    if incremental_bin_files['cities']:
        arcpy.management.Merge(incremental_bin_files['cities'], os.path.join(incremental_dir, 'cities_INCREMENTAL_ALL.shp'))
        print(f"    ✓ Cities incremental master")
    
    if cumulative_bin_files['cities']:
        arcpy.management.Merge(cumulative_bin_files['cities'], os.path.join(cumulative_dir, 'cities_CUMULATIVE_ALL.shp'))
        print(f"    ✓ Cities cumulative master")
    
    print(f"\n{'='*60}")
    print("TEMPORAL EXPORT COMPLETE!")
    print("="*60)
    print(f"\nFiles saved to: {os.path.abspath(temporal_dir)}")
    print(f"\nTo use in ArcGIS Pro:")
    print(f"  1. Add *_INCREMENTAL_ALL.shp or *_CUMULATIVE_ALL.shp")
    print(f"  2. Enable time using 'time_bin' field")
    print(f"  3. Set time step to 4 hours")
    print(f"  4. Animate!")


print("✓ Main export function defined.")

✓ Main export function defined.


## Cell 10: MAIN EXECUTION - Run All Steps

In [50]:
"""
MAIN EXECUTION - RUNS ENTIRE WORKFLOW
"""

print("="*60)
print("ARCGIS TWEET PROCESSOR - EXACT MATCHING WITH PRIORITY")
print("="*60)
print()

workspace = "in_memory"

# STEP 1: Load all data
print("STEP 1: Loading Data")
print("-" * 60)
tweets_fc = load_tweets_geojson(workspace)
cities_fc = load_cities_csv(workspace)
states_fc = load_states_shapefile(workspace)
counties_fc = load_counties_shapefile(workspace)
print()

# STEP 2: Create lookup dictionaries
print("STEP 2: Creating Lookup Dictionaries")
print("-" * 60)
state_lookup, county_lookup, city_lookup = create_lookup_dictionaries(
    states_fc, counties_fc, cities_fc
)
print()

# STEP 3: Count mentions with temporal binning and cascade
print("STEP 3: Counting Mentions with Priority Hierarchy and Cascade")
print("-" * 60)
(time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions,
 temporal_state_details, temporal_county_details, temporal_city_details) = \
    count_mentions_with_cascade_temporal(
        tweets_fc, state_lookup, county_lookup, city_lookup,
        states_fc, counties_fc, cities_fc
    )
print()

# STEP 4: Export temporal data
print("STEP 4: Exporting Temporal Data")
print("-" * 60)
export_temporal_to_shapefiles(
    time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions,
    temporal_state_details, temporal_county_details, temporal_city_details,
    states_fc, counties_fc, cities_fc
)
print()

print("="*60)
print("PROCESSING COMPLETE!")
print("="*60)
print(f"\nTime range: {time_bins[0].strftime('%Y-%m-%d %H:%M:%S')} to {time_bins[-1].strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total time bins: {len(time_bins)}")
print(f"\nAll outputs saved to: arcgis_outputs/temporal_4hour_bins/")

ARCGIS TWEET PROCESSOR - EXACT MATCHING WITH PRIORITY

STEP 1: Loading Data
------------------------------------------------------------
Loading tweets from helene.geojson...


<class 'FileNotFoundError'>: GeoJSON file not found: C:\WINDOWS\system32\in_memory\data\geojson\helene.geojson