# Tweet Processing with ArcPy - CORRECT METHODOLOGY

**Key Understanding**: The GeoJSON coordinates are DERIVED from GPE text, not original tweet locations!

**Methodology**:
1. Extract GPE text field from GeoJSON (ignore geometry - it's just geocoded GPE)
2. Parse GPE text and fuzzy match to reference geographies
3. Count mentions and merge to reference geography polygons
4. Export with temporal binning

**No spatial operations needed** - this is pure text matching!

In [95]:
# ==============================================================================
# SETUP
# ==============================================================================

import arcpy
import os
import re
import json
from datetime import datetime
from collections import defaultdict

project_root = os.getcwd()
gdb_path = os.path.join(project_root, 'data', 'tw_project.gdb')

if not arcpy.Exists(gdb_path):
    arcpy.management.CreateFileGDB(os.path.dirname(gdb_path), os.path.basename(gdb_path))

arcpy.env.workspace = gdb_path
arcpy.env.overwriteOutput = True

print(f"Workspace: {gdb_path}")
print(f"ArcGIS Pro: {arcpy.GetInstallInfo()['Version']}")

Workspace: C:\Users\colto\Documents\tw_project\tw_project\data\tw_project.gdb
ArcGIS Pro: 3.5


In [96]:
# ==============================================================================
# LOAD TWEETS FROM GEOJSON (Extract GPE text only)
# ==============================================================================

print("Loading tweets from GeoJSON...\n")

helene_geojson = os.path.join(project_root, 'data', 'geojson', 'helene.geojson')

# Load GeoJSON and extract just GPE and time fields
tweets_data = []
with open(helene_geojson, 'r') as f:
    geojson = json.load(f)
    
for feature in geojson['features']:
    props = feature['properties']
    tweets_data.append({
        'GPE': props.get('GPE', ''),
        'time': props.get('time', ''),
        'FAC': props.get('FAC', ''),
        'LOC': props.get('LOC', '')
    })

print(f"Loaded {len(tweets_data)} tweets")
print(f"\nSample tweets:")
for i in range(min(5, len(tweets_data))):
    print(f"  {i+1}. GPE: '{tweets_data[i]['GPE']}' | Time: {tweets_data[i]['time']}")

print(f"\n✓ Tweets loaded (GPE text only, geometry ignored)")

Loading tweets from GeoJSON...

Loaded 3007 tweets

Sample tweets:
  1. GPE: 'Florida' | Time: 2024-09-26 22:59:53+00:00
  2. GPE: 'Florida' | Time: 2024-09-26 22:59:49+00:00
  3. GPE: 'Florida' | Time: 2024-09-26 22:59:47+00:00
  4. GPE: 'Tallahassee, Tampa' | Time: 2024-09-26 22:59:43+00:00
  5. GPE: 'Florida' | Time: 2024-09-26 22:59:37+00:00

✓ Tweets loaded (GPE text only, geometry ignored)


In [97]:
# ==============================================================================
# TEXT PARSING FUNCTIONS
# ==============================================================================

def preprocess_place_name(name):
    """Standardize place names for matching"""
    if not name or str(name).strip() == '' or str(name).upper() == 'NAN':
        return None
    name = str(name).upper().strip()
    name = re.sub(r'\bST\.?\b', 'SAINT', name)
    name = re.sub(r'\bMT\.?\b', 'MOUNT', name)
    name = re.sub(r'\bFT\.?\b', 'FORT', name)
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

def parse_gpe_entities(gpe_string):
    """Split GPE field into individual place mentions"""
    if not gpe_string or str(gpe_string).strip() == '':
        return []
    gpe_string = str(gpe_string).strip()
    entities = []
    for part in [p.strip() for p in gpe_string.split(',')]:
        if not part:
            continue
        for sub in re.split(r'[;&|]', part):
            sub = preprocess_place_name(sub)
            if sub and len(sub) > 1:
                entities.append(sub)
    # Remove duplicates
    seen, clean = set(), []
    for e in entities:
        if e not in seen:
            clean.append(e)
            seen.add(e)
    return clean

def simple_fuzzy_match(query, target):
    """Simple fuzzy matching"""
    query = query.upper()
    target = target.upper()
    
    if query == target:
        return 1.0
    if query in target or target in query:
        return 0.9
    
    q_set = set(query)
    t_set = set(target)
    intersection = len(q_set & t_set)
    union = len(q_set | t_set)
    
    return intersection / union if union > 0 else 0

def match_entity(entity, lookup_dict, threshold=0.85):
    """Match entity to lookup dictionary"""
    if entity in lookup_dict:
        return lookup_dict[entity], 1.0
    
    best_match = None
    best_score = 0
    
    for key in lookup_dict.keys():
        score = simple_fuzzy_match(entity, key)
        if score > best_score and score >= threshold:
            best_score = score
            best_match = key
    
    if best_match:
        return lookup_dict[best_match], best_score
    
    return None, 0

print("✓ Text parsing functions loaded")

✓ Text parsing functions loaded


In [98]:
# ==============================================================================
# LOAD REFERENCE GEOGRAPHY
# ==============================================================================

print("Loading reference geographies...\n")

# States
states_shp = os.path.join(project_root, 'data', 'shape_files', 'cb_2023_us_state_20m.shp')
states_fc = os.path.join(gdb_path, 'us_states')
if arcpy.Exists(states_fc):
    arcpy.management.Delete(states_fc)
arcpy.conversion.FeatureClassToFeatureClass(states_shp, gdb_path, 'us_states')
print(f"States: {arcpy.management.GetCount(states_fc)[0]}")

# Counties  
counties_shp = os.path.join(project_root, 'data', 'shape_files', 'cb_2023_us_county_20m.shp')
counties_fc = os.path.join(gdb_path, 'us_counties')
if arcpy.Exists(counties_fc):
    arcpy.management.Delete(counties_fc)
arcpy.conversion.FeatureClassToFeatureClass(counties_shp, gdb_path, 'us_counties')
print(f"Counties: {arcpy.management.GetCount(counties_fc)[0]}")

# Cities
cities_csv = os.path.join(project_root, 'data', 'tables', 'cities1000.csv')
cities_temp = os.path.join(gdb_path, 'us_cities_temp')
cities_fc = os.path.join(gdb_path, 'us_cities')

if arcpy.Exists(cities_temp):
    arcpy.management.Delete(cities_temp)
if arcpy.Exists(cities_fc):
    arcpy.management.Delete(cities_fc)

arcpy.management.XYTableToPoint(cities_csv, cities_temp, 'longitude', 'latitude',
                                coordinate_system=arcpy.SpatialReference(4326))
arcpy.analysis.Select(cities_temp, cities_fc,
                     "country_code = 'US' AND feature_class = 'P' AND population IS NOT NULL")
arcpy.management.Delete(cities_temp)
print(f"Cities: {arcpy.management.GetCount(cities_fc)[0]}")

print("\n✓ Reference geography loaded")

Loading reference geographies...

States: 52
Counties: 3222
Cities: 17244

✓ Reference geography loaded


In [99]:
# ==============================================================================
# BUILD LOOKUP DICTIONARIES
# ==============================================================================

print("Building lookup dictionaries...\n")

state_lookup = {}  # name -> STUSPS
with arcpy.da.SearchCursor(states_fc, ['NAME', 'STUSPS']) as cursor:
    for row in cursor:
        name_clean = preprocess_place_name(row[0])
        if name_clean:
            state_lookup[name_clean] = row[1]
        # Also add abbreviation
        state_lookup[row[1]] = row[1]

county_lookup = {}  # name -> GEOID
with arcpy.da.SearchCursor(counties_fc, ['NAME', 'GEOID']) as cursor:
    for row in cursor:
        name_clean = preprocess_place_name(row[0])
        if name_clean:
            county_lookup[name_clean] = row[1]

city_lookup = {}  # name -> geonameid
with arcpy.da.SearchCursor(cities_fc, ['name', 'geonameid']) as cursor:
    for row in cursor:
        name_clean = preprocess_place_name(row[0])
        if name_clean:
            city_lookup[name_clean] = row[1]

print(f"States: {len(state_lookup)} entries")
print(f"Counties: {len(county_lookup)} entries")
print(f"Cities: {len(city_lookup)} entries")
print("\n✓ Lookup dictionaries created")

Building lookup dictionaries...

States: 104 entries
Counties: 1915 entries
Cities: 12256 entries

✓ Lookup dictionaries created


In [100]:
# ==============================================================================
# TIME BINNING
# ==============================================================================

print("Creating time bins...\n")

from datetime import datetime, timedelta
import pandas as pd

# Parse times and create bins
for tweet in tweets_data:
    if tweet['time']:
        dt = pd.to_datetime(tweet['time'])
        # Floor to 4-hour boundary
        bin_time = dt.floor('4h')
        tweet['bin'] = bin_time
        tweet['bin_str'] = bin_time.strftime('%Y-%m-%d %H:%M:%S')
    else:
        tweet['bin'] = None
        tweet['bin_str'] = None

# Get unique bins
time_bins = sorted(list(set([t['bin'] for t in tweets_data if t['bin'] is not None])))

print(f"Found {len(time_bins)} time bins")
if len(time_bins) > 0:
    print(f"Range: {time_bins[0]} to {time_bins[-1]}")
    print(f"\nTime bins:")
    for tb in time_bins:
        print(f"  {tb.strftime('%Y-%m-%d %H:%M:%S')}")

print("\n✓ Time bins created")

Creating time bins...

Found 11 time bins
Range: 2024-09-26 00:00:00+00:00 to 2024-09-27 16:00:00+00:00

Time bins:
  2024-09-26 00:00:00
  2024-09-26 04:00:00
  2024-09-26 08:00:00
  2024-09-26 12:00:00
  2024-09-26 16:00:00
  2024-09-26 20:00:00
  2024-09-27 00:00:00
  2024-09-27 04:00:00
  2024-09-27 08:00:00
  2024-09-27 12:00:00
  2024-09-27 16:00:00

✓ Time bins created


In [101]:
# ==============================================================================
# COUNT MENTIONS BY TIME BIN (Pure text matching - NO spatial operations)
# ==============================================================================

print("\n" + "="*70)
print("COUNTING TEXT MENTIONS")
print("="*70 + "\n")

# Initialize temporal tracking
temporal_state_data = {tb: defaultdict(lambda: {'count': 0, 'samples': []}) for tb in time_bins}
temporal_county_data = {tb: defaultdict(lambda: {'count': 0, 'samples': []}) for tb in time_bins}
temporal_city_data = {tb: defaultdict(lambda: {'count': 0, 'samples': []}) for tb in time_bins}

processed = 0
total_tweets = len(tweets_data)

for tweet in tweets_data:
    processed += 1
    if processed % 100 == 0:
        print(f"  Processing {processed}/{total_tweets}...")
    
    time_bin = tweet['bin']
    if not time_bin:
        continue
    
    gpe_text = tweet['GPE']
    entities = parse_gpe_entities(gpe_text)
    
    for entity in entities:
        # Try state match
        state_match, score = match_entity(entity, state_lookup, 0.90)
        if state_match:
            temporal_state_data[time_bin][state_match]['count'] += 1
            temporal_state_data[time_bin][state_match]['samples'].append({
                'entity': entity,
                'gpe': gpe_text[:100] if gpe_text else ''
            })
            continue
        
        # Try county match
        county_match, score = match_entity(entity, county_lookup, 0.85)
        if county_match:
            temporal_county_data[time_bin][county_match]['count'] += 1
            temporal_county_data[time_bin][county_match]['samples'].append({
                'entity': entity,
                'gpe': gpe_text[:100] if gpe_text else ''
            })
            continue
        
        # Try city match
        city_match, score = match_entity(entity, city_lookup, 0.85)
        if city_match:
            temporal_city_data[time_bin][city_match]['count'] += 1
            temporal_city_data[time_bin][city_match]['samples'].append({
                'entity': entity,
                'gpe': gpe_text[:100] if gpe_text else ''
            })

print(f"\n✓ Processed all {processed} tweets")
print("\n" + "="*70)
print("COUNTING COMPLETE")
print("="*70)


COUNTING TEXT MENTIONS

  Processing 100/3007...
  Processing 200/3007...
  Processing 300/3007...
  Processing 400/3007...
  Processing 500/3007...
  Processing 600/3007...
  Processing 700/3007...
  Processing 800/3007...
  Processing 900/3007...
  Processing 1000/3007...
  Processing 1100/3007...
  Processing 1200/3007...
  Processing 1300/3007...
  Processing 1400/3007...
  Processing 1500/3007...
  Processing 1600/3007...
  Processing 1700/3007...
  Processing 1800/3007...
  Processing 1900/3007...
  Processing 2000/3007...
  Processing 2100/3007...
  Processing 2200/3007...
  Processing 2300/3007...
  Processing 2400/3007...
  Processing 2500/3007...
  Processing 2600/3007...
  Processing 2700/3007...
  Processing 2800/3007...
  Processing 2900/3007...
  Processing 3000/3007...

✓ Processed all 3007 tweets

COUNTING COMPLETE


In [102]:
# ==============================================================================
# VALIDATION: Display counts
# ==============================================================================

print("\n" + "="*70)
print("VALIDATION")
print("="*70 + "\n")

# Total counts
total_state_counts = defaultdict(int)
for tb in time_bins:
    for state_code, data in temporal_state_data[tb].items():
        total_state_counts[state_code] += data['count']

print(f"States mentioned: {len(total_state_counts)}")
print(f"\nTop 10 states:")
top_states = sorted(total_state_counts.items(), key=lambda x: x[1], reverse=True)[:10]
for state_code, count in top_states:
    print(f"  {state_code}: {count}")

print(f"\nExpected (from test.ipynb):")
print(f"  FL: 2,156")
print(f"  GA: 369")
print(f"  NC: 69")


VALIDATION

States mentioned: 35

Top 10 states:
  FL: 2166
  GA: 377
  LA: 277
  PA: 137
  OR: 115
  MA: 82
  NC: 70
  IN: 65
  CO: 59
  SC: 55

Expected (from test.ipynb):
  FL: 2,156
  GA: 369
  NC: 69


In [103]:
# ==============================================================================
# EXPORT: Create temporal feature classes
# ==============================================================================

print("\n" + "="*70)
print("EXPORTING TEMPORAL FEATURE CLASSES")
print("="*70 + "\n")

def export_temporal(geography_fc, geography_name, id_field, temporal_data_dict):
    print(f"\nExporting {geography_name}...")
    
    incremental_fcs = []
    cumulative_fcs = []
    cumulative_counts = {}
    
    for idx, time_bin in enumerate(time_bins):
        bin_str = time_bin.strftime('%Y%m%d_%H%M')
        bin_label = time_bin.strftime('%Y-%m-%d %H:%M:%S')
        
        print(f"  Bin {idx+1}/{len(time_bins)}: {bin_label}")
        
        # INCREMENTAL
        inc_name = f"{geography_name}_inc_{bin_str}"
        inc_fc = os.path.join(gdb_path, inc_name)
        arcpy.management.Copy(geography_fc, inc_fc)
        
        arcpy.management.AddField(inc_fc, 'tweet_cnt', 'LONG')
        arcpy.management.AddField(inc_fc, 'smpl_ment', 'TEXT', field_length=254)
        arcpy.management.AddField(inc_fc, 'time_bin_str', 'TEXT', field_length=50)
        
        with arcpy.da.UpdateCursor(inc_fc, [id_field, 'tweet_cnt', 'smpl_ment', 'time_bin_str']) as cursor:
            for row in cursor:
                entity_id = row[0]
                if entity_id in temporal_data_dict[time_bin]:
                    data = temporal_data_dict[time_bin][entity_id]
                    row[1] = data['count']
                    samples = data['samples'][:3]
                    row[2] = '; '.join([s['entity'][:50] for s in samples])
                else:
                    row[1] = 0
                    row[2] = ''
                row[3] = bin_label
                cursor.updateRow(row)
        
        incremental_fcs.append(inc_fc)
        
        # Update cumulative
        for entity_id, data in temporal_data_dict[time_bin].items():
            cumulative_counts[entity_id] = cumulative_counts.get(entity_id, 0) + data['count']
        
        # CUMULATIVE
        cum_name = f"{geography_name}_cum_{bin_str}"
        cum_fc = os.path.join(gdb_path, cum_name)
        arcpy.management.Copy(geography_fc, cum_fc)
        
        arcpy.management.AddField(cum_fc, 'cumul_cnt', 'LONG')
        arcpy.management.AddField(cum_fc, 'time_bin_str', 'TEXT', field_length=50)
        
        with arcpy.da.UpdateCursor(cum_fc, [id_field, 'cumul_cnt', 'time_bin_str']) as cursor:
            for row in cursor:
                entity_id = row[0]
                row[1] = cumulative_counts.get(entity_id, 0)
                row[2] = bin_label
                cursor.updateRow(row)
        
        cumulative_fcs.append(cum_fc)
    
    return incremental_fcs, cumulative_fcs

states_inc, states_cum = export_temporal(states_fc, 'states', 'STUSPS', temporal_state_data)
counties_inc, counties_cum = export_temporal(counties_fc, 'counties', 'GEOID', temporal_county_data)
cities_inc, cities_cum = export_temporal(cities_fc, 'cities', 'geonameid', temporal_city_data)

print("\n✓ Temporal feature classes created")


EXPORTING TEMPORAL FEATURE CLASSES


Exporting states...
  Bin 1/11: 2024-09-26 00:00:00
  Bin 2/11: 2024-09-26 04:00:00
  Bin 3/11: 2024-09-26 08:00:00
  Bin 4/11: 2024-09-26 12:00:00
  Bin 5/11: 2024-09-26 16:00:00
  Bin 6/11: 2024-09-26 20:00:00
  Bin 7/11: 2024-09-27 00:00:00
  Bin 8/11: 2024-09-27 04:00:00
  Bin 9/11: 2024-09-27 08:00:00
  Bin 10/11: 2024-09-27 12:00:00
  Bin 11/11: 2024-09-27 16:00:00

Exporting counties...
  Bin 1/11: 2024-09-26 00:00:00
  Bin 2/11: 2024-09-26 04:00:00
  Bin 3/11: 2024-09-26 08:00:00
  Bin 4/11: 2024-09-26 12:00:00
  Bin 5/11: 2024-09-26 16:00:00
  Bin 6/11: 2024-09-26 20:00:00
  Bin 7/11: 2024-09-27 00:00:00
  Bin 8/11: 2024-09-27 04:00:00
  Bin 9/11: 2024-09-27 08:00:00
  Bin 10/11: 2024-09-27 12:00:00
  Bin 11/11: 2024-09-27 16:00:00

Exporting cities...
  Bin 1/11: 2024-09-26 00:00:00
  Bin 2/11: 2024-09-26 04:00:00
  Bin 3/11: 2024-09-26 08:00:00
  Bin 4/11: 2024-09-26 12:00:00
  Bin 5/11: 2024-09-26 16:00:00
  Bin 6/11: 2024-09-26 20:00:0

In [104]:
# ==============================================================================
# MERGE MASTER FILES
# ==============================================================================

print("\nMerging master files...\n")

# Incremental
states_inc_all = os.path.join(gdb_path, 'states_INCREMENTAL_ALL')
arcpy.management.Merge(states_inc, states_inc_all)
print(f"states_INCREMENTAL_ALL: {arcpy.management.GetCount(states_inc_all)[0]} records")

counties_inc_all = os.path.join(gdb_path, 'counties_INCREMENTAL_ALL')
arcpy.management.Merge(counties_inc, counties_inc_all)
print(f"counties_INCREMENTAL_ALL: {arcpy.management.GetCount(counties_inc_all)[0]} records")

cities_inc_all = os.path.join(gdb_path, 'cities_INCREMENTAL_ALL')
arcpy.management.Merge(cities_inc, cities_inc_all)
print(f"cities_INCREMENTAL_ALL: {arcpy.management.GetCount(cities_inc_all)[0]} records")

# Cumulative
states_cum_all = os.path.join(gdb_path, 'states_CUMULATIVE_ALL')
arcpy.management.Merge(states_cum, states_cum_all)
print(f"\nstates_CUMULATIVE_ALL: {arcpy.management.GetCount(states_cum_all)[0]} records")

counties_cum_all = os.path.join(gdb_path, 'counties_CUMULATIVE_ALL')
arcpy.management.Merge(counties_cum, counties_cum_all)
print(f"counties_CUMULATIVE_ALL: {arcpy.management.GetCount(counties_cum_all)[0]} records")

cities_cum_all = os.path.join(gdb_path, 'cities_CUMULATIVE_ALL')
arcpy.management.Merge(cities_cum, cities_cum_all)
print(f"cities_CUMULATIVE_ALL: {arcpy.management.GetCount(cities_cum_all)[0]} records")

print("\n" + "="*70)
print("✓ COMPLETE - All outputs in tw_project.gdb")
print("="*70)


Merging master files...

states_INCREMENTAL_ALL: 572 records
counties_INCREMENTAL_ALL: 35442 records
cities_INCREMENTAL_ALL: 189684 records

states_CUMULATIVE_ALL: 572 records
counties_CUMULATIVE_ALL: 35442 records
cities_CUMULATIVE_ALL: 189684 records

✓ COMPLETE - All outputs in tw_project.gdb
