# ArcGIS Pro Tweet Mention Processing

This notebook recreates the tweet mention aggregation workflow with `arcpy` so that it can run directly inside ArcGIS Pro 3.5 without relying on GeoPandas. It loads the same data inputs as the original workflow, counts geopolitical entity mentions, bins them into four-hour intervals, and exports incremental and cumulative feature classes and CSV summaries that are ready to use in ArcGIS Pro.


In [39]:
import arcpy
import os
import re
import json
import csv
import difflib
import datetime as dt
import uuid
from collections import defaultdict, OrderedDict

arcpy.env.overwriteOutput = True


In [40]:
PROJECT_OUTPUT_FOLDER = 'arcgis_outputs'
FOUR_HOUR_INTERVAL = 4


def get_project_root():
    return os.getcwd()


def get_data_file_path(*segments):
    return os.path.join(get_project_root(), *segments)


def ensure_directory(path):
    os.makedirs(path, exist_ok=True)
    return path


def find_field_case_insensitive(dataset, field_name):
    target = field_name.lower()
    for field in arcpy.ListFields(dataset):
        if field.name.lower() == target:
            return field.name
    raise ValueError(f"Field '{field_name}' not found in {dataset}")


In [41]:
def load_tweet_feature_class():
    geojson_path = get_data_file_path('data', 'geojson', 'helene.geojson')
    scratch_gdb = arcpy.env.scratchGDB
    ensure_directory(os.path.dirname(scratch_gdb))

    intermediate_name = os.path.join(scratch_gdb, 'tweets_mentions_raw')
    projected_name = os.path.join(scratch_gdb, 'tweets_mentions_4326')
    wgs84 = arcpy.SpatialReference(4326)

    if arcpy.Exists(intermediate_name):
        arcpy.management.Delete(intermediate_name)
    if arcpy.Exists(projected_name):
        arcpy.management.Delete(projected_name)

    result = arcpy.conversion.JSONToFeatures(geojson_path, intermediate_name)
    tweet_fc = result.getOutput(0)

    desc = arcpy.Describe(tweet_fc)
    sr = getattr(desc, 'spatialReference', None)
    if not sr or not getattr(sr, 'name', None):
        arcpy.management.DefineProjection(tweet_fc, wgs84)
        sr = wgs84

    if sr.factoryCode == wgs84.factoryCode:
        arcpy.management.CopyFeatures(tweet_fc, projected_name)
    else:
        arcpy.management.Project(tweet_fc, projected_name, wgs84)

    return projected_name


def load_states_feature_class():
    source = get_data_file_path('data', 'shape_files', 'cb_2023_us_state_20m.shp')
    scratch_gdb = arcpy.env.scratchGDB
    output_fc = os.path.join(scratch_gdb, 'us_states_4326')
    if arcpy.Exists(output_fc):
        arcpy.management.Delete(output_fc)
    arcpy.management.Project(source, output_fc, arcpy.SpatialReference(4326))
    return output_fc


def load_counties_feature_class():
    source = get_data_file_path('data', 'shape_files', 'cb_2023_us_county_20m.shp')
    scratch_gdb = arcpy.env.scratchGDB
    output_fc = os.path.join(scratch_gdb, 'us_counties_4326')
    if arcpy.Exists(output_fc):
        arcpy.management.Delete(output_fc)
    arcpy.management.Project(source, output_fc, arcpy.SpatialReference(4326))
    return output_fc


def load_cities_feature_class():
    csv_path = get_data_file_path('data', 'tables', 'cities1000.csv')
    scratch_gdb = arcpy.env.scratchGDB
    view_name = f"cities_view_{uuid.uuid4().hex[:6]}"
    table_view = arcpy.management.MakeTableView(csv_path, view_name)

    expression = "country_code = 'US' AND feature_class = 'P' AND population IS NOT NULL AND latitude IS NOT NULL AND longitude IS NOT NULL"
    arcpy.management.SelectLayerByAttribute(table_view, 'NEW_SELECTION', expression)

    temp_table = os.path.join('in_memory', f"cities_{uuid.uuid4().hex[:6]}")
    arcpy.management.CopyRows(table_view, temp_table)

    output_fc = os.path.join(scratch_gdb, 'us_cities_4326')
    if arcpy.Exists(output_fc):
        arcpy.management.Delete(output_fc)

    arcpy.management.XYTableToPoint(temp_table, output_fc, 'longitude', 'latitude', coordinate_system=arcpy.SpatialReference(4326))

    arcpy.management.Delete(table_view)
    arcpy.management.Delete(temp_table)
    return output_fc


In [42]:
def preprocess_place_name(value):
    if value is None:
        return None
    name = str(value).upper().strip()
    if not name or name in {'', 'NAN'}:
        return None
    name = re.sub(r"\bST\.?\b", 'SAINT', name)
    name = re.sub(r"\bMT\.?\b", 'MOUNT', name)
    name = re.sub(r"\bFT\.?\b", 'FORT', name)
    name = re.sub(r"[^A-Z0-9\s]", '', name)
    name = re.sub(r"\s+", ' ', name)
    return name.strip()


def parse_gpe_entities(text):
    if text is None:
        return []
    value = str(text).strip()
    if not value:
        return []
    entities = []
    for part in [segment.strip() for segment in value.split(',') if segment.strip()]:
        for sub_part in re.split(r'[;&|]', part):
            normalized = preprocess_place_name(sub_part)
            if normalized and len(normalized) > 1:
                entities.append(normalized)
    unique = []
    seen = set()
    for ent in entities:
        if ent not in seen:
            unique.append(ent)
            seen.add(ent)
    return unique


def fuzzy_match_entity(entity, lookup_dict, threshold=0.85):
    if entity in lookup_dict:
        return lookup_dict[entity], 1.0
    if not lookup_dict:
        return None, 0.0
    choices = list(lookup_dict.keys())
    matches = difflib.get_close_matches(entity, choices, n=1, cutoff=threshold)
    if matches:
        key = matches[0]
        score = difflib.SequenceMatcher(None, entity, key).ratio()
        if score >= threshold:
            return lookup_dict[key], score
    return None, 0.0


def parse_datetime(value):
    if isinstance(value, dt.datetime):
        return value.replace(tzinfo=None)
    if value in (None, '', 'nan', 'NaN'):
        return None
    text = str(value).strip()
    if not text:
        return None
    text = text.replace('Z', '+00:00')
    formats = [
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S%z',
        '%Y-%m-%dT%H:%M:%S.%f%z'
    ]
    for fmt in formats:
        try:
            parsed = dt.datetime.strptime(text, fmt)
            if parsed.tzinfo:
                parsed = parsed.astimezone(dt.timezone.utc).replace(tzinfo=None)
            return parsed
        except ValueError:
            continue
    try:
        parsed = dt.datetime.fromisoformat(text)
        if parsed.tzinfo:
            parsed = parsed.astimezone(dt.timezone.utc).replace(tzinfo=None)
        return parsed
    except ValueError:
        return None


def floor_datetime(value, hours=4):
    if value is None:
        return None
    hour = (value.hour // hours) * hours
    return value.replace(hour=hour, minute=0, second=0, microsecond=0)


In [43]:
def create_lookup_dictionaries(states_fc, counties_fc, cities_fc):
    state_lookup = {}
    state_field = find_field_case_insensitive(states_fc, 'STUSPS')
    state_name_field = find_field_case_insensitive(states_fc, 'NAME')

    with arcpy.da.SearchCursor(states_fc, [state_field, state_name_field]) as cursor:
        for stusps, name in cursor:
            normalized = preprocess_place_name(name)
            if not normalized:
                continue
            entry = {
                'STUSPS': stusps,
                'NAME': name,
                'lookup_key': normalized
            }
            state_lookup[normalized] = entry
            if stusps:
                state_lookup[str(stusps).upper()] = entry

    county_lookup = {}
    county_id_field = find_field_case_insensitive(counties_fc, 'GEOID')
    county_name_field = find_field_case_insensitive(counties_fc, 'NAME')

    with arcpy.da.SearchCursor(counties_fc, [county_id_field, county_name_field]) as cursor:
        for geoid, name in cursor:
            normalized = preprocess_place_name(name)
            if not normalized:
                continue
            county_lookup[normalized] = {
                'GEOID': str(geoid),
                'NAME': name
            }

    city_lookup = {}
    city_id_field = find_field_case_insensitive(cities_fc, 'geonameid')
    city_name_field = find_field_case_insensitive(cities_fc, 'name')
    lat_field = find_field_case_insensitive(cities_fc, 'latitude')
    lon_field = find_field_case_insensitive(cities_fc, 'longitude')
    pop_field = find_field_case_insensitive(cities_fc, 'population')

    with arcpy.da.SearchCursor(cities_fc, [city_id_field, city_name_field, lat_field, lon_field, pop_field]) as cursor:
        for geonameid, name, lat, lon, pop in cursor:
            normalized = preprocess_place_name(name)
            if not normalized:
                continue
            entry = {
                'geonameid': str(geonameid),
                'name': name,
                'latitude': float(lat) if lat not in (None, '') else None,
                'longitude': float(lon) if lon not in (None, '') else None,
                'population': int(pop) if pop not in (None, '') else None
            }
            city_lookup[normalized] = entry
    return state_lookup, county_lookup, city_lookup


In [44]:
def load_tweet_records(tweets_fc):
    gpe_field = find_field_case_insensitive(tweets_fc, 'GPE')
    time_field_name = None
    for candidate in ['time', 'timestamp', 'created_at', 'createdat']:
        try:
            time_field_name = find_field_case_insensitive(tweets_fc, candidate)
            break
        except ValueError:
            continue
    if time_field_name is None:
        raise ValueError('No time-like field found in the tweet feature class.')

    records = []
    with arcpy.da.SearchCursor(tweets_fc, ['OID@', gpe_field, time_field_name]) as cursor:
        for oid, gpe_text, time_value in cursor:
            gpe_text = gpe_text if gpe_text not in (None, '') else ''
            parsed_time = parse_datetime(time_value)
            record = {
                'oid': oid,
                'original_gpe': gpe_text,
                'entities': parse_gpe_entities(gpe_text),
                'time': parsed_time,
                'time_string': parsed_time.isoformat() if parsed_time else ''
            }
            records.append(record)
    return records


def assign_time_bins(records, bin_hours=FOUR_HOUR_INTERVAL):
    bins = set()
    for record in records:
        record['time_bin'] = floor_datetime(record['time'], hours=bin_hours)
        if record['time_bin'] is not None:
            bins.add(record['time_bin'])
    return sorted(bins)


In [34]:
def count_mentions_in_tweets(records, state_lookup, county_lookup, city_lookup):
    state_mentions = defaultdict(int)
    county_mentions = defaultdict(int)
    city_mentions = defaultdict(int)

    state_details = defaultdict(list)
    county_details = defaultdict(list)
    city_details = defaultdict(list)

    for record in records:
        for entity in record['entities']:
            state_match, state_score = fuzzy_match_entity(entity, state_lookup, threshold=0.9)
            if state_match:
                state_code = state_match['STUSPS']
                state_mentions[state_code] += 1
                state_details[state_code].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })
                continue

            county_match, county_score = fuzzy_match_entity(entity, county_lookup, threshold=0.85)
            if county_match:
                county_id = county_match['GEOID']
                county_mentions[county_id] += 1
                county_details[county_id].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })
                continue

            city_match, city_score = fuzzy_match_entity(entity, city_lookup, threshold=0.85)
            if city_match:
                city_id = city_match['geonameid']
                city_mentions[city_id] += 1
                city_details[city_id].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })

    return (
        state_mentions,
        county_mentions,
        city_mentions,
        state_details,
        county_details,
        city_details
    )


In [35]:
def create_overall_summaries(state_mentions, county_mentions, city_mentions,
                             state_details, county_details, city_details,
                             city_lookup):
    states_summary = []
    for code, count in state_mentions.items():
        samples = state_details.get(code, [])
        states_summary.append({
            'state_code': code,
            'tweet_count': count,
            'sample_mentions': '; '.join(detail['matched_entity'] for detail in samples[:5]),
            'sample_gpe_text': '; '.join(detail['original_gpe'][:100] for detail in samples[:3])
        })

    counties_summary = []
    for fips, count in county_mentions.items():
        samples = county_details.get(fips, [])
        counties_summary.append({
            'county_fips': fips,
            'tweet_count': count,
            'sample_mentions': '; '.join(detail['matched_entity'] for detail in samples[:5]),
            'sample_gpe_text': '; '.join(detail['original_gpe'][:100] for detail in samples[:3])
        })

    cities_summary = []
    for city_id, count in city_mentions.items():
        samples = city_details.get(city_id, [])
        matched = '; '.join(detail['matched_entity'] for detail in samples)
        original = ' | '.join(detail['original_gpe'] for detail in samples)
        lookup_info = None
        for entry in city_lookup.values():
            if entry['geonameid'] == city_id:
                lookup_info = entry
                break
        cities_summary.append({
            'city_id': city_id,
            'tweet_count': count,
            'matched_entities': matched,
            'original_gpe_text': original,
            'name': lookup_info['name'] if lookup_info else None,
            'population': lookup_info['population'] if lookup_info else None,
            'latitude': lookup_info['latitude'] if lookup_info else None,
            'longitude': lookup_info['longitude'] if lookup_info else None
        })

    return states_summary, counties_summary, cities_summary


In [36]:
def count_mentions_in_tweets_temporal(records, time_bins, state_lookup, county_lookup, city_lookup):
    temporal_state_mentions = {bin_time: defaultdict(int) for bin_time in time_bins}
    temporal_county_mentions = {bin_time: defaultdict(int) for bin_time in time_bins}
    temporal_city_mentions = {bin_time: defaultdict(int) for bin_time in time_bins}

    temporal_state_details = {bin_time: defaultdict(list) for bin_time in time_bins}
    temporal_county_details = {bin_time: defaultdict(list) for bin_time in time_bins}
    temporal_city_details = {bin_time: defaultdict(list) for bin_time in time_bins}

    for record in records:
        bin_time = record.get('time_bin')
        if bin_time is None:
            continue
        for entity in record['entities']:
            state_match, _ = fuzzy_match_entity(entity, state_lookup, threshold=0.9)
            if state_match:
                state_code = state_match['STUSPS']
                temporal_state_mentions[bin_time][state_code] += 1
                temporal_state_details[bin_time][state_code].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })
                continue

            county_match, _ = fuzzy_match_entity(entity, county_lookup, threshold=0.85)
            if county_match:
                county_id = county_match['GEOID']
                temporal_county_mentions[bin_time][county_id] += 1
                temporal_county_details[bin_time][county_id].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })
                continue

            city_match, _ = fuzzy_match_entity(entity, city_lookup, threshold=0.85)
            if city_match:
                city_id = city_match['geonameid']
                temporal_city_mentions[bin_time][city_id] += 1
                temporal_city_details[bin_time][city_id].append({
                    'matched_entity': entity,
                    'original_gpe': record['original_gpe'],
                    'time': record['time_string']
                })

    return (
        temporal_state_mentions,
        temporal_county_mentions,
        temporal_city_mentions,
        temporal_state_details,
        temporal_county_details,
        temporal_city_details
    )


In [48]:
def create_temporal_aggregations(time_bins, temporal_state_mentions, temporal_county_mentions,
                                 temporal_city_mentions, temporal_state_details, temporal_county_details,
                                 temporal_city_details):
    temporal_data = OrderedDict()
    for bin_time in time_bins:
        states = []
        for state_code, count in temporal_state_mentions[bin_time].items():
            details = temporal_state_details[bin_time][state_code]
            states.append({
                'state_code': state_code,
                'tweet_count': count,
                'sample_gpe_text': ' | '.join(detail['original_gpe'][:100] for detail in details[:3])
            })

        counties = []
        for county_id, count in temporal_county_mentions[bin_time].items():
            details = temporal_county_details[bin_time][county_id]
            counties.append({
                'county_fips': county_id,
                'tweet_count': count,
                'sample_gpe_text': ' | '.join(detail['original_gpe'][:100] for detail in details[:3])
            })

        cities = []
        for city_id, count in temporal_city_mentions[bin_time].items():
            details = temporal_city_details[bin_time][city_id]
            cities.append({
                'city_id': city_id,
                'tweet_count': count,
                'original_gpe_text': ' | '.join(detail['original_gpe'] for detail in details),
                'matched_entities': '; '.join(detail['matched_entity'] for detail in details)
            })

        temporal_data[bin_time] = {
            'states': states,
            'counties': counties,
            'cities': cities
        }
    return temporal_data


In [50]:
def build_where_clause(dataset, field_name, values):
    if not values:
        return None
    field = arcpy.AddFieldDelimiters(dataset, field_name)
    cleaned = []
    is_numeric = all(isinstance(v, (int, float)) for v in values)
    for value in values:
        if value in (None, ''):
            continue
        if is_numeric:
            cleaned.append(str(value))
        else:
            cleaned.append("'{}'".format(str(value).replace("'", "''")))

    if not cleaned:
        return None
    return f"{field} IN ({', '.join(cleaned)})"


def copy_and_enrich_features(base_fc, id_field, items, output_path, field_definitions, constant_values=None):
    if not items:
        return None
    constant_values = constant_values or {}
    id_values = [item['id'] for item in items if item.get('id') not in (None, '')]
    where_clause = build_where_clause(base_fc, id_field, id_values)
    if not where_clause:
        return None

    layer_name = f"lyr_{uuid.uuid4().hex[:8]}"
    arcpy.management.MakeFeatureLayer(base_fc, layer_name, where_clause)
    arcpy.management.CopyFeatures(layer_name, output_path)
    arcpy.management.Delete(layer_name)

    existing_fields = {field.name for field in arcpy.ListFields(output_path)}
    for field_name, field_type, params in field_definitions:
        if field_name not in existing_fields:
            if params:
                arcpy.management.AddField(output_path, field_name, field_type, **params)
            else:
                arcpy.management.AddField(output_path, field_name, field_type)

    value_lookup = {item['id']: item for item in items}
    update_fields = [id_field] + [field_name for field_name, _, _ in field_definitions]
    with arcpy.da.UpdateCursor(output_path, update_fields) as cursor:
        for row in cursor:
            key = row[0]
            data = value_lookup.get(key)
            if not data:
                continue
            for idx, field_name in enumerate(update_fields[1:], start=1):
                if field_name in constant_values:
                    row[idx] = constant_values[field_name]
                    continue
                if field_name in data:
                    value = data[field_name]
                    if isinstance(value, str):
                        params = next((p for f, _, p in field_definitions if f == field_name), {})
                        max_length = params.get('field_length')
                        if max_length:
                            row[idx] = value[:max_length]
                        else:
                            row[idx] = value
                    else:
                        row[idx] = value
            cursor.updateRow(row)
    return output_path


def merge_feature_classes(inputs, output_path):
    if not inputs:
        return None
    if len(inputs) == 1:
        arcpy.management.CopyFeatures(inputs[0], output_path)
    else:
        arcpy.management.Merge(inputs, output_path)
    return output_path


def write_city_csv(path, rows, fieldnames):
    if not rows:
        return None
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
    return path


In [51]:
def export_temporal_to_arcgis(temporal_data, time_bins, states_fc, counties_fc, cities_fc,
                               city_lookup, output_dir=PROJECT_OUTPUT_FOLDER):
    temporal_dir = ensure_directory(get_data_file_path(output_dir, 'temporal_4hour_bins'))
    incremental_dir = ensure_directory(os.path.join(temporal_dir, 'incremental'))
    cumulative_dir = ensure_directory(os.path.join(temporal_dir, 'cumulative'))

    state_id_field = find_field_case_insensitive(states_fc, 'STUSPS')
    county_id_field = find_field_case_insensitive(counties_fc, 'GEOID')
    city_id_field = find_field_case_insensitive(cities_fc, 'geonameid')

    states_incremental_fields = [
        ('tweet_cnt', 'LONG', {}),
        ('smpl_gpe', 'TEXT', {'field_length': 254}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    states_cumulative_fields = [
        ('cumul_cnt', 'LONG', {}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    counties_incremental_fields = [
        ('tweet_cnt', 'LONG', {}),
        ('smpl_gpe', 'TEXT', {'field_length': 254}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    counties_cumulative_fields = [
        ('cumul_cnt', 'LONG', {}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    cities_incremental_fields = [
        ('tweet_cnt', 'LONG', {}),
        ('mtchd_ent', 'TEXT', {'field_length': 254}),
        ('orig_gpe', 'TEXT', {'field_length': 254}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    cities_cumulative_fields = [
        ('cumul_cnt', 'LONG', {}),
        ('time_bin', 'TEXT', {'field_length': 32}),
        ('bin_start', 'DATE', {}),
        ('cnt_type', 'TEXT', {'field_length': 12})
    ]

    all_states_incremental = []
    all_states_cumulative = []
    all_counties_incremental = []
    all_counties_cumulative = []
    all_cities_incremental = []
    all_cities_cumulative = []

    city_incremental_rows = []
    city_cumulative_rows = []

    cumulative_state_counts = OrderedDict()
    cumulative_county_counts = OrderedDict()
    cumulative_city_counts = OrderedDict()

    all_mentioned_states = set()
    all_mentioned_counties = set()
    all_mentioned_cities = set()

    for counts in temporal_data.values():
        all_mentioned_states.update(entry['state_code'] for entry in counts['states'])
        all_mentioned_counties.update(entry['county_fips'] for entry in counts['counties'])
        all_mentioned_cities.update(entry['city_id'] for entry in counts['cities'])

    for index, bin_time in enumerate(time_bins, start=1):
        bin_label = bin_time.strftime('%Y-%m-%d %H:%M:%S')
        bin_str = bin_time.strftime('%Y%m%d_%H%M')
        counts = temporal_data[bin_time]

        state_items = []
        for entry in counts['states']:
            cumulative_state_counts[entry['state_code']] = cumulative_state_counts.get(entry['state_code'], 0) + entry['tweet_count']
            state_items.append({
                'id': entry['state_code'],
                'tweet_cnt': entry['tweet_count'],
                'smpl_gpe': entry['sample_gpe_text']
            })
        if state_items:
            output_path = os.path.join(incremental_dir, f'states_inc_{bin_str}.shp')
            copy_and_enrich_features(
                states_fc,
                state_id_field,
                state_items,
                output_path,
                states_incremental_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'incremental'}
            )
            all_states_incremental.append(output_path)

        cumulative_state_items = []
        for state_code, total in cumulative_state_counts.items():
            if state_code in all_mentioned_states:
                cumulative_state_items.append({'id': state_code, 'cumul_cnt': total})
        if cumulative_state_items:
            output_path = os.path.join(cumulative_dir, f'states_cum_{bin_str}.shp')
            copy_and_enrich_features(
                states_fc,
                state_id_field,
                cumulative_state_items,
                output_path,
                states_cumulative_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'cumulative'}
            )
            all_states_cumulative.append(output_path)

        county_items = []
        for entry in counts['counties']:
            cumulative_county_counts[entry['county_fips']] = cumulative_county_counts.get(entry['county_fips'], 0) + entry['tweet_count']
            county_items.append({
                'id': entry['county_fips'],
                'tweet_cnt': entry['tweet_count'],
                'smpl_gpe': entry['sample_gpe_text']
            })
        if county_items:
            output_path = os.path.join(incremental_dir, f'counties_inc_{bin_str}.shp')
            copy_and_enrich_features(
                counties_fc,
                county_id_field,
                county_items,
                output_path,
                counties_incremental_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'incremental'}
            )
            all_counties_incremental.append(output_path)

        cumulative_county_items = []
        for county_id, total in cumulative_county_counts.items():
            if county_id in all_mentioned_counties:
                cumulative_county_items.append({'id': county_id, 'cumul_cnt': total})
        if cumulative_county_items:
            output_path = os.path.join(cumulative_dir, f'counties_cum_{bin_str}.shp')
            copy_and_enrich_features(
                counties_fc,
                county_id_field,
                cumulative_county_items,
                output_path,
                counties_cumulative_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'cumulative'}
            )
            all_counties_cumulative.append(output_path)

        city_items = []
        for entry in counts['cities']:
            cumulative_city_counts[entry['city_id']] = cumulative_city_counts.get(entry['city_id'], 0) + entry['tweet_count']
            city_items.append({
                'id': entry['city_id'],
                'tweet_cnt': entry['tweet_count'],
                'mtchd_ent': entry['matched_entities'],
                'orig_gpe': entry['original_gpe_text']
            })
        if city_items:
            output_path = os.path.join(incremental_dir, f'cities_inc_{bin_str}.shp')
            copy_and_enrich_features(
                cities_fc,
                city_id_field,
                city_items,
                output_path,
                cities_incremental_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'incremental'}
            )
            all_cities_incremental.append(output_path)

            for entry in counts['cities']:
                city_info = None
                for lookup_entry in city_lookup.values():
                    if lookup_entry['geonameid'] == entry['city_id']:
                        city_info = lookup_entry
                        break
                if city_info:
                    city_incremental_rows.append({
                        'city_name': city_info['name'],
                        'city_id': entry['city_id'],
                        'population': city_info['population'],
                        'latitude': city_info['latitude'],
                        'longitude': city_info['longitude'],
                        'tweet_count': entry['tweet_count'],
                        'matched_entities': entry['matched_entities'],
                        'original_gpe_text': entry['original_gpe_text'],
                        'time_bin': bin_label,
                        'bin_start': bin_time.isoformat()
                    })

        cumulative_city_items = []
        for city_id, total in cumulative_city_counts.items():
            if city_id in all_mentioned_cities:
                cumulative_city_items.append({'id': city_id, 'cumul_cnt': total})
        if cumulative_city_items:
            output_path = os.path.join(cumulative_dir, f'cities_cum_{bin_str}.shp')
            copy_and_enrich_features(
                cities_fc,
                city_id_field,
                cumulative_city_items,
                output_path,
                cities_cumulative_fields,
                {'time_bin': bin_label, 'bin_start': bin_time, 'cnt_type': 'cumulative'}
            )
            all_cities_cumulative.append(output_path)

    states_inc_master = merge_feature_classes(
        all_states_incremental,
        os.path.join(incremental_dir, 'states_INCREMENTAL_ALL.shp')
    )
    counties_inc_master = merge_feature_classes(
        all_counties_incremental,
        os.path.join(incremental_dir, 'counties_INCREMENTAL_ALL.shp')
    )
    cities_inc_master = merge_feature_classes(
        all_cities_incremental,
        os.path.join(incremental_dir, 'cities_INCREMENTAL_ALL.shp')
    )

    states_cum_master = merge_feature_classes(
        all_states_cumulative,
        os.path.join(cumulative_dir, 'states_CUMULATIVE_ALL.shp')
    )
    counties_cum_master = merge_feature_classes(
        all_counties_cumulative,
        os.path.join(cumulative_dir, 'counties_CUMULATIVE_ALL.shp')
    )
    cities_cum_master = merge_feature_classes(
        all_cities_cumulative,
        os.path.join(cumulative_dir, 'cities_CUMULATIVE_ALL.shp')
    )

    if cities_inc_master:
        write_city_csv(
            os.path.join(incremental_dir, 'cities_INCREMENTAL_ALL.csv'),
            city_incremental_rows,
            [
                'city_name', 'city_id', 'population', 'latitude', 'longitude',
                'tweet_count', 'matched_entities', 'original_gpe_text', 'time_bin', 'bin_start'
            ]
        )
    if cities_cum_master:
        for city_id, total in cumulative_city_counts.items():
            if city_id in all_mentioned_cities:
                city_info = None
                for lookup_entry in city_lookup.values():
                    if lookup_entry['geonameid'] == city_id:
                        city_info = lookup_entry
                        break
                if city_info:
                    city_cumulative_rows.append({
                        'city_name': city_info['name'],
                        'city_id': city_id,
                        'population': city_info['population'],
                        'latitude': city_info['latitude'],
                        'longitude': city_info['longitude'],
                        'cumulative_count': total
                    })
        write_city_csv(
            os.path.join(cumulative_dir, 'cities_CUMULATIVE_ALL.csv'),
            city_cumulative_rows,
            ['city_name', 'city_id', 'population', 'latitude', 'longitude', 'cumulative_count']
        )

    if states_inc_master:
        arcpy.conversion.FeaturesToJSON(states_inc_master, os.path.join(incremental_dir, 'states_INCREMENTAL_ALL.geojson'), geoJSON='GEOJSON')
    if counties_inc_master:
        arcpy.conversion.FeaturesToJSON(counties_inc_master, os.path.join(incremental_dir, 'counties_INCREMENTAL_ALL.geojson'), geoJSON='GEOJSON')
    if cities_inc_master:
        arcpy.conversion.FeaturesToJSON(cities_inc_master, os.path.join(incremental_dir, 'cities_INCREMENTAL_ALL.geojson'), geoJSON='GEOJSON')

    if states_cum_master:
        arcpy.conversion.FeaturesToJSON(states_cum_master, os.path.join(cumulative_dir, 'states_CUMULATIVE_ALL.geojson'), geoJSON='GEOJSON')
    if counties_cum_master:
        arcpy.conversion.FeaturesToJSON(counties_cum_master, os.path.join(cumulative_dir, 'counties_CUMULATIVE_ALL.geojson'), geoJSON='GEOJSON')
    if cities_cum_master:
        arcpy.conversion.FeaturesToJSON(cities_cum_master, os.path.join(cumulative_dir, 'cities_CUMULATIVE_ALL.geojson'), geoJSON='GEOJSON')

    return {
        'incremental': {
            'states': all_states_incremental,
            'counties': all_counties_incremental,
            'cities': all_cities_incremental
        },
        'cumulative': {
            'states': all_states_cumulative,
            'counties': all_counties_cumulative,
            'cities': all_cities_cumulative
        },
        'master': {
            'states_incremental': states_inc_master,
            'counties_incremental': counties_inc_master,
            'cities_incremental': cities_inc_master,
            'states_cumulative': states_cum_master,
            'counties_cumulative': counties_cum_master,
            'cities_cumulative': cities_cum_master
        }
    }


In [53]:
def print_top_entities(summary, label, count_field='tweet_count', name_field=None, top_n=10):
    print(f"Top {label} by mentions")
    print('-' * 40)
    sorted_rows = sorted(summary, key=lambda row: row.get(count_field, 0), reverse=True)
    for row in sorted_rows[:top_n]:
        name = row.get(name_field) if name_field else row.get(label.lower() + '_code', '')
        print(f"{name}: {row.get(count_field, 0)}")


In [55]:
print("Loading datasets...")
tweets_fc = load_tweet_feature_class()
states_fc = load_states_feature_class()
counties_fc = load_counties_feature_class()
cities_fc = load_cities_feature_class()

print("Creating lookup dictionaries...")
state_lookup, county_lookup, city_lookup = create_lookup_dictionaries(states_fc, counties_fc, cities_fc)

print("Reading tweet records...")
tweet_records = load_tweet_records(tweets_fc)
print(f"Tweets loaded: {len(tweet_records)}")

print("Assigning time bins...")
time_bins = assign_time_bins(tweet_records)
print(f"Time bins: {len(time_bins)}")

print("Counting overall mentions...")
(state_mentions, county_mentions, city_mentions,
 state_details, county_details, city_details) = count_mentions_in_tweets(
    tweet_records, state_lookup, county_lookup, city_lookup
)

(states_summary, counties_summary, cities_summary) = create_overall_summaries(
    state_mentions, county_mentions, city_mentions,
    state_details, county_details, city_details,
    city_lookup
)

print_top_entities(states_summary, 'States', count_field='tweet_count', name_field='state_code')
print_top_entities(counties_summary, 'Counties', count_field='tweet_count', name_field='county_fips')
print_top_entities(cities_summary, 'Cities', count_field='tweet_count', name_field='city_id')

print("Counting temporal mentions...")
(
    temporal_state_mentions,
    temporal_county_mentions,
    temporal_city_mentions,
    temporal_state_details,
    temporal_county_details,
    temporal_city_details
) = count_mentions_in_tweets_temporal(
    tweet_records,
    time_bins,
    state_lookup,
    county_lookup,
    city_lookup
)

print("Creating temporal aggregations...")
temporal_data = create_temporal_aggregations(
    time_bins,
    temporal_state_mentions,
    temporal_county_mentions,
    temporal_city_mentions,
    temporal_state_details,
    temporal_county_details,
    temporal_city_details
)

print("Exporting temporal outputs for ArcGIS Pro...")
export_results = export_temporal_to_arcgis(
    temporal_data,
    time_bins,
    states_fc,
    counties_fc,
    cities_fc,
    city_lookup,
    output_dir=PROJECT_OUTPUT_FOLDER
)

print("Export complete. Output paths summary:")
print(json.dumps(export_results, indent=2, default=str))


Loading datasets...


<class 'arcgisscripting.ExecuteError'>: Failed to execute. Parameters are not valid.
ERROR 000517: The coordinate system is not defined for the input dataset.
Failed to execute (Project).
