In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import folium
from click.formatting import iter_rows
from folium import plugins
import json
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import folium
from folium import plugins
import json
pd.set_option('display.max_columns', None)



def get_project_root():
    # This assumes the notebook is running from within the project's root folder.
    return os.getcwd()

def get_data_file_path(*path_segments):
    project_root = get_project_root()
    # Navigate to the data folder from the project root.
    return os.path.join(project_root, *path_segments)

# Data loading functions
def get_geojson():
    """Get path to helene.geojson"""
    geojson = get_data_file_path('data', 'geojson', 'helene.geojson')
    print(geojson)
    return gpd.read_file(geojson)

def get_cities():
    df_path = get_data_file_path('data', 'tables', 'cities1000.csv')
    # df = pd.read_csv(df_path)
    df = pd.read_csv(df_path, low_memory=False)

    us_cities_df = df[
        (df['country_code'] == 'US') &
        (df['feature_class'] == 'P') &
        (df['population'].notna()) &
        (df['latitude'].notna()) &
        (df['longitude'].notna())
        ].reset_index(drop=True)

    us_cities_gdf = gpd.GeoDataFrame(
        us_cities_df,
        geometry=gpd.points_from_xy(us_cities_df.longitude, us_cities_df.latitude),
        crs="EPSG:4326"
    )
    return us_cities_gdf

def get_states():
    gdf_path = get_data_file_path('data', 'shape_files', "cb_2023_us_state_20m.shp")
    return gpd.read_file(gdf_path)

def get_counties():
    gdf_path = get_data_file_path('data', 'shape_files', "cb_2023_us_county_20m.shp")
    return gpd.read_file(gdf_path)

tweets_gdf = get_geojson().to_crs("EPSG:4326")
us_cities_gdf = get_cities().to_crs("EPSG:4326")
us_states_gdf = get_states().to_crs("EPSG:4326")
us_counties_gdf = get_counties().to_crs("EPSG:4326")

C:\Users\colto\Documents\GitHub\Tweet_project\data\geojson\helene.geojson


In [2]:
# ==============================================================================
# NEW APPROACH: Count tweets by GPE/FAC mentions ONLY (ignore lat/lon)
# ==============================================================================

from fuzzywuzzy import fuzz, process
import re

def preprocess_place_name(name):
    """Standardize place names for matching"""
    if pd.isna(name) or name == 'NAN' or name == '':
        return None
    name = str(name).upper().strip()
    name = re.sub(r'\bST\.?\b', 'SAINT', name)
    name = re.sub(r'\bMT\.?\b', 'MOUNT', name)
    name = re.sub(r'\bFT\.?\b', 'FORT', name)
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    return name.strip()

def parse_gpe_entities(gpe_string):
    """Split GPE field into individual place mentions"""
    if not gpe_string or pd.isna(gpe_string) or str(gpe_string).strip() == '':
        return []
    gpe_string = str(gpe_string).strip()
    entities = []
    for part in [p.strip() for p in gpe_string.split(',')]:
        if not part:
            continue
        for sub in re.split(r'[;&|]', part):
            sub = preprocess_place_name(sub)
            if sub and len(sub) > 1:
                entities.append(sub)
    # Remove duplicates while preserving order
    seen, clean = set(), []
    for e in entities:
        if e not in seen:
            clean.append(e)
            seen.add(e)
    return clean

def create_lookup_dictionaries(states_gdf, counties_gdf, cities_gdf):
    """Build name->geometry lookup dictionaries"""
    print("Building lookup dictionaries...")
    
    # States
    state_lookup = {}
    state_abbrev_to_name = {}
    for _, row in states_gdf.iterrows():
        name = preprocess_place_name(row['NAME'])
        if name:
            state_lookup[name] = row
        if 'STUSPS' in row:
            abbr = str(row['STUSPS']).upper()
            state_abbrev_to_name[abbr] = name
            state_lookup[abbr] = row
    
    # Counties
    county_lookup = {}
    for _, row in counties_gdf.iterrows():
        name = preprocess_place_name(row['NAME'])
        if name:
            county_lookup[name] = row
    
    # Cities
    city_lookup = {}
    for _, row in cities_gdf.iterrows():
        name = preprocess_place_name(row['name'])
        if name:
            city_lookup[name] = row
    
    print(f"  States: {len(state_lookup)}")
    print(f"  Counties: {len(county_lookup)}")
    print(f"  Cities: {len(city_lookup)}")
    
    return state_lookup, county_lookup, city_lookup, state_abbrev_to_name

def fuzzy_match_entity(entity, lookup_dict, threshold=85):
    """Fuzzy match entity to lookup dictionary"""
    if entity in lookup_dict:
        return lookup_dict[entity], 100
    
    names = list(lookup_dict.keys())
    if not names:
        return None, 0
    
    match = process.extractOne(entity, names, scorer=fuzz.ratio)
    if match and match[1] >= threshold:
        return lookup_dict[match[0]], match[1]
    
    return None, 0

def count_mentions_in_tweets(tweets_gdf, state_lookup, county_lookup, city_lookup):
    """
    Count tweets by what they MENTION, not where they are located.
    
    Rules:
    - If tweet mentions "Texas" → Texas state gets +1
    - If tweet mentions "Houston" → Houston city gets +1
    - If tweet mentions "Harris County" → Harris County gets +1
    - All mentions in a single tweet count independently
    """
    print("\nCounting tweet mentions...")
    
    state_mentions = {}
    county_mentions = {}
    city_mentions = {}
    
    for idx, row in tweets_gdf.iterrows():
        if idx % 100 == 0:
            print(f"  Processing tweet {idx}/{len(tweets_gdf)}")
        
        # Parse GPE field
        entities = parse_gpe_entities(row['GPE'])
        
        for entity in entities:
            # Try to match to state
            state_match, state_score = fuzzy_match_entity(entity, state_lookup, threshold=90)
            if state_match is not None:
                state_code = state_match['STUSPS']
                state_mentions[state_code] = state_mentions.get(state_code, 0) + 1
                continue
            
            # Try to match to county
            county_match, county_score = fuzzy_match_entity(entity, county_lookup, threshold=85)
            if county_match is not None:
                county_id = county_match['GEOID']
                county_mentions[county_id] = county_mentions.get(county_id, 0) + 1
                continue
            
            # Try to match to city
            city_match, city_score = fuzzy_match_entity(entity, city_lookup, threshold=85)
            if city_match is not None:
                city_id = city_match['geonameid']
                city_mentions[city_id] = city_mentions.get(city_id, 0) + 1
    
    print(f"\n  Found mentions:")
    print(f"    States: {len(state_mentions)}")
    print(f"    Counties: {len(county_mentions)}")
    print(f"    Cities: {len(city_mentions)}")
    
    return state_mentions, county_mentions, city_mentions

def create_count_gdfs(state_mentions, county_mentions, city_mentions, 
                      us_states_gdf, us_counties_gdf, us_cities_gdf):
    """Create GeoDataFrames with mention counts"""
    
    # States
    state_counts_df = pd.DataFrame([
        {'STUSPS': k, 'tweet_count': v} 
        for k, v in state_mentions.items()
    ])
    states_with_counts = us_states_gdf.merge(
        state_counts_df, on='STUSPS', how='left'
    )
    states_with_counts['tweet_count'] = states_with_counts['tweet_count'].fillna(0)
    
    # Counties
    county_counts_df = pd.DataFrame([
        {'GEOID': k, 'tweet_count': v} 
        for k, v in county_mentions.items()
    ])
    counties_with_counts = us_counties_gdf.merge(
        county_counts_df, on='GEOID', how='left'
    )
    counties_with_counts['tweet_count'] = counties_with_counts['tweet_count'].fillna(0)
    
    # Cities
    city_counts_df = pd.DataFrame([
        {'geonameid': k, 'tweet_count': v} 
        for k, v in city_mentions.items()
    ])
    cities_with_counts = us_cities_gdf.merge(
        city_counts_df, on='geonameid', how='left'
    )
    cities_with_counts['tweet_count'] = cities_with_counts['tweet_count'].fillna(0)
    
    return states_with_counts, counties_with_counts, cities_with_counts

# Execute the new approach
state_lookup, county_lookup, city_lookup, state_abbrev_to_name = create_lookup_dictionaries(
    us_states_gdf, us_counties_gdf, us_cities_gdf
)

state_mentions, county_mentions, city_mentions = count_mentions_in_tweets(
    tweets_gdf, state_lookup, county_lookup, city_lookup
)

states_with_counts, counties_with_counts, cities_with_counts = create_count_gdfs(
    state_mentions, county_mentions, city_mentions,
    us_states_gdf, us_counties_gdf, us_cities_gdf
)

print("\nTop states by mentions:")
print(states_with_counts[states_with_counts['tweet_count'] > 0][['NAME', 'STUSPS', 'tweet_count']].sort_values('tweet_count', ascending=False).head(10))

print("\nTop counties by mentions:")
print(counties_with_counts[counties_with_counts['tweet_count'] > 0][['NAME', 'GEOID', 'tweet_count']].sort_values('tweet_count', ascending=False).head(10))

print("\nTop cities by mentions:")
print(cities_with_counts[cities_with_counts['tweet_count'] > 0][['name', 'geonameid', 'tweet_count']].sort_values('tweet_count', ascending=False).head(10))

Building lookup dictionaries...
  States: 104
  Counties: 1915
  Cities: 12256

Counting tweet mentions...
  Processing tweet 0/3007
  Processing tweet 100/3007
  Processing tweet 200/3007
  Processing tweet 300/3007
  Processing tweet 400/3007
  Processing tweet 500/3007
  Processing tweet 600/3007
  Processing tweet 700/3007
  Processing tweet 800/3007
  Processing tweet 900/3007
  Processing tweet 1000/3007
  Processing tweet 1100/3007
  Processing tweet 1200/3007
  Processing tweet 1300/3007
  Processing tweet 1400/3007
  Processing tweet 1500/3007
  Processing tweet 1600/3007
  Processing tweet 1700/3007
  Processing tweet 1800/3007
  Processing tweet 1900/3007
  Processing tweet 2000/3007
  Processing tweet 2100/3007
  Processing tweet 2200/3007
  Processing tweet 2300/3007
  Processing tweet 2400/3007
  Processing tweet 2500/3007
  Processing tweet 2600/3007
  Processing tweet 2700/3007
  Processing tweet 2800/3007
  Processing tweet 2900/3007
  Processing tweet 3000/3007

  Fou

In [3]:
def count_mentions_in_tweets_temporal(tweets_gdf, state_lookup, county_lookup, city_lookup):
    """
    Count mentions by time bin.
    Returns dictionaries: {time_bin: {entity_id: count}}
    """
    print("\nCounting tweet mentions by time bin...")
    
    # Add time binning
    tweets_gdf['time'] = pd.to_datetime(tweets_gdf['time'])
    tweets_gdf['bin'] = tweets_gdf['time'].dt.floor('4h')
    
    time_bins = sorted(tweets_gdf['bin'].unique())
    
    # Initialize dictionaries for each time bin
    temporal_state_mentions = {tb: {} for tb in time_bins}
    temporal_county_mentions = {tb: {} for tb in time_bins}
    temporal_city_mentions = {tb: {} for tb in time_bins}
    
    for idx, row in tweets_gdf.iterrows():
        if idx % 100 == 0:
            print(f"  Processing tweet {idx}/{len(tweets_gdf)}")
        
        time_bin = row['bin']
        entities = parse_gpe_entities(row['GPE'])
        
        for entity in entities:
            # Try state match
            state_match, state_score = fuzzy_match_entity(entity, state_lookup, threshold=90)
            if state_match is not None:
                state_code = state_match['STUSPS']
                temporal_state_mentions[time_bin][state_code] = temporal_state_mentions[time_bin].get(state_code, 0) + 1
                continue
            
            # Try county match
            county_match, county_score = fuzzy_match_entity(entity, county_lookup, threshold=85)
            if county_match is not None:
                county_id = county_match['GEOID']
                temporal_county_mentions[time_bin][county_id] = temporal_county_mentions[time_bin].get(county_id, 0) + 1
                continue
            
            # Try city match
            city_match, city_score = fuzzy_match_entity(entity, city_lookup, threshold=85)
            if city_match is not None:
                city_id = city_match['geonameid']
                temporal_city_mentions[time_bin][city_id] = temporal_city_mentions[time_bin].get(city_id, 0) + 1
    
    return time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions

def create_temporal_aggregations(time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions):
    """Create aggregated counts for each time bin"""
    temporal_data = {}
    
    for bin_time in time_bins:
        # Convert mention dictionaries to DataFrames
        state_counts = pd.DataFrame([
            {'state_code': k, 'tweet_count': v}
            for k, v in temporal_state_mentions[bin_time].items()
        ])
        
        county_counts = pd.DataFrame([
            {'county_fips': k, 'tweet_count': v}
            for k, v in temporal_county_mentions[bin_time].items()
        ])
        
        city_counts = pd.DataFrame([
            {'city_id': k, 'tweet_count': v}
            for k, v in temporal_city_mentions[bin_time].items()
        ])
        
        temporal_data[bin_time] = {
            'states': state_counts,
            'counties': county_counts,
            'cities': city_counts
        }
    
    return temporal_data

def prepare_timeslider_data_correct(temporal_data, boundary_gdf, join_left, join_right, level_name):
    """
    Convert geometry to proper GeoJSON format with lists, not tuples
    """
    timeslider_data = []

    for bin_time, counts_data in temporal_data.items():
        bin_gdf = boundary_gdf.merge(
            counts_data[level_name],
            left_on=join_left,
            right_on=join_right,
            how='left'
        )

        bin_gdf['tweet_count'] = bin_gdf['tweet_count'].fillna(0)
        bin_gdf = bin_gdf[bin_gdf.geometry.notna()]
        bin_gdf = bin_gdf[bin_gdf.geometry.is_valid]

        if bin_gdf.crs is None:
            bin_gdf = bin_gdf.set_crs("EPSG:4326")
        else:
            bin_gdf = bin_gdf.to_crs("EPSG:4326")

        geojson_str = bin_gdf.to_json()
        geojson_dict = json.loads(geojson_str)

        timestamp_str = bin_time.strftime('%Y-%m-%dT%H:%M:%S')
        for feature in geojson_dict['features']:
            if feature['properties'] is None:
                feature['properties'] = {}
            feature['properties']['time'] = timestamp_str
            if 'tweet_count' not in feature['properties']:
                feature['properties']['tweet_count'] = 0
            feature['properties']['tweet_count'] = float(feature['properties']['tweet_count'])

        timeslider_data.append(geojson_dict)

    return timeslider_data

def prepare_heatmap_with_time(temporal_data, cities_gdf):
    """Prepare data for HeatMapWithTime"""
    heat_data = []
    time_index = []

    for bin_time, counts_data in temporal_data.items():
        bin_cities = cities_gdf.merge(
            counts_data['cities'],
            left_on='geonameid',
            right_on='city_id',
            how='inner'
        )

        bin_heat_data = []
        for _, row in bin_cities.iterrows():
            if pd.notna(row.latitude) and pd.notna(row.longitude) and row.tweet_count > 0:
                bin_heat_data.append([
                    float(row.latitude),
                    float(row.longitude),
                    float(row.tweet_count)
                ])

        heat_data.append(bin_heat_data)
        time_index.append(bin_time.strftime('%Y-%m-%d %H:%M'))

    return heat_data, time_index

In [8]:
def folium_process_dynamic(tweets_gdf, us_states_gdf, us_counties_gdf, us_cities_gdf, 
                           state_lookup, county_lookup, city_lookup):
    """Process data for dynamic temporal visualization using mention-based counting"""
    
    # Count mentions by time bin
    time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions = \
        count_mentions_in_tweets_temporal(tweets_gdf, state_lookup, county_lookup, city_lookup)
    
    # Create temporal aggregations
    temporal_data = create_temporal_aggregations(
        time_bins, temporal_state_mentions, temporal_county_mentions, temporal_city_mentions
    )
    
    # Prepare timeslider data for states and counties
    state_timeslider_data = prepare_timeslider_data_correct(
        temporal_data,
        us_states_gdf,
        'STUSPS',
        'state_code',
        'states'
    )
    
    county_timeslider_data = prepare_timeslider_data_correct(
        temporal_data,
        us_counties_gdf,
        'GEOID',
        'county_fips',
        'counties'
    )
    
    # Prepare heatmap data for cities
    city_heat_data, time_labels = prepare_heatmap_with_time(temporal_data, us_cities_gdf)
    
    return state_timeslider_data, county_timeslider_data, city_heat_data, time_labels

# Execute the temporal processing
state_timeslider_data, county_timeslider_data, city_heat_data, time_labels = folium_process_dynamic(
    tweets_gdf, us_states_gdf, us_counties_gdf, us_cities_gdf,
    state_lookup, county_lookup, city_lookup
)

print(f"\nTemporal data prepared:")
print(f"  Time bins: {len(state_timeslider_data)}")
print(f"  State timeslider features: {len(state_timeslider_data)}")
print(f"  County timeslider features: {len(county_timeslider_data)}")
print(f"  City heatmap time steps: {len(city_heat_data)}")
print(state_timeslider_data[:10])


Counting tweet mentions by time bin...
  Processing tweet 0/3007
  Processing tweet 100/3007
  Processing tweet 200/3007
  Processing tweet 300/3007
  Processing tweet 400/3007
  Processing tweet 500/3007
  Processing tweet 600/3007
  Processing tweet 700/3007
  Processing tweet 800/3007
  Processing tweet 900/3007
  Processing tweet 1000/3007
  Processing tweet 1100/3007
  Processing tweet 1200/3007
  Processing tweet 1300/3007
  Processing tweet 1400/3007
  Processing tweet 1500/3007
  Processing tweet 1600/3007
  Processing tweet 1700/3007
  Processing tweet 1800/3007
  Processing tweet 1900/3007
  Processing tweet 2000/3007
  Processing tweet 2100/3007
  Processing tweet 2200/3007
  Processing tweet 2300/3007
  Processing tweet 2400/3007
  Processing tweet 2500/3007
  Processing tweet 2600/3007
  Processing tweet 2700/3007
  Processing tweet 2800/3007
  Processing tweet 2900/3007
  Processing tweet 3000/3007


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

