In [47]:
import geopandas as gpd
import pandas as pd
import os
pd.set_option('display.max_columns', None)

def get_project_root():
    """
    Gets the absolute path to the project's root folder.
    This version works in a Jupyter Notebook assuming the notebook is in a '/src' subdirectory.
    """
    # Get the current working directory of the notebook
    notebook_path = os.getcwd()

    # If the notebook is in 'src', go up one level to the project root
    if os.path.basename(notebook_path) == 'src':
        return os.path.dirname(notebook_path)

    # Otherwise, assume the notebook is already in the project root
    return notebook_path
def get_data_file_path(*path_segments):
    """Builds a full path to any data file from the project root."""
    project_root = get_project_root()
    return os.path.join(project_root, *path_segments)


def get_geojson(label):
    """Get path to helene.geojson"""
    geojson = get_data_file_path('data', 'geojson', f'{label}.geojson')
    return gpd.read_file(geojson)


def get_cities():
    """
    Load US cities data, starting with GeoJSON and supplementing with CSV data.
    Combines both sources to maximize data coverage.
    """
    cities_gdf = None

    # Try to load GeoJSON first
    try:
        geojson_path = get_data_file_path('data', 'geojson', f'us_cities.geojson')
        cities_gdf = gpd.read_file(geojson_path, dtype={13: str})

        if cities_gdf.crs is None:
            cities_gdf = cities_gdf.set_crs("EPSG:4326")

        print(f"✓ Loaded {len(cities_gdf)} cities from GeoJSON")

    except (FileNotFoundError, Exception) as e:
        print(f"⚠ GeoJSON loading failed ({e})")

    # Load CSV data
    try:
        df_path = get_data_file_path('data', 'tables', 'cities1000.csv')
        df = pd.read_csv(df_path, dtype={13: str})
        csv_cities_df = df[
            (df['country_code'] == 'US') &
            (df['feature_class'] == 'P') &
            (df['population'].notna()) &
            (df['latitude'].notna()) &
            (df['longitude'].notna())
            ].reset_index(drop=True)

        csv_cities_gdf = gpd.GeoDataFrame(
            csv_cities_df,
            geometry=gpd.points_from_xy(csv_cities_df.longitude, csv_cities_df.latitude),
            crs="EPSG:4326"
        )

        print(f"✓ Loaded {len(csv_cities_gdf)} cities from CSV")

        # If we have both, supplement GeoJSON with missing CSV cities
        if cities_gdf is not None:
            # Find missing cities (by geonameid if available, otherwise by name)
            if 'geonameid' in cities_gdf.columns and 'geonameid' in csv_cities_gdf.columns:
                existing_ids = set(cities_gdf['geonameid'])
                missing_cities = csv_cities_gdf[~csv_cities_gdf['geonameid'].isin(existing_ids)]
            else:
                # Fall back to name-based comparison
                existing_names = set(cities_gdf['name']) if 'name' in cities_gdf.columns else set()
                missing_cities = csv_cities_gdf[~csv_cities_gdf['name'].isin(existing_names)]

            if len(missing_cities) > 0:
                # Combine datasets
                cities_gdf = pd.concat([cities_gdf, missing_cities], ignore_index=True)
                print(f"✓ Added {len(missing_cities)} supplemental cities from CSV")
        else:

            # If GeoJSON failed, use CSV only
            cities_gdf = csv_cities_gdf

    except Exception as e:
        if cities_gdf is None:
            raise Exception(f"Failed to load both GeoJSON and CSV: {e}")
        print(f"⚠ CSV supplementation failed: {e}")

    print(f"✓ Total cities loaded: {len(cities_gdf)}")
    return cities_gdf


def get_states():
    gdf_path = get_data_file_path('data', 'shape_files', "cb_2023_us_state_20m.shp")
    return gpd.read_file(gdf_path)


def get_counties():
    gdf_path = get_data_file_path('data', 'shape_files', "cb_2023_us_county_20m.shp")
    return gpd.read_file(gdf_path)



In [48]:
tweets_gdf_helene = get_geojson('helene').to_crs("EPSG:4326")
us_cities_gdf_helene = get_cities().to_crs("EPSG:4326")
us_states_gdf_helene = get_states().to_crs("EPSG:4326")
us_counties_gdf_helene = get_counties().to_crs("EPSG:4326")

tweets_gdf_francine = get_geojson('francine').to_crs("EPSG:4326")
us_cities_gdf_francine = get_cities().to_crs("EPSG:4326")
us_states_gdf_francine = get_states().to_crs("EPSG:4326")
us_counties_gdf_francine = get_counties().to_crs("EPSG:4326")

  return ogr_read(


✓ Loaded 7471 cities from GeoJSON
✓ Loaded 17244 cities from CSV
✓ Added 9773 supplemental cities from CSV
✓ Total cities loaded: 17244


  return ogr_read(


✓ Loaded 7471 cities from GeoJSON
✓ Loaded 17244 cities from CSV
✓ Added 9773 supplemental cities from CSV
✓ Total cities loaded: 17244


In [49]:
projected_crs = "EPSG:3857"

tweets_with_states_helene = gpd.sjoin(tweets_gdf_helene, us_states_gdf_helene, predicate='within', lsuffix='_tweet', rsuffix='_state')
tweets_with_counties_helene = gpd.sjoin(tweets_with_states_helene, us_counties_gdf_helene, predicate='within', lsuffix='_tweet',
                                 rsuffix='_county')
tweets_proj_helene = tweets_with_counties_helene.to_crs(projected_crs)
cities_proj_helene = us_cities_gdf_helene.to_crs(projected_crs)
tweets_with_cities_proj_helene = gpd.sjoin_nearest(
    tweets_proj_helene, 
    cities_proj_helene, 
    max_distance=15000,  # <-- IMPORTANT: Distance is now in meters
    distance_col='distance_to_city')

tweets_with_states_francine = gpd.sjoin(tweets_gdf_francine, us_states_gdf_francine, predicate='within', lsuffix='_tweet', rsuffix='_state')
tweets_with_counties_francine = gpd.sjoin(tweets_with_states_francine, us_counties_gdf_francine, predicate='within', lsuffix='_tweet',
                                 rsuffix='_county')
tweets_proj_francine = tweets_with_counties_francine.to_crs(projected_crs)
cities_proj_francine = us_cities_gdf_francine.to_crs(projected_crs)
tweets_with_cities_proj_francine = gpd.sjoin_nearest(
    tweets_proj_francine, 
    cities_proj_francine, 
    max_distance=15000,  # <-- IMPORTANT: Distance is now in meters
    distance_col='distance_to_city')

In [50]:
def clean_and_select_columns(tweets_with_cities):
    """Select and rename essential columns"""
    cleaned = tweets_with_cities[[
        'FAC', 'LOC', 'GPE', 'time', 'Latitude', 'Longitude',
        'STUSPS__tweet', 'NAME__tweet', 'NAME__county', 'GEOID__county',
        'name', 'geonameid', 'population'
    ]].copy()

    cleaned = cleaned.rename(columns={
        'STUSPS__tweet': 'state_code',
        'NAME__tweet': 'state_name',
        'NAME__county': 'county_name',
        'GEOID__county': 'county_fips',
        'name': 'city_name',
        'geonameid': 'city_id'
    })

    return cleaned

final_tweets_helene = clean_and_select_columns(tweets_with_cities_proj_helene)
final_tweets_francine = clean_and_select_columns(tweets_with_cities_proj_francine)

In [51]:
def create_temporal_aggregations(tweets_df, time_bins):
    """Create aggregated counts for each time bin"""
    temporal_data = {}

    for bin_time in time_bins:
        bin_tweets = tweets_df[tweets_df['bin'] == bin_time]

        state_counts = bin_tweets.groupby('state_code').size().reset_index(name='tweet_count')
        county_counts = bin_tweets.groupby('county_fips').size().reset_index(name='tweet_count')
        city_counts = bin_tweets.groupby('city_id').size().reset_index(name='tweet_count')

        temporal_data[bin_time] = {
            'states': state_counts,
            'counties': county_counts,
            'cities': city_counts
        }
    return temporal_data

def _get_join_cols(level_name: str):
    """
    Return (join_col_on_geometry, data_col_on_temporal_counts)
    """
    if level_name == 'states':
        return 'STUSPS', 'state_code'
    elif level_name == 'counties':
        return 'GEOID', 'county_fips'
    elif level_name == 'cities':
        return 'geonameid', 'city_id'
    else:
        raise ValueError(f"Unknown level_name: {level_name}")

def create_long_format_shapefile(temporal_data, gdf, output_path, level_name='states'):
    """
    Option 2: Create shapefile with repeated geometries (long format)
    Each geometry appears once per time period

    Args:
        temporal_data: Your temporal aggregation data
        gdf: GeoDataFrame (states or counties)
        output_path: Where to save the shapefile
        level_name: 'states' or 'counties'
    """
    all_records = []

    # Get the appropriate join columns
    join_col, data_col = _get_join_cols(level_name)
    # For each time period, create records
    for bin_time, counts_data in temporal_data.items():
        time_counts = counts_data[level_name]

        # Merge with GeoDataFrame
        merged = gdf.merge(
            time_counts,
            left_on=join_col,
            right_on=data_col,
            how='left'
        )

        # Fill NaN tweet counts with 0
        merged['tweet_count'] = merged['tweet_count'].fillna(0)

        # Add timestamp columns
        merged['timestamp'] = bin_time
        merged['time_str'] = bin_time.strftime('%Y-%m-%d %H:%M')
        merged['unix_time'] = int(bin_time.timestamp())

        # Keep essential columns + geometry
        essential_cols = [join_col, 'NAME', 'geometry', 'timestamp', 'time_str', 'unix_time', 'tweet_count']
        if level_name == 'counties':
            essential_cols.append('STATEFP')  # State FIPS for counties

        # Filter to existing columns
        available_cols = [col for col in essential_cols if col in merged.columns]
        merged_clean = merged[available_cols].copy()

        all_records.append(merged_clean)

    # Combine all time periods
    result_gdf = pd.concat(all_records, ignore_index=True)

    # Clean column names for shapefile
    result_gdf = clean_shapefile_columns(result_gdf)
    # result_gdf = result_gdf[result_gdf['tweets'] > 0]
    # Save as shapefile
    result_gdf.to_file(output_path)
    print(f"Long format shapefile saved: {output_path}")
    print(f"Total records: {len(result_gdf)} (geometries × time periods)")

    return result_gdf
def clean_shapefile_columns(gdf):
    """
    Clean column names to be shapefile-compatible
    Shapefiles have 10-character field name limits
    """
    result = gdf.copy()

    # Rename long column names
    rename_dict = {}
    for col in result.columns:
        if col == 'geometry':
            continue
        if len(col) > 10:
            # Create shortened version
            if 'tweet_count' in col:
                rename_dict[col] = 'tweets'
            elif 'timestamp' in col:
                rename_dict[col] = 'time_stamp'
            elif col.startswith('t_'):
                rename_dict[col] = col[:10]  # Keep first 10 chars
            else:
                rename_dict[col] = col[:10]

    if rename_dict:
        result = result.rename(columns=rename_dict)

    return result
def convert_temporal_data_to_shapefiles(final_tweets, us_states_gdf, us_counties_gdf,us_cities_gdf, label):
    """
    Main function to convert all your temporal data to shapefiles
    """
    # Prepare temporal data (same as your existing code)
    final_tweets['time'] = pd.to_datetime(final_tweets['time'])
    final_tweets['bin'] = final_tweets['time'].dt.floor('4h')
    time_bins = sorted(final_tweets['bin'].unique())
    temporal_data = create_temporal_aggregations(final_tweets, time_bins)

    # Create output directory
    output_dir = 'temporal_shapefiles_v2'
    os.makedirs(output_dir, exist_ok=True)

    states_long = create_long_format_shapefile(
        temporal_data, us_states_gdf,
        os.path.join(output_dir, f'{label}_states_long_format.shp'),
        'states'
    )

    counties_long = create_long_format_shapefile(
        temporal_data, us_counties_gdf,
        os.path.join(output_dir, f'{label}_counties_long_format.shp'),
        'counties'
    )
    cities_long   = create_long_format_shapefile(temporal_data, us_cities_gdf,
                       os.path.join(output_dir, f'{label}_cities_long_v2.shp'),   'cities')
convert_temporal_data_to_shapefiles(final_tweets_helene, us_states_gdf_helene, us_counties_gdf_helene, us_cities_gdf_helene, 'helene')
convert_temporal_data_to_shapefiles(final_tweets_francine, us_states_gdf_francine, us_counties_gdf_francine, us_cities_gdf_francine, 'francine')


Long format shapefile saved: temporal_shapefiles_v2\helene_states_long_format.shp
Total records: 572 (geometries × time periods)


  ogr_write(
  ogr_write(


Long format shapefile saved: temporal_shapefiles_v2\helene_counties_long_format.shp
Total records: 35442 (geometries × time periods)
Long format shapefile saved: temporal_shapefiles_v2\helene_cities_long_v2.shp

  ogr_write(



Total records: 189684 (geometries × time periods)
Long format shapefile saved: temporal_shapefiles_v2\francine_states_long_format.shp

  ogr_write(



Total records: 2080 (geometries × time periods)


  ogr_write(


Long format shapefile saved: temporal_shapefiles_v2\francine_counties_long_format.shp
Total records: 128880 (geometries × time periods)


  ogr_write(


Long format shapefile saved: temporal_shapefiles_v2\francine_cities_long_v2.shp
Total records: 689760 (geometries × time periods)
