In [None]:
import os
import requests
import zipfile
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import json
import matplotlib.colors as mcolors
from shapely.geometry import Polygon
import matplotlib.patches as mpatches



# Local folder that receives every downloaded shapefile component
shapefile_dir = "shapefile"
os.makedirs(shapefile_dir, exist_ok=True)

# Natural Earth layers are fetched file-by-file from the GitHub repository.
# Each layer maps to the list of URLs for its .shp/.shx/.dbf/.prj components.
_ne_base_url = "https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/10m_cultural"
natural_earth_files = {
    layer: [
        f"{_ne_base_url}/{layer}.{extension}"
        for extension in ("shp", "shx", "dbf", "prj")
    ]
    for layer in ("ne_10m_admin_0_countries", "ne_10m_admin_1_states_provinces")
}

# The Census TIGER UACE layer ships as a single ZIP archive
uace_zip_path = os.path.join(shapefile_dir, "tl_2024_us_uac20.zip")
uace_url = "https://www2.census.gov/geo/tiger/TIGER2024/UAC20/tl_2024_us_uac20.zip"

# Shared helper: stream one URL to disk without ever leaving a partial file
def _stream_to_file(url, dest_path, timeout=(10, 300)):
    """
    Download ``url`` to ``dest_path`` in chunks.

    The payload is written to ``dest_path + ".part"`` and only renamed onto
    ``dest_path`` once the transfer has finished, so an interrupted download
    cannot be mistaken for a complete file by a later ``os.path.exists`` check.

    Parameters:
        url (str): Address to fetch.
        dest_path (str): Final location of the downloaded file.
        timeout: Value forwarded to ``requests.get(timeout=...)`` so a stalled
            connection raises instead of hanging indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    partial_path = dest_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                # Stream in 1 MiB chunks instead of holding the whole body in memory
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, dest_path)
    finally:
        # After a successful rename the temporary file no longer exists;
        # on any failure remove whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)


# Function to download each file
def download_files(file_urls):
    """
    Download every URL in ``file_urls`` into ``shapefile_dir``.

    The local filename is the last path segment of the URL. Files that are
    already present are skipped.

    Parameters:
        file_urls (iterable of str): URLs to fetch.
    """
    for url in file_urls:
        filename = os.path.join(shapefile_dir, url.split('/')[-1])
        if os.path.exists(filename):
            print(f"{filename} already exists.")
            continue

        print(f"Downloading {filename}...")
        _stream_to_file(url, filename)
        print(f"Downloaded {filename}.")


# Function to download and extract the UACE ZIP file
def download_and_extract_uace(url, zip_path):
    """
    Download the UACE ZIP archive if it is missing, then extract it.

    Extraction always runs (even when the archive was already on disk) and
    unpacks into ``shapefile_dir``.

    Parameters:
        url (str): Address of the ZIP archive.
        zip_path (str): Where the archive is stored locally.
    """
    if not os.path.exists(zip_path):
        print(f"Downloading UACE data from {url}...")
        _stream_to_file(url, zip_path)
        print(f"Downloaded {zip_path}.")

    # Extract UACE shapefile
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(shapefile_dir)
    print(f"Extracted UACE shapefiles to {shapefile_dir}.")

# Fetch the Natural Earth components, one layer at a time
for file_urls in natural_earth_files.values():
    download_files(file_urls)

# Fetch the UACE archive and unpack it next to the other shapefiles
download_and_extract_uace(uace_url, uace_zip_path)


def _read_layer(shp_name):
    """Read one shapefile from ``shapefile_dir`` into a GeoDataFrame."""
    return gpd.read_file(os.path.join(shapefile_dir, shp_name))


# Load the three layers under their original filenames
world = _read_layer("ne_10m_admin_0_countries.shp")
states_provinces = _read_layer("ne_10m_admin_1_states_provinces.shp")
uace_shapefile = _read_layer("tl_2024_us_uac20.shp")

# Keep only the United States row(s) of the country layer
is_usa = world['NAME'] == 'United States of America'
us_boundary = world[is_usa]


def _load_weighted_metric_by_uace(year, modes, metric_column):
    """
    Return the trip-weighted mean of ``metric_column`` per UACE code for one year.

    Reads ``../src/emcommon/resources/ntd{year}_intensities.json``, keeps the
    records whose ``Mode`` is in ``modes``, and weights each record by its
    ``Unlinked Passenger Trips``.

    Parameters:
        year: Year used to build the JSON filename.
        modes (list of str): Mode codes to keep.
        metric_column (str): Column to average. For "All Fuels (Wh/pkm)" the
            value is first multiplied by "Average Passengers" (Wh/pkm -> Wh/km).

    Returns:
        pandas.DataFrame with columns ``UACE Code`` (zero-padded to 5
        characters) and ``Weighted {metric_column}``. Groups whose total trips
        are zero are omitted because their weighted mean is undefined.
    """
    json_file = f"../src/emcommon/resources/ntd{year}_intensities.json"
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data['records'])

    # .copy() so the assignments below act on an independent frame rather
    # than on a slice of df (avoids SettingWithCopyWarning)
    mode_df = df[df['Mode'].isin(modes)].copy()

    # dict.fromkeys de-duplicates while keeping order: metric_column may
    # itself be "Average Passengers"
    numeric_columns = list(dict.fromkeys(
        [metric_column, 'Unlinked Passenger Trips', 'Average Passengers']
    ))
    for column in numeric_columns:
        mode_df[column] = pd.to_numeric(mode_df[column], errors='coerce')
    mode_df = mode_df.dropna(subset=numeric_columns + ['UACE Code'])

    values = mode_df[metric_column]
    # CHANGE TO WH/KM: Convert Wh/pkm to Wh/km by multiplying by Average Passengers
    if metric_column == "All Fuels (Wh/pkm)":
        values = values * mode_df["Average Passengers"]

    trips = mode_df['Unlinked Passenger Trips']
    uace_codes = mode_df['UACE Code'].astype(str).str.zfill(5)

    # Weighted mean per UACE = sum(value * trips) / sum(trips)
    weighted_sum = (values * trips).groupby(uace_codes).sum()
    total_trips = trips.groupby(uace_codes).sum()
    # 0 / 0 yields NaN for groups without any trips; drop them so they are
    # treated as "no data" downstream instead of carrying NaN into the plot
    weighted_mean = (weighted_sum / total_trips).dropna()

    return weighted_mean.rename(f'Weighted {metric_column}').reset_index()


def process_state_deltas(state_name, year_ranges, mode_type, metric_column):
    """
    Plot, save and show per-UACE changes of a metric for one state.

    One subplot is drawn per ``(start_year, end_year)`` pair. Each subplot
    colours the UACE regions (clipped to the state) by
    ``weighted metric(end_year) - weighted metric(start_year)`` and hatches the
    regions that have no value for both years. The figure is written to a PDF
    in the current directory whose name is derived from the arguments.

    Uses the module-level ``states_provinces`` and ``uace_shapefile``
    GeoDataFrames; neither is modified.

    Parameters:
        state_name (str): Value matched against ``states_provinces['name']``.
        year_ranges (iterable of (start, end)): Year pairs to compare.
        mode_type (str): "bus" or "train".
        metric_column (str): Column of the yearly JSON records to compare,
            e.g. "All Fuels (Wh/pkm)" or "Average Passengers".

    Raises:
        ValueError: If ``mode_type`` is not "bus"/"train", ``year_ranges`` is
            empty, or ``state_name`` is not present in ``states_provinces``.
    """
    # Determine the modes based on mode_type
    if mode_type == "bus":
        modes = ['MB', 'RB', 'CB']
        title_suffix = "Bus Modes"
    elif mode_type == "train":
        modes = ['LR', 'HR', 'YR', 'CR']
        title_suffix = "Train Modes"
    else:
        raise ValueError("Invalid mode_type. Use 'bus' or 'train'.")

    year_ranges = list(year_ranges)
    if not year_ranges:
        raise ValueError("year_ranges must contain at least one (start, end) pair.")

    # CHANGE TO WH/KM: "All Fuels (Wh/pkm)" is converted to Wh/km when loaded,
    # so it is labelled accordingly; every other metric keeps its column name.
    if metric_column == "All Fuels (Wh/pkm)":
        metric_title = "Fuel Efficiency (Wh/km)"
    else:
        metric_title = metric_column

    # Symmetric colour range and colour map per metric / mode type
    if metric_column == "All Fuels (Wh/pkm)":
        color_limit = 800
        cmap = 'RdYlGn_r'
    elif metric_column == "Average Passengers" and mode_type == "train":
        color_limit = 40
        cmap = 'RdYlGn'
    else:
        color_limit = 4
        cmap = 'RdYlGn'
    norm = mcolors.Normalize(vmin=-color_limit, vmax=color_limit)

    # Filter for the specified state
    state = states_provinces[states_provinces['name'] == state_name]
    if state.empty:
        raise ValueError(f"State {state_name!r} not found in states_provinces.")
    # union_all() supersedes the deprecated unary_union on newer GeoPandas
    if hasattr(state, "union_all"):
        state_boundary = state.union_all()
    else:
        state_boundary = state.unary_union

    # Loop-invariant inputs are prepared once, on a copy, so the global
    # uace_shapefile is left untouched.
    uace = uace_shapefile.assign(
        GEOID20=uace_shapefile['GEOID20'].astype(str).str.zfill(5)
    )
    # NOTE(review): the state geometry is labelled with the UACE layer's CRS
    # rather than reprojected - confirm both layers use compatible CRSs.
    state_clip = gpd.GeoDataFrame(geometry=[state_boundary], crs=uace.crs)

    # Create a figure with one subplot per year range
    num_plots = len(year_ranges)
    fig, axs = plt.subplots(1, num_plots, figsize=(9 * num_plots, 8), constrained_layout=True)
    if num_plots == 1:
        axs = [axs]  # Ensure axs is always a list for consistency

    # NOTE(review): subplots_adjust and constrained_layout are mutually
    # incompatible in Matplotlib; the call is kept so the rendered layout
    # stays as it was - confirm which of the two is actually wanted.
    fig.subplots_adjust(top=0.88)
    # Title height tuned per state; 0.93 for any other state
    titley = {
        "North Carolina": 0.76,
        "Massachusetts": 0.82,
        "Colorado": 0.86,
    }.get(state_name, 0.93)
    fig.suptitle(f"{state_name} {mode_type.title()} - {metric_title}", fontsize=20, y=titley)

    # Each year's file is parsed and aggregated only once, even when the year
    # appears in several ranges (e.g. (2018, 2020) and (2020, 2022)).
    weighted_by_year = {}

    def weighted_for(year):
        if year not in weighted_by_year:
            weighted_by_year[year] = _load_weighted_metric_by_uace(year, modes, metric_column)
        return weighted_by_year[year]

    weighted_column = f'Weighted {metric_column}'
    delta_column = f'Delta {metric_column}'

    for ax, (year_start, year_end) in zip(axs, year_ranges):
        # Merge data for both years to calculate the delta
        data_delta = weighted_for(year_start).merge(
            weighted_for(year_end),
            on="UACE Code",
            suffixes=(f'_{year_start}', f'_{year_end}'),
        )
        data_delta[delta_column] = (
            data_delta[f'{weighted_column}_{year_end}']
            - data_delta[f'{weighted_column}_{year_start}']
        )

        # UACE regions with a delta, and the remaining ones (no data)
        uace_with_data = uace.merge(data_delta, left_on='GEOID20', right_on='UACE Code', how='inner')
        uace_no_data = uace[~uace['GEOID20'].isin(uace_with_data['GEOID20'])]

        # Perform spatial intersection to clip UACE regions to the state boundary
        clipped_with_data = gpd.overlay(uace_with_data, state_clip, how='intersection')
        clipped_no_data = gpd.overlay(uace_no_data, state_clip, how='intersection')

        # Plot the delta values for the specified time range
        clipped_with_data.plot(
            ax=ax,
            column=delta_column,
            cmap=cmap,
            legend=False,
            edgecolor='grey',
            linewidth=0.5,
            norm=norm,
        )

        # Plot the UACE regions with no data using hatch pattern
        clipped_no_data.plot(
            ax=ax,
            color="none",
            edgecolor="grey",
            hatch="////////",
            linewidth=0.5,
        )

        # Overlay state boundary
        state.plot(ax=ax, color="none", edgecolor="black", linewidth=1.5)

        # Set title and bounds for each subplot
        ax.set_title(f"{year_start}-{year_end} Delta", fontsize=16)
        if state_name == "Massachusetts":
            ax.set_xlim([-73.5, -69.9])
            ax.set_ylim([41.2, 42.9])
        elif state_name == "Colorado":
            ax.set_xlim([-109.1, -102.0])
            ax.set_ylim([36.9, 41.0])
        elif state_name == "North Carolina":
            ax.set_xlim([-84.3, -75.5])
            ax.set_ylim([33.8, 36.6])

        ax.set_aspect('equal')

    # Add a single colorbar for the entire figure
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])  # Dummy array for colorbar (public API instead of sm._A)
    cbar = fig.colorbar(sm, ax=axs, orientation='vertical', fraction=0.02, pad=0.04, shrink=0.5)
    cbar.set_label(f"Delta {metric_title}")

    # Create a custom legend entry for "No Data" with hatching
    no_data_patch = mpatches.Patch(facecolor="none", edgecolor="grey", hatch="////////", label="No Data")
    patch_y = 0.25 if state_name == "North Carolina" else 0.14
    fig.legend(handles=[no_data_patch], loc="lower right", bbox_to_anchor=(0.93, patch_y), fontsize=12)

    # Save and display the combined figure for the state and mode type
    year_range_str = "-".join([f"{start}_{end}" for start, end in year_ranges])
    metric_slug = metric_column.replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '').lower()
    mode_slug = title_suffix.replace(' ', '_').lower()
    state_slug = state_name.replace(' ', '_').lower()
    fig.savefig(f"delta_{metric_slug}_{mode_slug}_{state_slug}_{year_range_str}.pdf")

    plt.show()


# States to map and the (start_year, end_year) pairs to compare
states = ["Massachusetts", "Colorado", "North Carolina"]
# Alternative split into two periods: [(2018, 2020), (2020, 2022)]
year_ranges = [(2018, 2022)]

# For every mode type and state, draw the fuel-efficiency figure first and the
# average-occupancy figure second. Use ["bus", "train"] to include rail as well.
for mode_type in ["bus"]:
    for state in states:
        for metric in ("All Fuels (Wh/pkm)", "Average Passengers"):
            process_state_deltas(state, year_ranges, mode_type, metric)


In [None]:

def calculate_national_and_uace_average_passengers(years, uace=None, mode_type="bus", json_dir="../src/emcommon/resources/"):
    """
    Print the trip-weighted average passengers nationally and for one UACE.

    For every year, ``ntd{year}_intensities.json`` is read from ``json_dir``,
    restricted to the selected modes, and "Average Passengers" is averaged
    using "Unlinked Passenger Trips" as weights. Results are printed; nothing
    is returned. Years whose file is missing are reported and skipped.

    Parameters:
        years (list): List of years to process.
        uace (str or int): UACE code to filter on. If None, only calculates
            national average. The code and the "UACE Code" column are both
            compared as zero-padded 5-character strings.
        mode_type (str): Mode type to process (default: 'bus').
        json_dir (str): Directory containing the yearly JSON files.

    Raises:
        ValueError: If ``mode_type`` is anything other than 'bus'.
    """
    # Define bus modes
    if mode_type == "bus":
        modes = ['MB', 'RB', 'CB']  # Bus modes
    else:
        raise ValueError("Currently only 'bus' mode is supported.")

    # Normalise the requested code once, the same way the column is normalised
    uace_code = None if uace is None else str(uace).zfill(5)

    # Loop through each year and calculate averages
    for year in years:
        json_file = os.path.join(json_dir, f"ntd{year}_intensities.json")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"File for year {year} not found: {json_file}")
            continue

        # Convert to DataFrame
        df = pd.DataFrame(data['records'])

        # Filter for bus modes; .copy() so the assignments below do not act
        # on a slice of df (avoids SettingWithCopyWarning)
        mode_df = df[df['Mode'].isin(modes)].copy()

        # Ensure numeric columns
        mode_df['Average Passengers'] = pd.to_numeric(mode_df['Average Passengers'], errors='coerce')
        mode_df['Unlinked Passenger Trips'] = pd.to_numeric(mode_df['Unlinked Passenger Trips'], errors='coerce')

        # Drop rows with missing values
        mode_df = mode_df.dropna(subset=['Average Passengers', 'Unlinked Passenger Trips'])

        # Calculate the weighted national average for passengers
        total_passenger_trips = mode_df['Unlinked Passenger Trips'].sum()
        if total_passenger_trips == 0:
            # Nothing to weight by: the average would be 0 / 0
            print(f"Year {year}: No bus passenger trips found; national average is undefined.")
            continue
        weighted_avg_passengers = (mode_df['Average Passengers'] * mode_df['Unlinked Passenger Trips']).sum() / total_passenger_trips
        print(f"Year {year}: Weighted National Average Passengers for Bus = {weighted_avg_passengers:.2f}")

        # If UACE is provided, filter and calculate for specific UACE
        if uace_code is not None:
            codes = mode_df['UACE Code'].astype(str).str.zfill(5)
            uace_df = mode_df[codes == uace_code]
            total_uace_trips = uace_df['Unlinked Passenger Trips'].sum()

            if uace_df.empty or total_uace_trips == 0:
                print(f"Year {year}: No data for UACE {uace}")
            else:
                weighted_uace_avg_passengers = (uace_df['Average Passengers'] * uace_df['Unlinked Passenger Trips']).sum() / total_uace_trips
                print(f"Year {year}, UACE {uace}: Weighted Average Passengers for Bus = {weighted_uace_avg_passengers:.2f}")

# Years whose NTD intensity files are summarised (2018 through 2022 inclusive)
years = list(range(2018, 2023))

# Report the national figures together with those of UACE 23527
calculate_national_and_uace_average_passengers(years=years, uace="23527")
