In [None]:
%pip install git+https://github.com/JGreenlee/e-mission-common.git
# %pip install geopy tqdm

# The freshly installed package will not be importable until the kernel is restarted.
# If you run into an import error, restart the kernel and run the notebook again.

In [None]:
import pandas as pd
import scaffolding
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import asyncio
import zipfile
import tarfile
import glob

# from geopy.geocoders import Nominatim
# from tqdm import tqdm

import matplotlib.transforms as mtrans
import matplotlib.ticker as mticker


from pprint import pprint

from plots import *


import emcommon.diary.base_modes as emcb
import emcommon.metrics.footprint.util as emcfu
import emcommon.metrics.footprint.footprint_calculations as emcfc
import emcommon.util as emcommonutil

In [None]:
# Only show matplotlib log messages at warning level or above.
plt.set_loglevel('warning')

In [None]:
# Never truncate the column list when a dataframe is displayed.
pd.options.display.max_columns = None


In [None]:
# Label and energy-intensity tables shipped next to the notebook.
df_pur = pd.read_csv('auxiliary_files/purpose_labels.csv')
df_re = pd.read_csv('auxiliary_files/mode_labels.csv')
df_ei = pd.read_csv('auxiliary_files/energy_intensity.csv')

# Lookup dictionaries built from those tables.
dic_pur = {raw: binned for raw, binned in zip(df_pur['purpose_confirm'], df_pur['bin_purpose'])}  # bin purpose
dic_re = {raw: clean for raw, clean in zip(df_re['replaced_mode'], df_re['mode_clean'])}  # bin modes
dic_fuel = {mode: fuel for mode, fuel in zip(df_ei['mode'], df_ei['fuel'])}

In [None]:

# Wrap every lookup in a defaultdict so that unknown labels resolve to 'Other'.
dic_re, dic_pur, dic_fuel = (
    defaultdict(lambda: 'Other', mapping)
    for mapping in (dic_re, dic_pur, dic_fuel)
)

mode_of_interest = "E-bike"

In [None]:
# Here begins the naive calculation.
# The functions below implement the naive ("Dashboard, 2020") energy and CO2 estimates.

In [None]:
def no_traceback_handler(exception_type, exception, traceback):
    """Print an exception as a single ``Type: message`` line on stderr.

    Parameters:
        exception_type: the exception class; only its ``__name__`` is used.
        exception: the exception instance (formatted with ``%s``).
        traceback: accepted for signature compatibility and ignored.
    """
    # `sys` is not imported at the top of the notebook, so import it here;
    # otherwise the handler itself would fail with a NameError.
    import sys

    print("%s: %s" % (exception_type.__name__, exception), file=sys.stderr)



# CASE 2 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
def unique_users(df):
    """Number of distinct ``user_id`` values, or 0 when the column is absent."""
    if "user_id" not in df.columns:
        return 0
    return len(df.user_id.unique())


def trip_label_count(s, df):
    """Number of non-null entries in column ``s``, or 0 when the column is absent."""
    if s not in df.columns:
        return 0
    return len(df[s].dropna())


def add_energy_labels(expanded_ct, df_ei, dic_fuel):
    """Attach fuel, intensity, energy and CO2 footprint columns for the confirmed mode.

    Inputs:
    expanded_ct = dataframe of trips that already has a Mode_confirm column
    df_ei = energy-intensity table passed on to energy_intensity
    dic_fuel = mapping from mode to fuel type

    Returns the dataframe produced by the chained helpers.
    """
    confirmed = 'Mode_confirm'
    expanded_ct[confirmed + '_fuel'] = expanded_ct[confirmed].map(dic_fuel)
    return (
        expanded_ct
        .pipe(energy_intensity, df_ei, confirmed)
        .pipe(energy_footprint_kWH, 'distance_miles', confirmed)
        .pipe(CO2_footprint_lb, 'distance_miles', confirmed)
    )

def add_energy_impact(expanded_ct, df_ei, dic_fuel):
    """Compute the confirmed-mode footprint, then the impact relative to the replaced mode.

    Inputs:
    expanded_ct = dataframe of trips with Mode_confirm and Replaced_mode columns
    df_ei = energy-intensity table passed on to energy_intensity
    dic_fuel = mapping from mode to fuel type

    Returns the dataframe with the energy and CO2 impact columns added.
    """
    # Confirmed mode first ...
    expanded_ct = add_energy_labels(expanded_ct, df_ei, dic_fuel)

    # ... then the replaced mode, and finally the difference between the two.
    replaced = 'Replaced_mode'
    expanded_ct[replaced + '_fuel'] = expanded_ct[replaced].map(dic_fuel)
    return (
        expanded_ct
        .pipe(energy_intensity, df_ei, replaced)
        .pipe(energy_impact_kWH, 'distance_miles')
        .pipe(CO2_impact_lb, 'distance_miles')
    )

def get_quality_text(before_df, after_df, mode_of_interest=None, include_test_users=False):
    """Build, print and return the two-line data-quality summary.

    Inputs:
    before_df = dataframe prior to filtering (usually participant_ct_df)
    after_df = dataframe after filtering (usually expanded_ct)
    mode_of_interest = optional detail to include in the text string
    include_test_users = when True the first line says 'testers and participants'
                         instead of 'users'

    Returns the summary string (the same text that is printed).
    """
    # CASE 1 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
    n_before, n_after = len(before_df), len(after_df)
    after_pct = (n_after * 100) / n_before if n_before != 0 else np.nan

    interest_str = mode_of_interest + ' ' if mode_of_interest is not None else ''
    total_str = 'confirmed' if mode_of_interest is not None else ''
    user_str = 'testers and participants' if include_test_users else 'users'

    # One f-string instead of an f-string followed by %-formatting: a '%' inside
    # mode_of_interest would otherwise be read as a format directive.
    quality_text = (
        f"Based on {n_after} confirmed {interest_str}trips "
        f"from {unique_users(after_df)} {user_str}\n"
        f"of {n_before} total {total_str} trips "
        f"from {unique_users(before_df)} users ({after_pct:.2f}%)"
    )
    print(quality_text)
    return quality_text



def data_quality_check(expanded_ct):
    """Clean up 'pilot_ebike' / 'same_mode' replaced-mode answers in place.

    1. Delete rows where mode_confirm was pilot_ebike and replaced_mode was pilot_ebike.
    2. Delete rows where mode_confirm was pilot_ebike and replaced_mode was same_mode.
    3. Replace any remaining same_mode with the row's mode_confirm for the
       energy impact calculation.

    The frame is modified in place and returned; it is returned untouched when it
    has no replaced_mode column.
    """
    # TODO: This is only really required for the initial data collection around the minipilot;
    # in subsequent deployments "same mode" and "pilot_ebike" were removed from the options,
    # so the dataset did not contain these data quality issues.
    if 'replaced_mode' not in expanded_ct.columns:
        return expanded_ct

    for unusable_answer in ('pilot_ebike', 'same_mode'):
        unusable = (expanded_ct['mode_confirm'] == 'pilot_ebike') & (expanded_ct['replaced_mode'] == unusable_answer)
        expanded_ct.drop(expanded_ct.index[unusable], inplace=True)

    answered_same_mode = expanded_ct['replaced_mode'] == 'same_mode'
    expanded_ct['replaced_mode'] = np.where(
        answered_same_mode,
        expanded_ct['mode_confirm'],
        expanded_ct['replaced_mode'],
    )
    return expanded_ct

def unit_conversions(df):
    """Add a ``distance_miles`` column derived from ``distance`` (meters).

    Uses the same meters-to-miles factor as the rest of the notebook
    (0.0006213712) rather than the truncated 0.00062.

    The frame is modified in place and also returned, like the other helpers.
    """
    df['distance_miles'] = df["distance"] * 0.0006213712  # meters to miles
    return df

def energy_intensity(trip_df,mode_intensity_df,col):
    """Look up the per-mode intensity factors for the modes found in ``trip_df[col]``.

    Inputs:
    trip_df = dataframe with data; three columns are added to it in place
    mode_intensity_df = dataframe with a 'mode' column plus the factor columns
                        'energy_intensity_factor', 'CO2_factor' and '(kWH)/trip'
    col = the column for which we want to map the intensity

    Adds 'ei_<col>', 'CO2_<col>' and 'ei_trip_<col>' and returns trip_df.
    Modes missing from mode_intensity_df map to NaN.
    """
    factor_columns = {
        'ei_': 'energy_intensity_factor',
        'CO2_': 'CO2_factor',
        'ei_trip_': '(kWH)/trip',
    }
    modes = mode_intensity_df['mode']

    # Build every lookup before touching trip_df so a missing factor column
    # fails before any output column has been written.
    lookups = {
        prefix: dict(zip(modes, mode_intensity_df[factor_column]))
        for prefix, factor_column in factor_columns.items()
    }
    for prefix, factor_by_mode in lookups.items():
        trip_df[prefix + col] = trip_df[col].map(factor_by_mode)
    return trip_df

def energy_footprint_kWH(df,distance_miles,col):
    """Add ``<col>_EI(kWH)``: the energy footprint of each trip for the given mode column.

    Inputs:
    df = dataframe with data (needs '<col>_fuel', 'ei_<col>' and 'ei_trip_<col>')
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Rows whose fuel is not gasoline, diesel or electric get 0.
    """
    btu_to_kwh = 0.000293071  # 1 BTU = 0.000293071 kWH
    fuel = df[col + '_fuel']
    miles = df[distance_miles]
    intensity = df['ei_' + col]

    # Gasoline and diesel share one formula; electric adds the per-trip term instead.
    combustion_kwh = miles * intensity * btu_to_kwh
    electric_kwh = (miles * intensity) + df['ei_trip_' + col]

    df[col + '_EI(kWH)'] = np.select(
        [fuel == 'gasoline', fuel == 'diesel', fuel == 'electric'],
        [combustion_kwh, combustion_kwh, electric_kwh],
    )
    return df

def energy_impact_kWH(df,distance_miles):
    """Add ``Energy_Impact(kWH)`` = replaced-mode footprint minus confirmed-mode footprint.

    The confirmed-mode footprint is computed first if its column is missing; the
    replaced-mode footprint is always (re)computed. The impact is rounded to 3 decimals.
    """
    confirmed_kwh = 'Mode_confirm_EI(kWH)'
    replaced_kwh = 'Replaced_mode_EI(kWH)'

    if confirmed_kwh not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        df = energy_footprint_kWH(df, distance_miles, "Mode_confirm")
    df = energy_footprint_kWH(df, distance_miles, "Replaced_mode")

    df['Energy_Impact(kWH)'] = (df[replaced_kwh] - df[confirmed_kwh]).round(3)
    return df

def CO2_footprint_lb(df, distance_miles, col):
    """Add ``<col>_lb_CO2``: the CO2 footprint of each trip for the given mode column.

    Inputs:
    df = dataframe with data (needs '<col>_fuel', 'ei_<col>', 'ei_trip_<col>' and 'CO2_<col>')
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Rows whose fuel is not gasoline, diesel or electric get 0.
    """
    fuel = df[col + '_fuel']
    co2_factor = df['CO2_' + col]
    distance_energy = df[distance_miles] * df['ei_' + col]

    # Gasoline and diesel share one formula (scaled by 1e-6); electric adds the
    # per-trip term and is scaled by 1e-3.
    combustion_lb = (distance_energy * 0.000001) * co2_factor
    electric_lb = ((distance_energy + df['ei_trip_' + col]) * 0.001) * co2_factor

    df[col + '_lb_CO2'] = np.select(
        [fuel == 'gasoline', fuel == 'diesel', fuel == 'electric'],
        [combustion_lb, combustion_lb, electric_lb],
    )
    return df
    
def CO2_impact_lb(df,distance_miles):
    """Add ``CO2_Impact(lb)`` = replaced-mode CO2 minus confirmed-mode CO2.

    The confirmed-mode footprint is computed first if its column is missing; the
    replaced-mode footprint is always (re)computed. The impact is rounded to 3 decimals.
    """
    confirmed_lb = 'Mode_confirm_lb_CO2'
    replaced_lb = 'Replaced_mode_lb_CO2'

    if confirmed_lb not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        df = CO2_footprint_lb(df, distance_miles, "Mode_confirm")
    df = CO2_footprint_lb(df, distance_miles, "Replaced_mode")

    df['CO2_Impact(lb)'] = (df[replaced_lb] - df[confirmed_lb]).round(3)
    return df

In [None]:
# Path configuration.
# Data folder where the composite data was written from the TSDC_data file.
to_data_folder = "PaperVizualizations/Data/abby_ceo/sc"

In [None]:
# We are using Smart Commute data; the zip containing this csv was taken from OneDrive.
# Earlier location: 'viz_scripts/abby_ceo/sc/analysis_confirmed_trip.csv'
# Alternative input (unused): to_data_folder + "/tsdc_filtered_merged_trips.csv"
df = pd.read_csv(f"{to_data_folder}/analysis_confirmed_trip.csv")

In [None]:
# List the columns of the Smart Commute trip table loaded above.
print(df.columns)

In [None]:

# Show the raw-label -> cleaned-mode mapping (labels not listed fall back to 'Other').
pprint(dic_re)

In [None]:

async def calculate_naive_and_cheer(passed_df: pd.DataFrame):
    """Add the naive and CHEER footprint columns to a confirmed-trip table.

    Steps, in order:
    1. Rename survey/trip columns to short names and map the raw mode,
       replaced-mode and purpose labels through ``dic_re`` / ``dic_pur``.
    2. Derive ``date_time``, ``duration``, ``distance_miles`` and ``distance_km``.
    3. Run the naive calculation (``add_energy_impact``) and print the quality
       text for ``mode_of_interest``.
    4. Drop trips without a confirmed mode or mapped to 'Not a Trip'.
    5. For every remaining trip call ``emcfc.calc_footprint_for_trip`` and store
       the results in ``cheer_*`` columns, plus ``UACE`` for bus trips.

    Parameters:
        passed_df: confirmed-trip table. It is NOT modified; all work happens on
            a renamed copy, so the caller's frame looks the same whether or not
            this function ran.

    Returns:
        pd.DataFrame: the filtered trips with the naive and CHEER columns added.
    """
    # rename columns for easier access
    column_aliases = {
        'user_id_socio': 'user_id',
        'please_identify_which_category_represents_your_total_household_': 'HHINC',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        ' how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul ': 'VEH',
        'in_which_year_were_you_born?': 'AGE',
        'including_yourself_how_many_people_live_in_your_home?': 'HHSIZE',
        'how_many_children_under_age_18_live_in_your_home?': 'CHILDREN',
        'what_is_your_gender?': 'GENDER',
        'if_you_were_unable_to_use_your_household_vehicles_which_of_the_': 'available_modes',
        'are_you_a_student?': 'STUDENT',
        'data_duration': 'duration',
        'data_distance': 'distance'
    }
    # rename() without inplace returns a new frame, leaving the caller's data untouched
    df_mapped = passed_df.rename(columns=column_aliases, errors='ignore')

    # map modes, replaced modes, and trip purposes
    df_mapped['Mode_confirm'] = df_mapped['data_user_input_mode_confirm'].map(dic_re)
    df_mapped['Replaced_mode'] = df_mapped['data_user_input_replaced_mode'].map(dic_re)
    df_mapped['Trip_purpose'] = df_mapped['data_user_input_purpose_confirm'].map(dic_pur)

    # date and duration transformations
    df_mapped = df_mapped.rename(columns={
        'data_start_local_dt_year': 'year',
        'data_start_local_dt_month': 'month',
        'data_start_local_dt_day': 'day',
    })
    df_mapped['date_time'] = pd.to_datetime(df_mapped[['year', 'month', 'day']])
    df_mapped['duration'] = df_mapped['duration'] / 60  # Convert to minutes
    df_mapped['distance_miles'] = df_mapped['distance'] * 0.0006213712  # Convert to miles
    df_mapped['distance_km'] = df_mapped['distance'] * 0.001  # Convert to kilometers

    # add energy impact (skipped for an empty table, which then has no footprint columns)
    expanded_ct = add_energy_impact(df_mapped, df_ei, dic_fuel) if len(df_mapped) > 0 else df_mapped

    # print the quality text for the mode of interest
    data_eb = expanded_ct.query(f"Mode_confirm == '{mode_of_interest}'") if "Mode_confirm" in expanded_ct.columns else expanded_ct
    get_quality_text(expanded_ct, data_eb, mode_of_interest)

    # modify default_json to include pilot_ebike
    default_json = await emcfu.read_json_resource('label-options.default.json')
    default_json['MODE'].append({"value": "pilot_ebike", "base_mode": "E_BIKE"})

    # drop rows without a mode or marked as "Not a Trip"; the explicit copy makes
    # the per-row writes below land in an independent frame instead of a slice
    expanded_ct = expanded_ct.dropna(subset=['data_user_input_mode_confirm'])
    expanded_ct = expanded_ct[expanded_ct['Mode_confirm'] != 'Not a Trip'].copy()

    # commute data calculation helper function
    async def get_commute_data(row, labels):
        """Return the CHEER results for the trip's own mode and, if given, its replaced mode."""
        trip_object = {
            "_id": row['_id'],
            "distance": row['distance'],
            "start_fmt_time": row['data_start_fmt_time'],
            "start_loc": {"coordinates": [row['data_start_loc_longitude'], row['data_start_loc_latitude']]},
            "user_input": {
                "mode_confirm": row['data_user_input_mode_confirm'],
                "replaced_mode_confirm": row['data_user_input_replaced_mode']
            }
        }
        footprint = await emcfc.calc_footprint_for_trip(trip_object, labels)
        # NOTE(review): presumably the 'replaced_mode' key makes CHEER read
        # user_input['replaced_mode_confirm'] -- confirm against e-mission-common.
        if pd.isna(row['data_user_input_replaced_mode']):
            replaced_footprint = {}
        else:
            replaced_footprint = await emcfc.calc_footprint_for_trip(trip_object, labels, 'replaced_mode')

        return {'footprint': footprint, 'replaced_footprint': replaced_footprint}

    async def get_commute_data_task(row, index, target_df, labels):
        """Compute the CHEER results for one trip and write them into ``target_df`` at ``index``."""
        commute_data = await get_commute_data(row, labels)
        # element 0 is read as the footprint values, element 1 as the accompanying metadata
        footprint_values = commute_data['footprint'][0]

        # update target_df with commute data
        if 'kg_co2' in footprint_values and 'kwh' in footprint_values:
            target_df.at[index, 'cheer_kg_co2'] = footprint_values['kg_co2']
            target_df.at[index, 'cheer_kwh'] = footprint_values['kwh']
            if len(commute_data['replaced_footprint']) > 0:
                replaced_values = commute_data['replaced_footprint'][0]
                target_df.at[index, 'cheer_replaced_kg_co2'] = replaced_values['kg_co2']
                target_df.at[index, 'cheer_replaced_kwh'] = replaced_values['kwh']
                for uncertain_column in ['kg_co2_uncertain', 'kwh_uncertain']:
                    if uncertain_column in replaced_values:
                        target_df.at[index, f'cheer_replaced_{uncertain_column}'] = replaced_values[uncertain_column]

        # conditional assignment of UACE: only bus trips that report a code get one
        footprint_metadata = commute_data['footprint'][1]
        if row['data_user_input_mode_confirm'] == 'bus' and 'ntd_uace_code' in footprint_metadata:
            target_df.at[index, 'UACE'] = footprint_metadata['ntd_uace_code']
        else:
            target_df.at[index, 'UACE'] = float('nan')  # not a bus trip, or no UACE provided

    async def run_tasks_concurrently():
        """Run the per-trip CHEER calls with at most ``concurrency_limit`` in flight."""
        # this is meant to speed up the async calls of cheer
        concurrency_limit = 200
        semaphore = asyncio.Semaphore(concurrency_limit)

        async def sem_task(row, index, target_df, labels):
            async with semaphore:
                await get_commute_data_task(row, index, target_df, labels)

        tasks = [
            sem_task(row, index, expanded_ct, default_json)
            for index, row in expanded_ct.iterrows()
        ]
        await asyncio.gather(*tasks)

    await run_tasks_concurrently()

    # naive footprint in kg, computed once for the whole column (0.453592 kg per lb);
    # the column is absent when the input table was empty
    if 'Mode_confirm_lb_CO2' in expanded_ct.columns:
        expanded_ct['Mode_confirm_kg_CO2'] = expanded_ct['Mode_confirm_lb_CO2'] * 0.453592

    return expanded_ct


In [None]:
# Create every figure output directory up front (no error if it already exists).
for figure_subdir in ('pertrip', 'perkm', 'cumulative', 'naivenaive'):
    os.makedirs(f'figures/{figure_subdir}', exist_ok=True)

In [None]:
def plotter(passed_expanded_ct: pd.DataFrame, name_of_dataset: str):
    """Produce every naive-vs-CHEER comparison figure and table for one dataset.

    Trips whose ``Mode_confirm`` is 'Air' or 'Other' are excluded. The function
    then, in order:
    * saves and shows the per-trip average CO2 plot (linear and log scale),
    * saves and shows the cumulative kg CO2 and kWh plots,
    * saves and shows the per-km average plot with one panel per year,
    * prints a LaTeX table of the yearly CHEER per-km averages,
    * returns the cumulative percentage difference between CHEER and naive.

    Parameters:
        passed_expanded_ct: trips with (at least) Mode_confirm, year, distance_km,
            Mode_confirm_kg_CO2, cheer_kg_co2, Mode_confirm_EI(kWH) and cheer_kwh.
        name_of_dataset: human-readable name; used in titles, in the LaTeX caption
            and (lower-cased, spaces -> underscores) in file names.

    Returns:
        dict: ``{name_of_dataset: {mode: percentage difference}}``.
    """
    bar_width = 0.4
    dataset_slug = name_of_dataset.lower().replace(" ", "_")

    print(name_of_dataset)
    print(list(passed_expanded_ct.columns))

    # filter out 'Air' and 'Other' from Mode_confirm
    expanded_ct_filtered = passed_expanded_ct[~passed_expanded_ct['Mode_confirm'].isin(['Air', 'Other'])]

    # calculate per-trip average emissions for the all-years combined plot
    grouped_data_filtered = expanded_ct_filtered.groupby('Mode_confirm')[['Mode_confirm_kg_CO2', 'cheer_kg_co2']].mean()

    def shift_tick_labels(axis):
        """Nudge the rotated x tick labels 10 units to the right."""
        trans = mtrans.Affine2D().translate(10, 0)
        for tick_label in axis.get_xticklabels():
            tick_label.set_transform(tick_label.get_transform() + trans)

    # function to create and save a comparison plot for all years combined (per trip)
    def calculation_comparison_all_years(scale: str):
        """Bar chart of the mean kg CO2 per trip for each mode; ``scale='log'`` uses a log y-axis."""
        fig, ax = plt.subplots(figsize=(8, 5))

        x = np.arange(len(grouped_data_filtered.index))  # x locations for the groups

        ax.bar(x - bar_width / 2, grouped_data_filtered['Mode_confirm_kg_CO2'], width=bar_width, color='lightblue', label='(Dashboard, 2020)')
        ax.bar(x + bar_width / 2, grouped_data_filtered['cheer_kg_co2'], width=bar_width, color='orange', label='CHEER')

        ax.set_ylabel('Average CO2 Emissions per Trip (kg)')
        ax.set_xlabel('Mode of Transportation')

        # apply scale settings
        if scale == 'log':
            ax.set_yscale('log')
            ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:g}'))

        # shift x-tick labels slightly
        ax.set_xticks(x)
        ax.set_xticklabels(grouped_data_filtered.index, rotation=45, ha='right')
        shift_tick_labels(ax)

        ax.legend(loc='upper center')
        fig.tight_layout()

        # save as PDF and PNG in the pertrip directory
        fig.savefig(f'figures/pertrip/{dataset_slug}_{scale}_per_trip.pdf')
        fig.savefig(f'figures/pertrip/{dataset_slug}_{scale}_per_trip.png')
        plt.show()
        plt.close(fig)

    # Function to create cumulative emissions plot
    def cumulative_emissions_plot(y_naive: str, y_cheer: str, y_label: str, file_suffix: str):
        """
        Plots cumulative emissions using specified columns for Naive and CHEER calculations.
        :param y_naive: Column name for Naive emissions.
        :param y_cheer: Column name for CHEER emissions.
        :param y_label: Y-axis label for the plot.
        :param file_suffix: Suffix for file names.
        """
        # calculate cumulative emissions for each mode
        cumulative_emissions = expanded_ct_filtered.groupby('Mode_confirm')[[y_naive, y_cheer]].sum()

        # keep only the modes with a positive total in at least one of the two calculations
        cumulative_emissions = cumulative_emissions[(cumulative_emissions[y_naive] > 0) | (cumulative_emissions[y_cheer] > 0)]

        fig, ax = plt.subplots(figsize=(6, 5))
        x = np.arange(len(cumulative_emissions.index))

        # naive emissions
        ax.bar(
            x - bar_width / 2,
            cumulative_emissions[y_naive],
            width=bar_width,
            color='green',
            label='(Dashboard, 2020)',
            edgecolor='black'
        )

        # CHEER emissions
        ax.bar(
            x + bar_width / 2,
            cumulative_emissions[y_cheer],
            width=bar_width,
            color='purple',
            label='CHEER',
            edgecolor='black'
        )

        ax.set_ylabel(y_label)
        ax.set_xlabel('Mode of Transportation')
        ax.set_xticks(x)
        ax.set_xticklabels(cumulative_emissions.index, rotation=45, ha='right')
        ax.legend(loc='upper left')

        fig.tight_layout()
        fig.savefig(f'figures/cumulative/{dataset_slug}_{file_suffix}_cumulative_emissions.png')
        fig.savefig(f'figures/cumulative/{dataset_slug}_{file_suffix}_cumulative_emissions.pdf')
        plt.show()
        plt.close(fig)

    # function for yearly comparison with average per km
    def calculation_comparison_for_each_year():
        """One panel per year showing the mean kg CO2 per km for each mode."""
        # rows without a year cannot be assigned to a panel (and int(NaN) would fail in the title)
        years = sorted(expanded_ct_filtered['year'].dropna().unique())
        num_years = len(years)
        if num_years == 0:
            # plt.subplots cannot build a grid with zero columns
            print(f'No trips with a year in {name_of_dataset}; skipping the yearly per-km comparison')
            return

        fig, axes = plt.subplots(1, num_years, figsize=(6 * num_years, 5), sharey=True)

        # ensure axes is iterable
        if num_years == 1:
            axes = [axes]

        for i, year in enumerate(years):
            year_data = expanded_ct_filtered[expanded_ct_filtered['year'] == year]

            # calculate per-km average emissions; assign() builds a new frame
            # instead of writing into the filtered slice
            year_data = year_data.assign(
                CO2_per_km=year_data['Mode_confirm_kg_CO2'] / year_data['distance_km'],
                cheer_CO2_per_km=year_data['cheer_kg_co2'] / year_data['distance_km'],
            )
            grouped_year_data = year_data.groupby('Mode_confirm')[['CO2_per_km', 'cheer_CO2_per_km']].mean()

            x = np.arange(len(grouped_year_data.index))

            axes[i].bar(x - bar_width / 2, grouped_year_data['CO2_per_km'], width=bar_width, color='lightblue', label='(Dashboard, 2020)')
            axes[i].bar(x + bar_width / 2, grouped_year_data['cheer_CO2_per_km'], width=bar_width, color='orange', label='CHEER')

            # titles and labels for each subplot
            axes[i].set_title(f'Year {int(year)}\n{name_of_dataset} Dataset')
            axes[i].set_ylabel('Average CO2 Emissions per km (kg)' if i == 0 else "")
            axes[i].set_xlabel('Mode of Transportation')
            axes[i].set_ylim(0, 0.65)
            axes[i].legend(loc='upper right')

            # shift x-tick labels
            axes[i].set_xticks(x)
            axes[i].set_xticklabels(grouped_year_data.index, rotation=45, ha='right')
            shift_tick_labels(axes[i])

        fig.tight_layout()
        fig.savefig(f'figures/perkm/{dataset_slug}_yearly_comparison_per_km.pdf')
        fig.savefig(f'figures/perkm/{dataset_slug}_yearly_comparison_per_km.png')
        plt.show()
        plt.close(fig)

    def generate_latex_table():
        """Print a LaTeX table of the mean CHEER kg CO2 per km by mode (rows) and year (columns)."""
        years = sorted(expanded_ct_filtered['year'].dropna().unique())

        # dataframe for average CHEER CO2 emissions per km with modes as rows and years as columns
        modes = expanded_ct_filtered['Mode_confirm'].unique()
        average_co2_df = pd.DataFrame(index=modes, columns=years)

        # calculate average CO2 per km for CHEER only, for each mode and year
        for year in years:
            year_data = expanded_ct_filtered[expanded_ct_filtered['year'] == year]
            cheer_co2_per_km = year_data['cheer_kg_co2'] / year_data['distance_km']
            average_co2_df[year] = cheer_co2_per_km.groupby(year_data['Mode_confirm']).mean()

        # calculate percentage change between the first and last year and add as a new column
        if len(years) > 1:
            percentage_change = ((average_co2_df[years[-1]] - average_co2_df[years[0]]) / average_co2_df[years[0]]) * 100
            average_co2_df['% Change'] = percentage_change

        # explicitly drop specific modes like "Walk", "Regular Bike", and "Bikeshare"
        rows_to_drop = ["Walk", "Regular Bike", "Bikeshare"]
        average_co2_df = average_co2_df.drop(rows_to_drop, errors='ignore')  # Ignore errors if the row isn't there

        # drop rows where every value is zero or NaN; NaN is treated as zero here
        # because `NaN != 0` is True and would otherwise keep all-NaN rows
        has_signal = average_co2_df.astype(float).fillna(0).ne(0).any(axis=1)
        average_co2_df = average_co2_df[has_signal]

        # sort the DataFrame by index (modes) alphabetically
        average_co2_df = average_co2_df.sort_index()

        # full table environment for paper; caption/label follow the dataset being
        # plotted, and the column spec matches the columns actually present
        # ('% Change' only exists when there is more than one year)
        table_body = average_co2_df.to_latex(
            index=True,
            header=True,
            na_rep="--",
            float_format="%.4f",
            column_format="l" + "r" * len(average_co2_df.columns),
        )
        latex_code = "\n".join([
            r"\begin{table}[h!]",
            r"\centering",
            r"\caption{Average CHEER CO2 Emissions per km by Mode and Year for " + name_of_dataset + ".}",
            r"\label{tab:avg_" + dataset_slug + "_cheer}",
            table_body,
            r"\end{table}",
        ])

        print(latex_code)

    def calculate_percentage_difference_table():
        """Return the percentage difference of cumulative CHEER vs naive kg CO2 for each mode."""
        # calculate cumulative CO2 emissions for each mode
        cumulative_emissions = expanded_ct_filtered.groupby('Mode_confirm')[['Mode_confirm_kg_CO2', 'cheer_kg_co2']].sum()

        # calculate percentage difference between naive and CHEER
        cumulative_emissions['percentage_diff'] = ((cumulative_emissions['cheer_kg_co2'] - cumulative_emissions['Mode_confirm_kg_CO2']) / cumulative_emissions['Mode_confirm_kg_CO2']) * 100

        # reset index and filter out the listed modes
        # NOTE(review): the label is spelled 'Skate board' here -- confirm it matches mode_labels.csv
        cumulative_emissions.reset_index(inplace=True)
        cumulative_emissions = cumulative_emissions[~cumulative_emissions['Mode_confirm'].isin(['Walk', 'Regular Bike', 'Bikeshare', 'Skate board'])]

        # create a dictionary from the filtered df
        percentage_diff_dict = cumulative_emissions.set_index('Mode_confirm')['percentage_diff'].to_dict()

        # return the dictionary in a format suitable for the main dictionary structure
        return {name_of_dataset: percentage_diff_dict}

    # create per-trip average plot for all years
    calculation_comparison_all_years(scale='normal')
    calculation_comparison_all_years(scale='log')

    # call the function for both kg and kWh plots
    cumulative_emissions_plot(
        y_naive='Mode_confirm_kg_CO2',
        y_cheer='cheer_kg_co2',
        y_label='Cumulative CO2 Emissions (kg)',
        file_suffix='kg'
    )

    cumulative_emissions_plot(
        y_naive='Mode_confirm_EI(kWH)',
        y_cheer='cheer_kwh',
        y_label='Cumulative Energy Impact (kWh)',
        file_suffix='kwh'
    )

    # create per-km average plot for each year
    calculation_comparison_for_each_year()

    # generate table for percentage change in CO2
    generate_latex_table()

    # generate table for percentage difference in cumulative emissions between Naive and CHEER
    print('lastly, naive and cheer percentage')

    return calculate_percentage_difference_table()


In [None]:
# Make sure the top-level figures directory exists (no error if it is already there).
os.makedirs('figures', exist_ok=True)

In [None]:

# The purpose of this function is to avoid repeating the CHEER calls,
# which take a long time.
# It could be made even faster if CHEER used shapefiles instead of HTTP calls,
# but that would require code changes to e-mission-common.
async def load_or_fetch_data(dataset_name: str, original_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the dataset with emissions, reusing the CSV cache when one exists.

    The cache file is ``"<dataset_name>-with-emissions.csv"`` in the working
    directory. When it is missing, ``original_df`` is processed with
    ``calculate_naive_and_cheer`` and the result is written there (without the
    index) before being returned.

    Parameters:
        dataset_name (str): The base name of the dataset (used in the filename).
        original_df (pd.DataFrame): The original DataFrame to process if the CSV does not exist.

    Returns:
        pd.DataFrame: The loaded or newly processed dataset.
    """
    cache_file = f"{dataset_name}-with-emissions.csv"

    # cache hit: nothing to compute
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    # cache miss: compute asynchronously, then persist for the next run
    computed = await calculate_naive_and_cheer(original_df)
    computed.to_csv(cache_file, index=False)
    return computed


# Load the cached Smart Commute results, or compute them from `df` on the first run.
# Top-level `await` is a notebook feature; this line is not valid in a plain Python script.
smartcommute = await load_or_fetch_data("smart_commute", df)

In [None]:
# Generate every Smart Commute figure/table and keep the returned percentage-difference dictionary.
sc_difference = plotter(smartcommute, "Smart Commute")

In [None]:
# Per-mode percentage difference between cumulative CHEER and naive CO2, keyed by dataset name.
print(sc_difference)

In [None]:
# Lookup dictionary for city/program names and folder codes
lookupdictionary = {
    "sc": "sc_21",
    "boulder": "cc_21",
    "fortcollins": "fc_21",
    "pueblo": "pc_21",
    "durango": "4c_21",
    "vail": "vail_22",
}

# list to store datasets
datasets = []

for biggername, program_code in lookupdictionary.items():
    print(f'\nStarting with {biggername} ({program_code})')

    # this is custom made for the data that was given to us by tsdc.
    trip_file = f"CanBikeCO_2/ceo_{program_code}/analysis_confirmed_trip.csv"

    trips = pd.read_csv(trip_file, engine="python", on_bad_lines='skip')
    # this is really important for canbikeco or else data got mixed together.
    # reset_index returns a new frame instead of modifying `trips`, so the result
    # has to be assigned back for the reset to take effect.
    trips = trips.reset_index(drop=True)
    print(len(trips), 'trips')
    print(trips.perno.nunique(), 'people')

    # add program code to the dataset
    trips['program'] = program_code.upper()

    # append this program's trips to the datasets list
    datasets.append(trips)

# concatenate all datasets into single df
full_data = pd.concat(datasets, ignore_index=True)
print(len(full_data), 'trips')
print(full_data.perno.nunique(), 'unique users')

In [None]:
#
#
# TEMPORARY. REMOVE
#
# full_data = full_data[full_data['data_user_input_mode_confirm'] == 'walk'].head(55)
# full_data = full_data.head(20000)
# full_data

In [None]:
# Inspect the distinct raw mode labels present in the combined CanBikeCO data.
print(full_data['data_user_input_mode_confirm'].unique())

In [None]:
# Load the cached CanBikeCO results, or compute them from `full_data` on the first run.
# Top-level `await` is a notebook feature; this is not valid in a plain Python script.
canbikeco = await load_or_fetch_data("canbikeco",
                                     full_data)

In [None]:
# Number of distinct start longitudes (NaN counted once), then a preview of the bus trips.
print(canbikeco['data_start_loc_longitude'].nunique(dropna=False))
canbikeco.loc[canbikeco['data_user_input_mode_confirm'] == 'bus'].head(55)

In [None]:
# Bus trips only. Take an explicit copy: later cells add columns to `buses`, and
# writing into a filtered view of `canbikeco` triggers SettingWithCopyWarning.
buses = canbikeco[canbikeco['data_user_input_mode_confirm'] == 'bus'].copy()

# calculate the frequency and percentage of each unique value in the UACE column, including NaN
uace_counts = buses['UACE'].value_counts(dropna=False)
uace_percentages = (uace_counts / len(buses)) * 100  # percentage of all bus trips

# combine the counts and percentages into one df
uace_summary = pd.DataFrame({
    'Count': uace_counts,
    'Percentage': uace_percentages
})


print(uace_summary)


In [None]:
# Census vintage: the chosen year rounded down to the start of its decade.
year = 2020
census_year = year - (year % 10)

# Bus trips for which no UACE was recorded.
buses_with_nan_uace = buses[buses['UACE'].isna()]

# Print the Census geocoder lookup URL for the start location of each of those trips.
for index, row in buses_with_nan_uace.iterrows():
    longitude = row['data_start_loc_longitude']
    latitude = row['data_start_loc_latitude']
    url = (
        "https://geocoding.geo.census.gov/geocoder/geographies/coordinates?"
        f"x={longitude}&y={latitude}"
        f"&benchmark=Public_AR_Current&vintage=Census{census_year}_Current&layers=87&format=json"
    )
    print(url)


In [None]:
# Mean naive kg CO2 of bus trips in UACE 23527 (the Denver area code), next to
# the mean over all bus trips.
naive_kg_co2 = buses['Mode_confirm_kg_CO2']
average_mode_confirm_kg_co2_uace_23527 = naive_kg_co2[buses['UACE'] == 23527].mean()
average_all_naive = naive_kg_co2.mean()

print("Average Mode_confirm_kg_CO2 for UACE 23527:", average_mode_confirm_kg_co2_uace_23527)
print(average_all_naive)

In [None]:

# import requests

# def get_state(lat, lon):
#     try:
#         url = f"https://nominatim.openstreetmap.org/reverse?lat={lat}&lon={lon}&format=json&addressdetails=1"
#         response = requests.get(url, verify=False)  # Disable SSL verification
#         data = response.json()
#         if "address" in data and "state" in data["address"]:
#             print(data["address"]["state"])
#             return data["address"]["state"]
#         return "Unknown"
#     except Exception as e:
#         print(f"Error for coordinates ({lat}, {lon}): {e}")
#         return "Unknown"

# # Apply to the dataframe
# buses['State'] = buses.progress_apply(
#     lambda row: get_state(row['data_start_loc_latitude'], row['data_start_loc_longitude']), axis=1
# )

# # Count occurrences of each state
# state_counts = buses['State'].value_counts()



In [None]:
# # Create a bar chart
# plt.figure(figsize=(12, 6))
# state_counts.plot(kind='bar', color='skyblue')
# plt.title('Number of Trips by State')
# plt.xlabel('State')
# plt.ylabel('Number of Trips')
# plt.yscale('log')
# plt.xticks(rotation=45)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.tight_layout()
# plt.show()

In [None]:
# calculate mean of cheer_kg_co2
mean_cheer_kg_co2 = buses['cheer_kg_co2'].mean()

# replace NaN in UACE with the label "Unknown", then encode each distinct value
# as a category code so it can be used as an x position
uace_filled = buses['UACE'].fillna('Unknown')
uace_category = uace_filled.astype('category')

# assign() returns a new frame, so nothing is written into a filtered view
buses = buses.assign(
    UACE_filled=uace_filled,
    UACE_numeric=uace_category.cat.codes,
)

# scatter plot of UACE vs cheer_kg_co2
plt.figure(figsize=(10, 6))
plt.scatter(buses['UACE_numeric'], buses['cheer_kg_co2'], alpha=0.5)

# horizontal line for the mean cheer_kg_co2
plt.axhline(mean_cheer_kg_co2, color='red', linestyle='--', linewidth=1, label=f'Mean cheer_kg_co2 = {mean_cheer_kg_co2:.2f}')

plt.title('Scatter Plot of UACE vs cheer_kg_co2')
plt.xlabel('UACE (as Numeric)')
plt.ylabel('cheer_kg_co2')
plt.grid(True)
plt.legend()

# Code i refers to categories[i], so the tick labels must come from the category
# list; unique() returns values in order of appearance, which can differ.
uace_unique = uace_category.cat.categories
plt.xticks(
    ticks=range(len(uace_unique)),
    labels=uace_unique,
    rotation=45
)

plt.tight_layout()
plt.show()


In [None]:
from matplotlib.ticker import FuncFormatter


# function to format the y-axis with commas
def format_y_axis(value, _):
    """Tick formatter: truncate the tick value to an int and add thousands separators.

    The second argument (tick position, supplied by matplotlib) is ignored.
    """
    return format(int(value), ",")

def _plot_log_histogram(column, color, title):
    """Draw a 30-bin histogram of ``buses[column]`` with a log-scaled frequency axis.

    The x-axis label is the column name itself; y tick labels are rendered
    through ``format_y_axis`` so large counts get thousands separators.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(buses[column], bins=30, alpha=0.7, color=color, edgecolor='black')

    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.yscale('log')
    plt.grid(axis='y')

    # commas needed for readability
    plt.gca().yaxis.set_major_formatter(FuncFormatter(format_y_axis))

    plt.tight_layout()
    plt.show()


# CHEER estimate first, then the Dashboard 2020 estimate, for the same bus trips
_plot_log_histogram('cheer_kg_co2', 'blue', 'Histogram of CHEER in bus trips for CanBikeCO')
_plot_log_histogram('Mode_confirm_kg_CO2', 'green', 'Histogram of Dashboard 2020 in bus trips for CanBikeCO')


In [None]:
# Which UACE values correspond to higher CO2 emissions?
# Scatter cheer_kg_co2 against the UACE category code, coloured by trip distance.

plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    buses['UACE_numeric'],
    buses['cheer_kg_co2'],
    c=buses['distance'],  # colour encodes the 'distance' column
    cmap='cool',
    alpha=0.7
)

# colour bar for the 'distance' encoding
cbar = plt.colorbar(scatter)
cbar.set_label('distance')

# UACE_numeric holds category codes of UACE_filled, so tick i must be labelled
# with category i. `.unique()` returns values in order of first appearance,
# which does not generally match the category order and would mislabel ticks.
uace_unique = buses['UACE_filled'].astype('category').cat.categories
plt.xticks(
    ticks=range(len(uace_unique)),
    labels=uace_unique,
    rotation=45
)

plt.title('UACE vs cheer_kg_co2 with data_distance as Color')
plt.xlabel('UACE')
plt.ylabel('cheer_kg_co2')
plt.tight_layout()
plt.show()


In [None]:

# trips whose cleaned confirmed mode is 'Bus'
bus_data = canbikeco.loc[canbikeco['Mode_confirm'] == 'Bus']

# per-trip means: CHEER emissions, Dashboard ("naive") emissions, and distance
average_cheer_kg_co2_bus, avg_naive, avg_distance = [
    bus_data[column_name].mean()
    for column_name in ('cheer_kg_co2', 'Mode_confirm_kg_CO2', 'distance')
]

# print(list(canbikeco.columns))
# cross-check using the raw user-input label ('bus') instead of the cleaned one
testerrr = canbikeco.loc[canbikeco['data_user_input_mode_confirm'] == 'bus']
print(testerrr['cheer_kg_co2'].mean())
print('?')

print(f"The average CHEER CO₂ emissions for 'Bus' is: {average_cheer_kg_co2_bus} kg")
print("Naive avg co2 emissions for bus", avg_naive)
print('distance', avg_distance)

In [None]:
# debug cheer
# For some reason, Walk trips were showing non-zero emissions. This was because we weren't resetting the index.
# This is fixed now, but the cells below are kept for reference.

In [None]:
# 'Walk' trips that nevertheless have positive CHEER emissions (the symptom being debugged)
whatsthis = canbikeco[(canbikeco['Mode_confirm'] == 'Walk') & (canbikeco['cheer_kg_co2'] > 0)]
# show the first such row; the column-dropping step that used to precede this is commented out below
whatsthis.head(1)
# whatsthis_filtered = whatsthis.drop(columns=['cheer_kg_co2', 'cheer_kwh', 'cheer_replaced_kg_co2', 'cheer_replaced_kwh'])

# # Select the first row and convert it to a dictionary
# first_row_dict = whatsthis_filtered.iloc[0].to_dict()

# # Print the dictionary
# # print(first_row_dict)


In [None]:
# goodone = canbikeco[(canbikeco['Mode_confirm'] == 'Walk') & (canbikeco['cheer_kg_co2'] == 0)]
# print(goodone.head(1))

In [None]:

# combined_df = pd.concat([whatsthis_filtered.iloc[0], goodone.head(1)], ignore_index=True)
# combined_df

In [None]:
# plotter is defined earlier in the notebook; its result is later treated as a
# {dataset name: {mode: value}} mapping when the LaTeX table is built
co_difference = plotter(canbikeco, "CanBikeCO")

In [None]:
# inspect the value returned by plotter for CanBikeCO
print(co_difference)

In [None]:
# first two 'Walk' trips, displayed for manual inspection
a_test = canbikeco[canbikeco['Mode_confirm'] == 'Walk'].head(2)
a_test
# a_test = pd.DataFrame(a_test
#                      )
# row = a_test.iloc[0]
# print(a_test.to_string())

In [None]:
# print(replaced_footprint)


In [None]:
# Bull Durham

In [None]:
# Extract the Bull survey archive once; an existing 'bull' directory means it was already done.
if not os.path.isdir('bull'):
    # the archive has to be obtained manually, so fail loudly when it is absent
    if not os.path.isfile('tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2.zip'):
        raise ValueError("i need the zip, ask TSDC")
    with zipfile.ZipFile('tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2.zip', 'r') as zip_ref:
        zip_ref.extractall('bull')
    print("Unzipped to 'bull' directory.")

In [None]:
# confirmed-trip table from the extracted Bull archive
bull = pd.read_csv('bull/tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2/data/analysis_confirmed_trip.csv')

In [None]:
# load_or_fetch_data is a coroutine defined earlier in the notebook; the first
# argument looks like a cache/dataset key -- TODO confirm against its definition
bull_loaded = await load_or_fetch_data("bull-durham",
                                     bull)

In [None]:
# the label passed here must match the 'Bull (Durham, NC) eBike' key of dataset_map used for the LaTeX table
bull_difference = plotter(bull_loaded, "Bull (Durham, NC) eBike")

In [None]:
# MassCEC

In [None]:
# check if 'MassCEC' directory exists
# Extract the MassCEC archive once; an existing 'MassCEC' directory means it was already done.
if not os.path.isdir('MassCEC'):
    # the archive has to be obtained manually, so fail loudly when it is absent
    if not os.path.isfile('mass_jacques.tar.gz'):
        raise ValueError("I need the tar.gz file, ask TSDC")
    with tarfile.open('mass_jacques.tar.gz', 'r:gz') as tar_ref:
        # Tar members can carry absolute paths, '..' components or links that
        # escape the target directory. The 'data' extraction filter rejects
        # those and silences the Python 3.12+ DeprecationWarning; fall back to
        # the plain call on interpreters that predate extraction filters.
        if hasattr(tarfile, 'data_filter'):
            tar_ref.extractall('MassCEC', filter='data')
        else:
            tar_ref.extractall('MassCEC')
    print("Extracted to 'MassCEC' directory.")

In [None]:
# confirmed-trip table from the extracted MassCEC archive
mass = pd.read_csv('MassCEC/mass_jacques/analysis_confirmed_trip.csv')
# print(mass.columns)
# list the raw user-input mode labels present in this dataset
print(mass['data_user_input_mode_confirm'].unique())

In [None]:
# mass[mass['data_user_input_mode_confirm'] == 'bus'].head(55)
# row count before load_or_fetch_data, for comparison with len(mass_loaded) below
print(len(mass))

In [None]:
# load_or_fetch_data is a coroutine defined earlier in the notebook; the first
# argument looks like a cache/dataset key -- TODO confirm against its definition
mass_loaded = await load_or_fetch_data("masscec",
                                       mass)


In [None]:
# mass_loaded[mass_loaded['Mode_confirm'] == 'bus'].head(55)
# row count after load_or_fetch_data
print(len(mass_loaded))

In [None]:
# distinct values of the 'Mode_confirm' column in the loaded frame
print(mass_loaded['Mode_confirm'].unique())

In [None]:
# the label passed here must match the 'MassCEC' key of dataset_map used for the LaTeX table
mass_difference = plotter(mass_loaded, "MassCEC")

In [None]:
# full column listing of the loaded frame, for reference
print(list(mass_loaded.columns))

In [None]:
# expanded_ct.head(30)

In [None]:
# one {dataset name: {mode: percentage difference}} mapping per dataset
differences = [co_difference, bull_difference, mass_difference]

# map dataset names to acronyms
dataset_map = {
    'CanBikeCO': 'CO',
    'Bull (Durham, NC) eBike': 'NC',
    'MassCEC': 'MA'
}


def _format_percent_cell(value):
    """Render one table cell: ``"12.34\\%"`` for a number, ``"--"`` when missing.

    "Missing" covers None, the literal string "NaN", and real NaN values
    (float('nan'), numpy NaN, pd.NA). The last group used to slip through a
    plain membership test and was printed as ``nan\\%``.
    """
    if value is None or value == "NaN" or pd.isna(value):
        return "--"
    return fr"{float(value):.2f}\%"


# extract dataset names (first key of each mapping) and the union of all modes
dataset_names = [dataset_map[next(iter(d))] for d in differences]
modes = sorted(set().union(*(d[next(iter(d))].keys() for d in differences)))

# one column of formatted percentage differences per dataset, rows ordered like `modes`
table_data = {
    dataset_map[name]: [_format_percent_cell(d[name].get(mode)) for mode in modes]
    for d in differences
    for name in d
}
table_data['Mode'] = modes

# convert to dataframe and set 'Mode' as the first column
df_table = pd.DataFrame(table_data)
df_table = df_table[['Mode'] + dataset_names]

# convert to latex; escape=False keeps the literal `\%` produced above
latex_code = df_table.to_latex(
    index=False,
    header=True,
    column_format="|l|c|c|c|",
    escape=False,
    caption="Percentage differences in cumulative CO2 emissions by mode for each dataset between naive and CHEER, for the average trip taken of that mode.",
    label="tab:naive_cheer_percent_diff"  # Specify the label here
)
print(latex_code)



In [None]:
# Naive Squared

In [None]:
from emission.core.wrapper.modeprediction import PredictedModeTypes
import ast


def section_reader_naive_naive(dataframe: pd.DataFrame,
                               dataset_name: str,
                               full_dataframe: pd.DataFrame):
    """Estimate per-section CO2 from the sensed mode alone and plot distance by mode.

    Steps:
      1. Keep only sections whose trip ID occurs in
         ``full_dataframe['data_cleaned_trip']``.
      2. Add ``predicted_mode_name`` (from ``data_sensed_mode``), ``distance_km``
         (``data_distance`` scaled by 0.001), ``average_speed`` (mean of
         ``data_speeds``) and ``co2_emission_kg_naive_naive`` (fixed per-km
         factor for the predicted mode times ``distance_km``).
      3. Print the average speed per mode and raw / row-normalised crosstabs of
         user-confirmed mode vs. predicted mode.
      4. Save and show a bar chart of cumulative distance per predicted mode.

    Args:
        dataframe: inferred sections; must contain ``data_sensed_mode``,
            ``data_distance``, ``data_speeds`` and one of the trip-ID columns
            ``data_trip_id`` / ``tripno`` / ``data_cleaned_trip``.
        dataset_name: label used to build the output figure's file name.
        full_dataframe: trip table providing ``data_cleaned_trip`` and
            ``data_user_input_mode_confirm``.

    Returns:
        A filtered copy of ``dataframe`` with the columns from step 2 added.
        The caller's frame is not modified.
    """
    # determine the correct column name to use for trip IDs
    trip_id_column = 'data_trip_id' if 'data_trip_id' in dataframe.columns \
        else 'tripno' if 'tripno' in dataframe.columns \
        else 'data_cleaned_trip'

    # filter dataframe to include only rows where the trip IDs are in full_dataframe
    filtered_dataframe = dataframe[dataframe[trip_id_column].isin(full_dataframe['data_cleaned_trip'])]

    print(f"Number of sections before filtering: {len(dataframe)}")
    print(f"Number of sections after filtering: {len(filtered_dataframe)}")

    # copy so the column assignments below never touch the caller's frame
    dataframe = filtered_dataframe.copy()

    def map_mode_to_string(mode_number):
        """Translate a numeric sensed mode to its title-cased enum name."""
        try:
            return PredictedModeTypes(mode_number).name.title()
        except ValueError:
            return "Unknown"  # unexpected numbers

    dataframe['predicted_mode_name'] = dataframe['data_sensed_mode'].apply(map_mode_to_string)
    dataframe['distance_km'] = dataframe['data_distance'] * 0.001  # presumably metres -> km

    def calculate_average_speed(speeds):
        """Mean of a list of speeds; accepts the list or its string repr."""
        if isinstance(speeds, str):
            # CSV round-trips store the list as text; parse it back safely
            speeds = ast.literal_eval(speeds)
        if isinstance(speeds, list) and speeds:
            return sum(float(speed) for speed in speeds) / len(speeds)
        return 0  # return 0 if speeds is empty or not a list

    dataframe['average_speed'] = dataframe['data_speeds'].apply(calculate_average_speed)

    # calculate the average speed for each mode
    average_speed_per_mode = dataframe.groupby('predicted_mode_name')['average_speed'].mean().reset_index()
    average_speed_per_mode.columns = ['predicted_mode_name', 'average_speed']

    print("Average speed per mode:")
    print(average_speed_per_mode)

    # emission factors in grams CO2 per km, keyed by predicted mode name
    g_pkm = {
        'Car': 172.78,  # ICEV
        'Train': 57.17,
        'Subway': 57.17,  # treat Subway as Train
        'Bus': 165.94,
        'Air_Or_Hsr': 134.86,  # is it ok to group HSR in too?
        'Walking': 0,
        'Bicycling': 0,
    }

    # Vectorised lookup instead of a row-wise DataFrame.apply: modes missing
    # from g_pkm get a factor of 0, grams are converted to kg, and the result
    # stays a well-formed (empty) Series even when filtering left no rows --
    # a case where the row-wise apply returns a DataFrame and the column
    # assignment fails.
    co2_kg_per_km = dataframe['predicted_mode_name'].map(g_pkm).fillna(0).astype(float) / 1000
    dataframe['co2_emission_kg_naive_naive'] = co2_kg_per_km * dataframe['distance_km']

    # verify that 'co2_emission_kg_naive_naive' column exists and contains data
    print(dataframe[['predicted_mode_name', 'distance_km', 'co2_emission_kg_naive_naive']].head())

    #
    # sensed mode to data user input confusion matrix
    #
    # temporary merged DataFrame: attaches the user-confirmed mode to each section
    temp_joined_df = pd.merge(
        dataframe,
        full_dataframe[['data_cleaned_trip', 'data_user_input_mode_confirm']],
        left_on=trip_id_column,
        right_on='data_cleaned_trip',
        how='left'
    )

    comparison_table = pd.crosstab(
        temp_joined_df['data_user_input_mode_confirm'],
        temp_joined_df['predicted_mode_name'],
        rownames=['User Input Mode'],
        colnames=['Predicted Mode'],
        normalize='index'  # Normalize by rows to show percentages
    ) * 100

    raw_comparison_table = pd.crosstab(
        temp_joined_df['data_user_input_mode_confirm'],
        temp_joined_df['predicted_mode_name'],
        rownames=['User Input Mode'],
        colnames=['Predicted Mode']
    )

    print("Raw Comparison Table:")
    print(raw_comparison_table)

    print("\nNormalized Comparison Table (%):")
    print(comparison_table)

    # calculate cumulative distance for each mode
    cumulative_distance = dataframe.groupby('predicted_mode_name')['distance_km'].sum().reset_index()
    cumulative_distance.columns = ['predicted_mode_name', 'cumulative_distance_km']

    # filter out modes with absolutely zero distance and exclude Tram and Unknown
    cumulative_distance_filtered = cumulative_distance[
        (cumulative_distance['cumulative_distance_km'] > 0) &
        (~cumulative_distance['predicted_mode_name'].isin(['Tram', 'Unknown']))
    ]

    plt.figure(figsize=(10, 7))
    plt.bar(
        cumulative_distance_filtered['predicted_mode_name'],
        cumulative_distance_filtered['cumulative_distance_km']
    )

    plt.xlabel('Mode of Transport', fontsize=22)
    plt.ylabel('Cumulative Distance (km)', fontsize=22)

    plt.xticks(fontsize=20, rotation=45)
    plt.yticks(fontsize=20)

    # fixed upper bound -- presumably so figures from different datasets share a scale
    plt.ylim(0, 140000)

    # remove scientific notation on the y-axis
    ax = plt.gca()
    ax.yaxis.set_major_formatter(mticker.ScalarFormatter())
    ax.yaxis.get_major_formatter().set_scientific(False)
    ax.yaxis.get_major_formatter().set_useOffset(False)

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists
    output_dir = 'figures/naivenaive'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(f'{output_dir}/{dataset_name.lower().replace(" ", "_")}_cumulative_distance_log.pdf')
    plt.show()
    plt.close()

    return dataframe



In [None]:
# Extract the inferred-sections archive once; an existing 'inferred' directory means it was already done.
if not os.path.isdir('inferred'):
    # the archive has to be obtained manually, so fail loudly when it is absent
    if not os.path.isfile('inferred_sections.zip'):
        raise ValueError("i need the zip with sections, ask TSDC")
    # unzip into the 'inferred' directory
    with zipfile.ZipFile('inferred_sections.zip', 'r') as zip_ref:
        zip_ref.extractall('inferred')
    print("Unzipped to 'inferred' directory.")

In [None]:
# path configuration
data_dir = "all_CanBikeCO"
# dictionary to store DataFrames with keys as dataset names
inferred_sections = {}


files_to_combine = [
    "inf_sec_pc.csv",
    "inf_sec_vail.csv",
    "inf_sec_4c.csv",
    "inf_sec_cc.csv",
    "inf_sec_sc.csv",
    "inf_sec_fc.csv",
]

# collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies every previously read row on each iteration
section_frames = []

for file_name in files_to_combine:
    file_path = os.path.join(data_dir, file_name)
    dataset_key = file_name.split('_')[-1].replace('.csv', '')  # e.g. "inf_sec_pc.csv" -> "pc"
    print('\nProcessing file:', file_path)

    # the python engine with on_bad_lines='skip' drops malformed rows instead of failing
    df = pd.read_csv(file_path, engine="python", on_bad_lines='skip')
    print(f'{len(df)} rows read from {file_name}')

    # store df in the dictionary
    inferred_sections[dataset_key] = df
    section_frames.append(df)

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index
inferred_canbike = pd.concat(section_frames, ignore_index=True)

print(f'\nTotal number of rows after merging: {len(inferred_canbike)}')

In [None]:
# NOTE(review): unlike the Bull and MassCEC cells, the returned frame is not
# assigned here, so inferred_canbike itself never gains the
# 'co2_emission_kg_naive_naive' column -- confirm this is intended
section_reader_naive_naive(inferred_canbike, "CanBikeCO", canbikeco)

In [None]:
# inferred sections for the Durham (Bull) dataset, from the extracted 'inferred' archive
bull_section = pd.read_csv('inferred/inferred_sections/analysis_inferred_section_DURHAM.csv')

In [None]:
# rebinds bull_section to the filtered copy that carries the added CO2 column
bull_section = section_reader_naive_naive(bull_section, "Bull eBike in Durham, NC", bull_loaded)

In [None]:
# inferred sections for the MassCEC dataset, from the extracted 'inferred' archive
mass_section = pd.read_csv('inferred/inferred_sections/analysis_inferred_section_MASS.csv')

In [None]:
# rebinds mass_section to the filtered copy that carries the added CO2 column
mass_section = section_reader_naive_naive(mass_section, "MassCEC", mass_loaded)

In [None]:
# preview the trip-level frame (mass_loaded) that mass_section is compared against below
mass_loaded.head()

In [None]:
def three_method_comparator(section_df: pd.DataFrame, full_df: pd.DataFrame, dataname: str, cumulative_co2_dict=None):
    """Compare total CO2 from the three estimation methods and save a bar chart.

    The three totals are labelled "App 2014" (sum of
    ``co2_emission_kg_naive_naive`` over matched sections), "Dashboard 2020"
    (sum of ``full_df['Mode_confirm_kg_CO2']``) and "CHEER" (sum of
    ``full_df['cheer_kg_co2']``). When a truthy ``cumulative_co2_dict`` is
    given, its 'naivenaive' / 'naive' / 'cheer' entries are used instead
    (missing keys count as 0).

    Also prints diagnostic counts: matched sections, summed distances, and how
    many trips in ``full_df`` have no section at all.

    Returns:
        dict with keys 'naivenaive', 'naive', 'cheer' holding the totals plotted.
    """
    print(' section before:', len(section_df))

    # pick whichever trip-ID column this section export uses
    print(section_df.columns)
    print(':)')
    if 'data_trip_id' in section_df.columns:
        id_col = 'data_trip_id'
    elif 'tripno' in section_df.columns:
        id_col = 'tripno'
    else:
        id_col = 'data_cleaned_trip'

    # sections whose trip is present in full_df
    belongs_to_known_trip = section_df[id_col].isin(full_df['data_cleaned_trip'])
    matched_sections = section_df[belongs_to_known_trip]

    # attach the trip-level mode and CO2 columns to every matched section
    trip_level_columns = full_df[['data_cleaned_trip', 'Mode_confirm_kg_CO2', 'Mode_confirm']]
    sections_joined = pd.merge(
        matched_sections,
        trip_level_columns,
        left_on=id_col,
        right_on='data_cleaned_trip',
        how='left',
    )
    print('number of sections matched:', len(matched_sections))

    print('distance sections', f"{sections_joined['data_distance'].sum():,}")
    print('distance full', f"{full_df['distance'].sum():,}")

    method_keys = ('naivenaive', 'naive', 'cheer')
    if cumulative_co2_dict:
        # caller supplied pre-aggregated totals
        totals = {key: cumulative_co2_dict.get(key, 0) for key in method_keys}
    else:
        # derive the totals from the frames themselves
        totals = {
            'naivenaive': sections_joined['co2_emission_kg_naive_naive'].sum(),
            'naive': full_df['Mode_confirm_kg_CO2'].sum(),
            'cheer': full_df['cheer_kg_co2'].sum(),
        }

    print(f"\nApp 2014: {totals['naivenaive']} kg")
    print(f"Dashboard 2020: {totals['naive']} kg")
    print(f"CHEER: {totals['cheer']} kg")

    # trips in full_df vs. trips that have at least one section
    all_trip_ids = set(full_df["data_cleaned_trip"].unique())
    has_known_trip = sections_joined[id_col].isin(all_trip_ids)
    matched_trip_ids = set(sections_joined.loc[has_known_trip, id_col].unique())
    trips_without_sections = all_trip_ids - matched_trip_ids

    print(f"Total trips in full_df: {len(all_trip_ids)}")
    print(f"Trips in full_df without any sections: {len(trips_without_sections)}")

    # one bar per method, in the same order as method_keys
    plt.figure(figsize=(6, 5))
    plt.bar(
        ["App 2014", "Dashboard 2020", "CHEER"],
        list(totals.values()),
        color=['#FFA07A', '#6495ED', '#3CB371'],
    )
    plt.xlabel("Emission Calculation Method")
    plt.ylabel("Total CO₂ Emissions (kg)")

    # thousands separators on the y-axis tick labels
    plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
    plt.tight_layout()

    # file-name slug: drop parentheses and commas, spaces -> underscores, lower-case
    slug = dataname.translate(str.maketrans({'(': None, ')': None, ' ': '_', ',': None})).lower()
    plt.savefig(f"{slug}_three_emissions_comparison.pdf", dpi=300, bbox_inches='tight')

    plt.show()

    return totals


In [None]:
# mass_section must already carry 'co2_emission_kg_naive_naive' (added by section_reader_naive_naive)
three_method_comparator(mass_section, mass_loaded, "MassCEC")

In [None]:
# bull_section must already carry 'co2_emission_kg_naive_naive' (added by section_reader_naive_naive)
three_method_comparator(bull_section, bull_loaded, "Bull (Durham, NC) eBike")

In [None]:
# # print(f"{full_data['data_distance'].sum():,}")
# three_method_comparator(inferred_canbike, canbikeco, "CanBikeCO")


In [None]:
# print(inferred_sections.keys())

In [None]:
# program name -> program code; the code is interpolated into per-program
# paths of the form CanBikeCO_2/ceo_<code>/... further down
lookupdictionary = {
    "sc": "sc_21",
    "boulder": "cc_21",
    "fortcollins": "fc_21",
    "pueblo": "pc_21",
    "durango": "4c_21",
    "vail": "vail_22",
}

# # Read and concatenate all datasets in a single line
# inferred_co_data = pd.concat(
#     (
#         pd.read_csv(f"all_CanBikeCO/inf_sec_{program_code}.csv", engine="python", on_bad_lines='skip')
#         .assign(program=program_code.upper())
#         for program_code in lookupdictionary.values()
#     ),
#     ignore_index=True
# )

# # Output the result for verification
# print(f"Total rows: {len(inferred_co_data)}")
# print(f"Unique users: {inferred_co_data['perno'].nunique()}")

In [None]:
# # Read and concatenate all datasets in a single line
# full_trip_co = pd.concat(
#     (
#         pd.read_csv(f"all_CanBikeCO/conf_trip_{program_code}.csv", engine="python", on_bad_lines='skip')
#         .assign(program=program_code.upper())
#         for program_code in lookupdictionary.values()
#     ),
#     ignore_index=True
# )

# # Output the result for verification
# print(f"Total trips: {len(full_trip_co)}")
# print(f"Unique users: {full_trip_co['perno'].nunique()}")

In [None]:
# print("Before processing:")
# print(f"Number of trips: {len(full_trip_co)}")
# print(f"Total distance: {full_trip_co['data_distance'].sum():,} m")

# # print("Columns in full_trip_co:")
# # print(full_trip_co.columns)

# num_missing_mode_confirm = full_trip_co['data_user_input_mode_confirm'].isna().sum()
# print(f"Number of trips with missing 'data_user_input_mode_confirm': {num_missing_mode_confirm}")


In [None]:
# full_trip_co


In [None]:


# canbikeco = await load_or_fetch_data("canbikeco", full_trip_co)

# print("After processing:")
# print(f"Number of trips: {len(canbikeco)}")
# print(f"Total distance: {canbikeco['distance'].sum()}")


In [None]:
# canbike_new_full = await load_or_fetch_data("co-full-new", full_trip_co) # this function is the issue

In [None]:
# three_method_comparator(inferred_canbike, canbike_new_full, "CanBikeCO") 

In [None]:
# # Update the keys to be uppercase and adjust 'VAIL_22' to 'VAIL'
# confirmed = {key.upper().replace('_22', ''): value for key, value in confirmed.items()}

# # Now `datasets` has keys in the desired format
# print(confirmed.keys())


In [None]:
cumulative_co2 = {
    'naivenaive': 0,
    'naive': 0,
    'cheer': 0,
}


for biggername in lookupdictionary:
#     current_section = inferred_sections[biggername]
#     full_csv = lookupdictionary[biggername]
    
        
    current_section = pd.read_csv(f"CanBikeCO_2/ceo_{lookupdictionary[biggername]}/analysis_inferred_section.csv", engine="python", on_bad_lines='skip')
    
    full_csv = pd.read_csv(f"CanBikeCO_2/ceo_{lookupdictionary[biggername]}/analysis_confirmed_trip.csv")
    
    if 'data_sensed_mode' not in current_section.columns:
        print('!'*100)
        print(biggername, 'does not have sensed mode')
        raise ValueError
    
    full_current_loaded = await load_or_fetch_data(f"co-{biggername}",
                                                 full_csv)

    print('!', biggername)
    
    section_naive = section_reader_naive_naive(current_section, biggername, full_csv)
    
    currentsum = section_naive['co2_emission_kg_naive_naive'].sum()

    
#     display(section_naive)
    print('?')

    returneddict = three_method_comparator(section_naive, full_current_loaded, biggername)
    cumulative_co2['naivenaive'] += returneddict['naivenaive']
    cumulative_co2['naive'] += returneddict['naive']
    cumulative_co2['cheer'] += returneddict['cheer']
#     print('sum for ', biggername, currentsum)

In [None]:
# lookupdictionary = {
#     "sc": "sc",
#     "boulder": "cc",
#     "fortcollins": "fc",
#     "pueblo": "pc",
#     "durango": "fc",
#     "vail": "vail",
# }

In [None]:
# totals accumulated over the per-program loop
print(cumulative_co2)
# passing cumulative_co2 makes the comparator plot these totals instead of
# recomputing them from inferred_canbike / canbikeco
three_method_comparator(inferred_canbike, canbikeco, "CanBikeCO", cumulative_co2)