In [None]:
import logging
import sys
from collections import defaultdict
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scaffolding
from plots import *

# Kept after the star import, as in the original ordering, so these aliases
# cannot be shadowed by anything `plots` exports.
import emcommon.diary.base_modes as emcb
import emcommon.metrics.footprint.util as emcfu
import emcommon.metrics.footprint.footprint_calculations as emcfc
import emcommon.util as emcommonutil

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


In [None]:
# Lookup tables shipped next to the notebook.
df_pur = pd.read_csv('auxiliary_files/purpose_labels.csv')
df_re = pd.read_csv('auxiliary_files/mode_labels.csv')
df_ei = pd.read_csv('auxiliary_files/energy_intensity.csv')

# Plain-dict views of those tables (later rows win on duplicate keys):
dic_pur = df_pur.set_index('purpose_confirm')['bin_purpose'].to_dict()  # raw purpose -> binned purpose
dic_re = df_re.set_index('replaced_mode')['mode_clean'].to_dict()       # raw mode -> binned mode
dic_fuel = df_ei.set_index('mode')['fuel'].to_dict()                    # mode -> fuel

In [None]:

# Wrap every lookup in a defaultdict so that Series.map() yields 'Other'
# for labels that are missing from the CSV tables.
dic_re, dic_pur, dic_fuel = (
    defaultdict(lambda: 'Other', lookup)
    for lookup in (dic_re, dic_pur, dic_fuel)
)

# Binned mode label that the filters and plots below focus on.
mode_of_interest = "E-bike"

In [None]:
#
# EMISSION PATCH: eGRID region name for 2021.
# Replaces emcfu.get_egrid_region with the implementation below.
# REMOVE WHEN emcommon IS UPDATED.
#

# Latest year for which an eGRID file is known to exist. It is set the first
# time a lookup for a later year fails. It is defined in this cell so the
# function's `global` read can never hit a NameError, whatever cells ran before.
latest_egrid_year = None


async def get_egrid_region(coords: list[float, float], year: int):
    """
    Get the eGRID region at the given coordinates in the year.

    coords: point to look up (presumably [longitude, latitude], matching how
            start_loc is built later in this notebook -- confirm against emcommon).
    year:   requested data year. Years before 2018 are looked up as 2018; years
            without a data file fall back one year at a time.

    Returns the region name (the feature's 'name' property, falling back to
    'SUBRGN'), or None when the 2018 file cannot be read or no region contains
    the point.
    """
    global latest_egrid_year
    # `Log` was never imported in this notebook, so the original Log.warn/Log.error
    # calls would raise NameError; the stdlib logger is used instead.
    if year < 2018:
        logging.warning(f"eGRID data not available for {year}. Using 2018.")
        return await emcfu.get_egrid_region(coords, 2018)
    if latest_egrid_year is not None and year > latest_egrid_year:
        # A previous call already found that this year has no data file.
        return await emcfu.get_egrid_region(coords, latest_egrid_year)
    try:
        geojson = await emcommonutil.read_json_resource(f"egrid{year}_subregions_5pct.json")
    except Exception:
        # `except Exception` (not a bare except) so task cancellation and
        # KeyboardInterrupt still propagate.
        if year > 2018:
            logging.warning(f"eGRID data not available for {year}. Trying {year-1}.")
            latest_egrid_year = year-1
            return await emcfu.get_egrid_region(coords, year-1)
        logging.error(f"eGRID lookup failed for {year}.")
        return None
    region_feature = emcfu.get_feature_containing_point(coords, geojson)
    if region_feature is not None:
        properties = region_feature['properties']
        region_name = properties.get('name') or properties.get('SUBRGN')
        if region_name:
            return region_name
    logging.warning(f"An eGRID region was not found for coords {coords} in year {year}.")
    return None

# Install the patch; the recursive emcfu.get_egrid_region calls above resolve to this function.
emcfu.get_egrid_region = get_egrid_region

In [None]:
def no_traceback_handler(exception_type, exception, traceback):
    """Report an exception on stderr as a single "<TypeName>: <message>" line.

    The signature matches an excepthook-style callback; the traceback argument
    is accepted but deliberately not printed.
    """
    # Relies on the notebook's top-level `import sys` (it was missing originally).
    print(f"{exception_type.__name__}: {exception}", file=sys.stderr)



# CASE 2 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
def unique_users(df):
    """Count distinct user_id values in df; 0 when df has no user_id column."""
    if "user_id" not in df.columns:
        return 0
    return len(df.user_id.unique())


def trip_label_count(s, df):
    """Count non-null entries of column s in df; 0 when the column is absent."""
    if s not in df.columns:
        return 0
    return len(df[s].dropna())


def add_energy_labels(expanded_ct, df_ei, dic_fuel):
    """Attach fuel, intensity and footprint columns for the confirmed mode.

    Inputs:
    expanded_ct = trip dataframe that already has Mode_confirm and distance_miles
    df_ei       = energy-intensity table handed to energy_intensity()
    dic_fuel    = mapping from mode label to fuel

    Returns the dataframe with Mode_confirm_fuel plus the columns written by
    energy_intensity(), energy_footprint_kWH() and CO2_footprint_lb().
    """
    mode_col = 'Mode_confirm'
    expanded_ct[mode_col + '_fuel'] = expanded_ct[mode_col].map(dic_fuel)
    expanded_ct = energy_intensity(expanded_ct, df_ei, mode_col)
    # Energy first, then CO2 -- same order as the columns were originally added.
    for footprint_fn in (energy_footprint_kWH, CO2_footprint_lb):
        expanded_ct = footprint_fn(expanded_ct, 'distance_miles', mode_col)
    return expanded_ct

def add_energy_impact(expanded_ct, df_ei, dic_fuel):
    """Compute energy and CO2 impact of each trip relative to its replaced mode.

    Footprints are first computed for the confirmed mode, then fuel and
    intensity factors are attached for the replaced mode, and finally the
    impact columns are derived from both.
    """
    # Everything for the confirmed mode.
    expanded_ct = add_energy_labels(expanded_ct, df_ei, dic_fuel)

    # Fuel and intensity factors for the replaced mode.
    replaced_col = 'Replaced_mode'
    expanded_ct[replaced_col + '_fuel'] = expanded_ct[replaced_col].map(dic_fuel)
    expanded_ct = energy_intensity(expanded_ct, df_ei, replaced_col)

    # Impacts (these also compute the replaced-mode footprints).
    for impact_fn in (energy_impact_kWH, CO2_impact_lb):
        expanded_ct = impact_fn(expanded_ct, 'distance_miles')
    return expanded_ct

def get_quality_text(before_df, after_df, mode_of_interest=None, include_test_users=False):
    """Build, print and return the data-quality caption used in plot titles.

    Inputs:
    before_df = dataframe prior to filtering (usually participant_ct_df)
    after_df = dataframe after filtering (usually expanded_ct)
    mode_of_interest = optional detail to include in the text string
    include_test_users = when True the first line says 'testers and participants'
                         instead of 'users'

    Returns the two-line caption. The percentage is NaN when before_df is empty.
    """
    # CASE 1 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
    n_before, n_after = len(before_df), len(after_df)
    after_pct = (n_after * 100) / n_before if n_before != 0 else np.nan
    interest_str = mode_of_interest + ' ' if mode_of_interest is not None else ''
    total_str = 'confirmed' if mode_of_interest is not None else ''
    user_str = 'testers and participants' if include_test_users else 'users'
    # Formatted in a single f-string pass: the original applied `%` formatting to
    # an f-string result, so a '%' inside mode_of_interest broke the formatting.
    quality_text = (
        f"Based on {n_after} confirmed {interest_str}trips from {unique_users(after_df)} {user_str}\n"
        f"of {n_before} total {total_str} trips from {unique_users(before_df)} users ({after_pct:.2f}%)"
    )
    print(quality_text)
    return quality_text



def data_quality_check(expanded_ct):
    '''Clean up inconsistent mode / replaced-mode label pairs (in place).

       1. Delete rows where mode_confirm is pilot_ebike and replaced_mode is pilot_ebike.
       2. Delete rows where mode_confirm is pilot_ebike and replaced_mode is same_mode.
       3. Replace same_mode with the row's mode_confirm for the energy impact calculation.

       Frames without a replaced_mode column are returned untouched.'''

    # TODO: This is only really required for the initial data collection around the minipilot;
    # in subsequent deployments "same mode" and "pilot_ebike" were removed from the options, so
    # the dataset does not contain these data quality issues.

    if 'replaced_mode' not in expanded_ct.columns:
        return expanded_ct

    # Steps 1 and 2: both rules share the mode_confirm test, so one mask covers them.
    confirmed_pilot = expanded_ct['mode_confirm'] == 'pilot_ebike'
    unusable_replacement = expanded_ct['replaced_mode'].isin(['pilot_ebike', 'same_mode'])
    rows_to_drop = expanded_ct.loc[confirmed_pilot & unusable_replacement].index
    expanded_ct.drop(rows_to_drop, inplace=True)

    # Step 3: whatever is still 'same_mode' takes the confirmed mode.
    is_same_mode = expanded_ct['replaced_mode'] == 'same_mode'
    expanded_ct['replaced_mode'] = np.where(is_same_mode, expanded_ct['mode_confirm'], expanded_ct['replaced_mode'])

    return expanded_ct

def unit_conversions(df):
    """Add a distance_miles column computed from the 'distance' column (meters).

    The frame is modified in place and also returned, like the other helpers
    in this notebook.
    """
    # Same meters-to-miles factor the notebook uses when it derives
    # distance_miles directly; the previous 0.00062 was about 0.2% low.
    meters_to_miles = 0.0006213712
    df['distance_miles'] = df["distance"] * meters_to_miles
    return df

def energy_intensity(trip_df,mode_intensity_df,col):
    """Attach per-mode intensity factors to the trips.

    Inputs:
    trip_df = dataframe with data
    mode_intensity_df = dataframe with a 'mode' column and the factor columns
                        'energy_intensity_factor', 'CO2_factor' and '(kWH)/trip'
    col = the trip column whose mode labels are looked up

    Adds 'ei_<col>', 'CO2_<col>' and 'ei_trip_<col>' to trip_df and returns it.
    Labels that do not appear in mode_intensity_df['mode'] map to NaN.
    """
    modes = mode_intensity_df['mode']
    # (output prefix, factor column) pairs, in the order the columns are added.
    factor_sources = (
        ('ei_', 'energy_intensity_factor'),
        ('CO2_', 'CO2_factor'),
        ('ei_trip_', '(kWH)/trip'),
    )
    # Build every lookup before touching trip_df, so a missing factor column
    # fails before any output column has been written.
    lookups = [
        (prefix, dict(zip(modes, mode_intensity_df[source])))
        for prefix, source in factor_sources
    ]
    for prefix, mode_to_factor in lookups:
        trip_df[prefix + col] = trip_df[col].map(mode_to_factor)
    return trip_df

def energy_footprint_kWH(df,distance_miles,col):
    """Compute the energy footprint of each trip for the mode in `col`.

    Inputs:
    df = dataframe with '<col>_fuel', 'ei_<col>' and 'ei_trip_<col>' columns
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Writes '<col>_EI(kWH)' and returns df. Rows whose fuel is not gasoline,
    diesel or electric get 0.
    """
    fuel = df[col + '_fuel']
    miles = df[distance_miles]
    intensity = df['ei_' + col]

    # Gasoline and diesel share one formula; the factor converts BTU to kWH
    # (1 BTU = 0.000293071 kWH).
    combustion_kwh = miles * intensity * 0.000293071
    # Electric modes add a per-trip term on top of the per-mile term.
    electric_kwh = (miles * intensity) + df['ei_trip_' + col]

    df[col + '_EI(kWH)'] = np.select(
        [fuel.isin(['gasoline', 'diesel']), fuel == 'electric'],
        [combustion_kwh, electric_kwh],
    )
    return df

def energy_impact_kWH(df,distance_miles):
    """Add 'Energy_Impact(kWH)': replaced-mode footprint minus confirmed-mode footprint.

    The confirmed-mode footprint is computed first if it is missing; the
    replaced-mode footprint is always (re)computed. The impact is rounded to
    three decimals. Returns df.
    """
    confirmed_kwh = 'Mode_confirm_EI(kWH)'
    replaced_kwh = 'Replaced_mode_EI(kWH)'

    if confirmed_kwh not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        df = energy_footprint_kWH(df, distance_miles, "Mode_confirm")
    df = energy_footprint_kWH(df, distance_miles, "Replaced_mode")

    df['Energy_Impact(kWH)'] = (df[replaced_kwh] - df[confirmed_kwh]).round(3)
    return df

def CO2_footprint_lb(df, distance_miles, col):
    """Compute the CO2 footprint of each trip for the mode in `col`.

    Inputs:
    df = dataframe with '<col>_fuel', 'ei_<col>', 'ei_trip_<col>' and 'CO2_<col>' columns
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Writes '<col>_lb_CO2' and returns df. Rows whose fuel is not gasoline,
    diesel or electric get 0.
    """
    fuel = df[col + '_fuel']
    miles = df[distance_miles]
    intensity = df['ei_' + col]
    co2_factor = df['CO2_' + col]

    # Gasoline and diesel share one formula.
    # NOTE(review): the 1e-6 and 1e-3 scale factors presumably match the units of
    # the CO2 factors in the energy-intensity table -- confirm against that file.
    combustion_lb = (miles * intensity * 0.000001) * co2_factor
    electric_lb = (((miles * intensity) + df['ei_trip_' + col]) * 0.001) * co2_factor

    df[col + '_lb_CO2'] = np.select(
        [fuel.isin(['gasoline', 'diesel']), fuel == 'electric'],
        [combustion_lb, electric_lb],
    )
    return df
    
def CO2_impact_lb(df,distance_miles):
    """Add 'CO2_Impact(lb)': replaced-mode CO2 footprint minus confirmed-mode footprint.

    The confirmed-mode footprint is computed first if it is missing; the
    replaced-mode footprint is always (re)computed. The impact is rounded to
    three decimals. Returns df.
    """
    confirmed_lb = 'Mode_confirm_lb_CO2'
    replaced_lb = 'Replaced_mode_lb_CO2'

    if confirmed_lb not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        df = CO2_footprint_lb(df, distance_miles, "Mode_confirm")
    df = CO2_footprint_lb(df, distance_miles, "Replaced_mode")

    df['CO2_Impact(lb)'] = (df[replaced_lb] - df[confirmed_lb]).round(3)
    return df

In [None]:
# Path configuration: folder (relative to the working directory) that the
# trip CSV below is read from.
to_data_folder = "PaperVizualizations/Data/abby_ceo/sc" #data folder, where composite data was written from the TSDC_data file

In [None]:
# Confirmed trips for the Smart Commute data set (the zip containing this CSV
# was taken from OneDrive).
# Inputs used previously, kept for reference:
#   df = pd.read_csv('viz_scripts/abby_ceo/sc/analysis_confirmed_trip.csv')
#   expanded_ct_2 = pd.read_csv(to_data_folder + "/tsdc_filtered_merged_trips.csv")
df = pd.read_csv(f"{to_data_folder}/analysis_confirmed_trip.csv")

In [None]:
# List the columns present in the raw trip export.
print(df.columns)

In [None]:
# We don't have demographic questions.

In [None]:
# Normalise column names. Names that are absent from this export are skipped
# (errors='ignore'), so the survey-question entries are harmless no-ops when
# the data has no demographic columns.
# NOTE(review): three spellings of the vehicle question all map to 'VEH'; if more
# than one were present the frame would end up with duplicate 'VEH' columns.
df = df.rename(
    columns={
        # identifiers
        'user_id_socio': 'user_id',
        # survey questions
        'please_identify_which_category_represents_your_total_household_': 'HHINC',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        ' how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul ': 'VEH',
        'in_which_year_were_you_born?': 'AGE',
        'including_yourself_how_many_people_live_in_your_home?': 'HHSIZE',
        'how_many_children_under_age_18_live_in_your_home?': 'CHILDREN',
        'what_is_your_gender?': 'GENDER',
        'if_you_were_unable_to_use_your_household_vehicles_which_of_the_': 'available_modes',
        'are_you_a_student?': 'STUDENT',
        # trip measurements
        'data_duration': 'duration',
        'data_distance': 'distance',
    },
    errors='ignore',
)

In [None]:
# Work on a copy of the raw export and append the cleaned (binned) labels.
# Labels missing from the lookup tables become 'Other' via the defaultdicts.
df_mapped = df.assign(
    # first, the cleaned mode
    Mode_confirm=df['data_user_input_mode_confirm'].map(dic_re),
    # second, the cleaned replaced mode (ASSUMES PROGRAM)
    Replaced_mode=df['data_user_input_replaced_mode'].map(dic_re),
    # third, the cleaned purpose
    Trip_purpose=df['data_user_input_purpose_confirm'].map(dic_pur),
)

In [None]:
# Day-resolution timestamp assembled from the local start year/month/day columns.
df_mapped = df_mapped.rename(columns={
    'data_start_local_dt_year': 'year',
    'data_start_local_dt_month': 'month',
    'data_start_local_dt_day': 'day',
})
df_mapped['date_time'] = pd.to_datetime(df_mapped[['year', 'month', 'day']])

# Demographic derivations, currently disabled:
#   age from birth year:                 df_mapped['AGE'] = 2022 - df_mapped['AGE']
#   workers (household size minus kids): df_mapped['WORKERS'] = df_mapped['HHSIZE'] - df_mapped['CHILDREN']

# Duration divided by 60.
# NOTE(review): presumably seconds -> minutes; the earlier comment said
# "hours to minutes", which a division by 60 cannot be -- confirm the source unit.
df_mapped['duration'] = df_mapped['duration'].div(60)

# Distance in miles (meters to miles).
df_mapped['distance_miles'] = df_mapped['distance'].mul(0.0006213712)

# E-bike / not E-bike indicator, currently disabled:
#   df_mapped['is_ebike'] = "E-Bike Trips"
#   df_mapped.loc[df_mapped['Mode_confirm']!="E-bike", 'is_ebike'] = "Non E-Bike Trips"

In [None]:
# Second name for the same DataFrame object (no copy is made here).
expanded_ct = df_mapped

In [None]:
# Energy and CO2 impacts are only computed when there is at least one trip;
# an empty frame is left as it is.
if len(expanded_ct) > 0:
    expanded_ct = add_energy_impact(expanded_ct, df_ei, dic_fuel)

In [None]:
# Inspect the trips whose cleaned mode is E-bike.
expanded_ct.loc[expanded_ct['Mode_confirm'].eq('E-bike')]
# expanded_ct.head(30)

In [None]:
# Distinct cleaned mode labels present in the data.
expanded_ct['Mode_confirm'].unique()

In [None]:
# Record how many trips use the mode of interest, and how many of those carry
# a replaced-mode label.
# The cleaned labels live in 'Mode_confirm' (capital M); the lowercase
# 'mode_confirm' tested before is never created in this notebook, so this cell
# was silently skipped.
if 'Mode_confirm' in expanded_ct.columns:
    mode_of_interest_df = expanded_ct[expanded_ct['Mode_confirm'] == mode_of_interest]
    # debug_df is not created anywhere else in this notebook; start an empty one
    # so the assignments below cannot raise NameError.
    # NOTE(review): the single 'count' column is an assumption -- adjust if a
    # shared debug_df with a different layout is introduced.
    if 'debug_df' not in globals():
        debug_df = pd.DataFrame(columns=['count'])
    debug_df.loc[f"{mode_of_interest}_trips"] = len(mode_of_interest_df)
    # Uses the trip_label_count helper defined earlier in this notebook.
    debug_df.loc[f"{mode_of_interest}_trips_with_replaced_mode"] = trip_label_count("Replaced_mode", mode_of_interest_df)

In [None]:
# CASE 2 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
# Trips made with the mode of interest; the whole frame when no cleaned mode
# column exists.
if "Mode_confirm" in expanded_ct.columns:
    data_eb = expanded_ct[expanded_ct['Mode_confirm'] == mode_of_interest]
else:
    data_eb = expanded_ct

In [None]:
# Caption comparing the mode-of-interest trips (data_eb) against all trips
# (expanded_ct); it is appended to the plot title below.
quality_text = get_quality_text(expanded_ct, data_eb, mode_of_interest)

In [None]:
# ebei: e-bike energy impact, aggregated per replaced mode.
plot_title_no_quality = f"Sketch of Energy Impact of {mode_of_interest} trips"
file_name = f'sketch_energy_impact_{mode_of_interest}%s'

# Total and mean impact per replaced mode, largest total first.
ebei = (
    data_eb.groupby('Replaced_mode')['Energy_Impact(kWH)']
    .agg(['sum', 'mean'])
    .rename(columns={
        'sum': 'Sketch of Total Energy_Impact(kWH)',
        'mean': 'Sketch of Average Energy_Impact(kWH)',
    })
    .reset_index()
    .sort_values(by=['Sketch of Total Energy_Impact(kWH)'], ascending=False)
)
# True where replacing that mode saved energy overall; passed on as the bar colour.
ebei['boolean'] = ebei['Sketch of Total Energy_Impact(kWH)'].gt(0)
net_energy_saved = round(sum(ebei['Sketch of Total Energy_Impact(kWH)']), 2)

x = ebei['Sketch of Total Energy_Impact(kWH)']
y = ebei['Replaced_mode']
color = ebei['boolean']

plot_title = (
    plot_title_no_quality
    + f"\n Contribution by replaced mode towards a total of {net_energy_saved}(kWH)\n"
    + quality_text
)
energy_impact(x, y, color, plot_title, file_name)
alt_text = store_alt_text_bar(pd.DataFrame(x.values, y), file_name, plot_title)

In [None]:
# Module-level state for the footprint lookups.
# latest_egrid_year is read and updated (via `global`) by the get_egrid_region
# patch that an earlier cell installs into emcfu; resetting it here makes the
# next lookup probe the data files again.
latest_egrid_year = None
# NOTE(review): latest_ntd_year is not read by any code visible in this notebook.
latest_ntd_year = None

# get_egrid_region is intentionally not redefined in this cell. The copy that
# emcommon actually calls is the one assigned to emcfu.get_egrid_region earlier;
# a second definition here was never installed and only shadowed the
# notebook-level name with a near-identical body.

In [None]:
# emcb.BASE_MODES['pilot_ebike'] = emcb.BASE_MODES['E_BIKE']
# print(emcb.BASE_MODES['pilot_ebike'])
#
# the default json does not use pilot_ebike, so manually go in and add it to that json.
#
default_json = await emcfu.read_json_resource('label-options.default.json')
default_json['MODE'].append({"value":"pilot_ebike", "base_mode":"E_BIKE"})
# print(default_json)

In [None]:
# print(expanded_ct['Mode_confirm'].unique())

In [None]:
# Keep only trips that carry both a confirmed mode and a replaced mode label,
# and discard entries the user marked as 'Not a Trip'.
# The explicit .copy() makes the result an independent frame: the footprint
# loop further down writes into it with .loc, which on a filtered slice
# triggers pandas' SettingWithCopyWarning.
expanded_ct = (
    expanded_ct
    .dropna(subset=['data_user_input_mode_confirm', 'data_user_input_replaced_mode'])
    .loc[lambda trips: trips['Mode_confirm'] != 'Not a Trip']
    .copy()
)

# expanded_ct['data_user_input_mode_confirm'] = expanded_ct['data_user_input_mode_confirm'].replace('pilot_ebike', 'e-bike')


print(expanded_ct['data_user_input_mode_confirm'].unique())

In [None]:
async def get_commute_data(row, labels):
    """Compute the footprint of one trip and of the mode it replaced.

    row    = one trip record; read by key for '_id', 'distance',
             'data_start_fmt_time', the start longitude/latitude and the two
             user-input labels
    labels = label options passed through to emcfc.calc_footprint_for_trip

    Returns {'footprint': ..., 'replaced_footprint': ...}, each being whatever
    emcfc.calc_footprint_for_trip returns.
    """
    # NOTE(review): the trip's start location is used for the lookup -- double
    # check whether production uses the start or the end location.
    start_coordinates = [row['data_start_loc_longitude'], row['data_start_loc_latitude']]  # long, lat
    user_input = {
        "mode_confirm": row['data_user_input_mode_confirm'],
        "replaced_mode_confirm": row['data_user_input_replaced_mode'],
    }
    trip_object = {
        "_id": row['_id'],
        "distance": row['distance'],
        "start_fmt_time": row['data_start_fmt_time'],
        "start_loc": {"coordinates": start_coordinates},
        "user_input": user_input,
    }

    # Same trip, evaluated once for the confirmed mode and once for the replaced mode.
    footprint = await emcfc.calc_footprint_for_trip(trip_object, labels)
    replaced_footprint = await emcfc.calc_footprint_for_trip(trip_object, labels, 'replaced_mode')
    return {
        'footprint': footprint,
        'replaced_footprint': replaced_footprint,
    }



for index, row in expanded_ct.iterrows():
    # change the base mode and the value
    # derived from user inputted label
    # dervied from 'drove alone' and take that user label and
    # look up what is associated. 
    #
    # get_rich_mode_for_value(value: str, label_options: dict)
    # use the column data_user_input_mode_confirm with the above.
    # {"value":"drove_alone", "base_mode":"CAR", "passengers": 1 }
    #
 
    commute_data = await get_commute_data(row, default_json, )
#     print(type(commute_data['footprint'])[0])
    if ('kg_co2' in commute_data['footprint'][0]) and ('kwh' in commute_data['footprint'][0]):
        expanded_ct.loc[index, 'cheer_kg_co2'] = commute_data['footprint'][0]['kg_co2']
        expanded_ct.loc[index, 'cheer_kwh'] = commute_data['footprint'][0]['kwh']
        expanded_ct.loc[index, 'cheer_replaced_kg_co2'] = commute_data['replaced_footprint'][0]['kg_co2']
        expanded_ct.loc[index, 'cheer_replaced_kwh'] = commute_data['replaced_footprint'][0]['kwh']
        
        for uncertain_column in ['kg_co2_uncertain', 'kwh_uncertain']:
            if uncertain_column in commute_data['replaced_footprint'][0]:
                expanded_ct.loc[index, f'cheer_replaced_{uncertain_column}'] = commute_data['replaced_footprint'][0][uncertain_column]
            
        
        expanded_ct.loc[index, 'Mode_confirm_kg_CO2'] = row['Mode_confirm_lb_CO2'] * 0.453592

#     else:
#         raise ValueError

    pprint(commute_data)


    #calculate energy and emissions saved
#     row["cheer_e_saved"] = row["work_car"][0]["kwh"] - x["work_ecar"][0]["kwh"], axis = 1)
#     row["cheer_co2_saved"] = test_df.apply(lambda x : x["work_car"][0]["kg_co2"] - x["work_ecar"][0]["kg_co2"], axis = 1)

#     break



In [None]:
# Preview the first 30 trips, including the footprint columns added above.
expanded_ct.head(30)

In [None]:
# print(expanded_ct.head(3).to_string())

In [None]:
# Remove 'Air' and 'Other' from Mode_confirm, because they dwarf the other
# modes and make the rest of the chart unreadable.
expanded_ct_filtered = expanded_ct[~expanded_ct['Mode_confirm'].isin(['Air', 'Other'])]

# Mean CO2 per mode for the naive estimate and for the CHEER estimate.
grouped_data_filtered = expanded_ct_filtered.groupby('Mode_confirm')[['Mode_confirm_kg_CO2', 'cheer_kg_co2']].mean()

# Two bars per mode: Mode_confirm_kg_CO2 (naive) and cheer_kg_co2.
# The axes are created once and handed to pandas; calling plt.figure() and then
# DataFrame.plot() without `ax` opened a second figure and left the first blank.
fig, ax = plt.subplots(figsize=(10, 6))
grouped_data_filtered.plot(kind='bar', ax=ax, color=['lightblue', 'orange'])

ax.set_title('Comparison of CO2 Emissions for Each Mode of Transportation')
ax.set_ylabel('Average CO2 Emissions (kg)')
ax.set_xlabel('Mode of Transportation')

ax.legend(['Naive Calculation (Mode_confirm_kg_CO2)', 'CHEER Calculation (cheer_kg_co2)'], loc='upper right')

# show plot
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()
