In [None]:
%pip install git+https://github.com/JGreenlee/e-mission-common.git


In [None]:
import pandas as pd
import scaffolding
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import asyncio
import zipfile
import tarfile

from pprint import pprint

from plots import *


import emcommon.diary.base_modes as emcb
import emcommon.metrics.footprint.util as emcfu
import emcommon.metrics.footprint.footprint_calculations as emcfc
import emcommon.util as emcommonutil

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)


In [None]:
# Auxiliary tables: purpose labels, mode labels, and per-mode energy-intensity factors
df_pur = pd.read_csv(r'auxiliary_files/purpose_labels.csv')
df_re = pd.read_csv(r'auxiliary_files/mode_labels.csv')
df_ei = pd.read_csv(r'auxiliary_files/energy_intensity.csv')

# Plain-dict lookups built column-to-column from the tables above
# (if a key repeats in a CSV, the last row wins):
dic_pur = dict(zip(df_pur['purpose_confirm'],df_pur['bin_purpose'])) # bin purpose
dic_re  = dict(zip(df_re['replaced_mode'],df_re['mode_clean'])) # bin modes
dic_fuel = dict(zip(df_ei['mode'],df_ei['fuel'])) # mode -> fuel

In [None]:

# Wrap each lookup in a defaultdict so that any key absent from the CSV-derived
# dict resolves to 'Other' instead of raising KeyError
dic_re = defaultdict(lambda: 'Other',dic_re)
dic_pur = defaultdict(lambda: 'Other',dic_pur)
dic_fuel = defaultdict(lambda: 'Other',dic_fuel)

# Cleaned mode label that calculate_naive_and_cheer filters on for its quality text and sketch
mode_of_interest = "E-bike"

In [None]:
def no_traceback_handler(exception_type, exception, traceback):
    """Report an exception as a single ``Type: message`` line on stderr.

    The three parameters mirror the ``sys.excepthook`` call signature. The
    ``traceback`` argument is accepted but deliberately not printed.
    """
    # Imported here because the notebook's import cell never brings in `sys`;
    # without it the print below raised NameError instead of reporting the error.
    import sys

    print("%s: %s" % (exception_type.__name__, exception), file=sys.stderr)



# CASE 2 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
def unique_users(df):
    """Count the distinct ``user_id`` values in ``df``; 0 when the column is absent."""
    if "user_id" not in df.columns:
        return 0
    return len(df.user_id.unique())


def trip_label_count(s, df):
    """Count the non-null entries of column ``s`` in ``df``; 0 when the column is absent."""
    if s not in df.columns:
        return 0
    return len(df[s].dropna())


def add_energy_labels(expanded_ct, df_ei, dic_fuel):
    """Add fuel, intensity and footprint columns for the confirmed mode.

    Inputs:
    expanded_ct = dataframe of trips that already has Mode_confirm (and distance_miles)
    df_ei = energy-intensity table passed on to energy_intensity
    dic_fuel = mapping from cleaned mode to fuel type

    Returns the dataframe produced by the last step (CO2_footprint_lb).
    """
    # The fuel column has to exist before the footprint helpers run,
    # because they branch on '<col>_fuel'.
    expanded_ct['Mode_confirm_fuel'] = expanded_ct['Mode_confirm'].map(dic_fuel)
    with_intensity = energy_intensity(expanded_ct, df_ei, 'Mode_confirm')
    with_energy = energy_footprint_kWH(with_intensity, 'distance_miles', 'Mode_confirm')
    return CO2_footprint_lb(with_energy, 'distance_miles', 'Mode_confirm')

def add_energy_impact(expanded_ct, df_ei, dic_fuel):
    """Compute confirmed-mode footprints, then the impact relative to the replaced mode.

    Inputs are the same as for add_energy_labels; expanded_ct must also carry
    Replaced_mode. Returns the dataframe with the Energy_Impact(kWH) and
    CO2_Impact(lb) columns added by the impact helpers.
    """
    # Everything for the confirmed mode first ...
    labelled = add_energy_labels(expanded_ct, df_ei, dic_fuel)
    # ... then the fuel and intensity columns for the replaced mode ...
    labelled['Replaced_mode_fuel'] = labelled['Replaced_mode'].map(dic_fuel)
    labelled = energy_intensity(labelled, df_ei, 'Replaced_mode')
    # ... and finally the impacts (replaced minus confirmed).
    with_energy_impact = energy_impact_kWH(labelled, 'distance_miles')
    return CO2_impact_lb(with_energy_impact, 'distance_miles')

def get_quality_text(before_df, after_df, mode_of_interest=None, include_test_users=False):
    """Build, print and return the two-line data-quality caption.

    Inputs:
    before_df = dataframe prior to filtering (usually participant_ct_df)
    after_df = dataframe after filtering (usually expanded_ct)
    mode_of_interest = optional detail to include in the text string
    include_test_users = when True the first line says 'testers and participants'
                         instead of 'users'

    Returns the caption string (also printed).
    """
    # CASE 1 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
    n_after, n_before = len(after_df), len(before_df)
    # An empty "before" frame yields NaN rather than a ZeroDivisionError.
    after_pct = (n_after * 100) / n_before if n_before != 0 else np.nan

    # The optional fragments carry their own trailing space so that omitting
    # them leaves no double space in the caption.
    interest_str = f"{mode_of_interest} " if mode_of_interest is not None else ''
    total_str = 'confirmed ' if mode_of_interest is not None else ''
    user_str = 'testers and participants' if include_test_users else 'users'

    # A single f-string: the previous f-string + %-format mix broke whenever
    # mode_of_interest itself contained a '%'.
    quality_text = (
        f"Based on {n_after} confirmed {interest_str}trips from {unique_users(after_df)} {user_str}\n"
        f"of {n_before} total {total_str}trips from {unique_users(before_df)} users ({after_pct:.2f}%)"
    )
    print(quality_text)
    return quality_text



def data_quality_check(expanded_ct):
    '''Clean up pilot_ebike / same_mode label combinations, in place.

       1. Delete rows where mode_confirm was pilot_ebike and replaced_mode was pilot_ebike.
       2. Delete rows where mode_confirm was pilot_ebike and replaced_mode was same_mode.
       3. Replace same_mode with the mode_confirm value for the energy impact calculation.

       The frame is modified in place and also returned. Frames without a
       replaced_mode column are returned untouched.'''

    # TODO: This is only really required for the initial data collection around the minipilot
    # in subsequent deployes, we removed "same mode" and "pilot_ebike" from the options, so the
    # dataset did not contain of these data quality issues

    if 'replaced_mode' not in expanded_ct.columns:
        return expanded_ct

    # Rules 1 and 2 share the mode_confirm condition, so one mask covers both.
    pilot_confirmed = expanded_ct['mode_confirm'] == 'pilot_ebike'
    unusable_replacement = expanded_ct['replaced_mode'].isin(['pilot_ebike', 'same_mode'])
    expanded_ct.drop(expanded_ct[pilot_confirmed & unusable_replacement].index, inplace=True)

    # Rule 3: whatever "same_mode" is left stands for the confirmed mode itself.
    is_same_mode = expanded_ct['replaced_mode'] == 'same_mode'
    expanded_ct['replaced_mode'] = np.where(is_same_mode, expanded_ct['mode_confirm'], expanded_ct['replaced_mode'])

    return expanded_ct

def unit_conversions(df):
    """Add a ``distance_miles`` column converted from the ``distance`` column.

    ``df`` is modified in place and also returned, matching the other
    transform helpers in this notebook (the previous version returned None).
    """
    # meters to miles; same factor as calculate_naive_and_cheer uses, replacing
    # the truncated 0.00062 that under-reported distances by about 0.2%
    df['distance_miles'] = df["distance"] * 0.0006213712
    return df

def energy_intensity(trip_df,mode_intensity_df,col):
    """Attach per-mode intensity factors to the trips, keyed on ``col``.

    Inputs:
    trip_df = dataframe with data (gains ei_<col>, CO2_<col>, ei_trip_<col>)
    mode_intensity_df = dataframe with a 'mode' column plus the factor columns
    col = the trip column whose values are looked up against 'mode'

    Modes with no match in mode_intensity_df map to NaN. trip_df is modified
    in place and returned; mode_intensity_df is left unchanged.
    """
    modes = mode_intensity_df['mode']
    factor_sources = (
        ('ei_', 'energy_intensity_factor'),
        ('CO2_', 'CO2_factor'),
        ('ei_trip_', '(kWH)/trip'),
    )
    # Build every lookup before touching trip_df, so a missing factor column
    # fails before any output column has been written.
    lookups = [(prefix, dict(zip(modes, mode_intensity_df[source])))
               for prefix, source in factor_sources]

    for prefix, lookup in lookups:
        trip_df[prefix + col] = trip_df[col].map(lookup)
    return trip_df

def energy_footprint_kWH(df,distance_miles,col):
    """Add ``<col>_EI(kWH)``: the energy footprint of each trip for mode column ``col``.

    Inputs:
    df = dataframe with data (needs <col>_fuel, ei_<col>, ei_trip_<col>)
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Rows whose fuel is not gasoline, diesel or electric get 0.
    """
    btu_to_kwh = 0.000293071 # 1 BTU = 0.000293071 kWH
    fuel = df[col+'_fuel']
    distance_times_intensity = df[distance_miles]*df['ei_'+col]

    # gasoline and diesel share one formula: the product is scaled BTU -> kWH
    combustion_kwh = distance_times_intensity*btu_to_kwh
    # electric: the product is used as-is, plus the fixed per-trip term
    electric_kwh = distance_times_intensity + df['ei_trip_'+col]

    df[col+'_EI(kWH)'] = np.select(
        [fuel == 'gasoline', fuel == 'diesel', fuel == 'electric'],
        [combustion_kwh, combustion_kwh, electric_kwh],
    )
    return df

def energy_impact_kWH(df,distance_miles):
    """Add ``Energy_Impact(kWH)`` = replaced-mode footprint minus confirmed-mode footprint.

    The replaced-mode footprint is always (re)computed; the confirmed-mode
    footprint is computed only when its column is missing. The difference is
    rounded to 3 decimals.
    """
    modes_to_compute = ["Replaced_mode"]
    if 'Mode_confirm_EI(kWH)' not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        modes_to_compute.insert(0, "Mode_confirm")

    for mode_col in modes_to_compute:
        df = energy_footprint_kWH(df, distance_miles, mode_col)

    difference = df['Replaced_mode_EI(kWH)'] - df['Mode_confirm_EI(kWH)']
    df['Energy_Impact(kWH)'] = difference.round(3)
    return df

def CO2_footprint_lb(df, distance_miles, col):
    """Add ``<col>_lb_CO2``: the CO2 footprint of each trip for mode column ``col``.

    Inputs:
    df = dataframe with data (needs <col>_fuel, ei_<col>, ei_trip_<col>, CO2_<col>)
    distance_miles = name of the column holding the distance in miles
    col = Replaced_mode or Mode_confirm

    Rows whose fuel is not gasoline, diesel or electric get 0.
    """
    fuel = df[col+'_fuel']
    co2_factor = df['CO2_'+col]
    distance_times_intensity = df[distance_miles]*df['ei_'+col]

    # gasoline and diesel share one formula (scaled by 1e-6 before the CO2 factor)
    combustion_lb = (distance_times_intensity*0.000001)*co2_factor
    # electric adds the fixed per-trip term and is scaled by 1e-3 instead
    electric_lb = ((distance_times_intensity+df['ei_trip_'+col])*0.001)*co2_factor

    df[col+'_lb_CO2'] = np.select(
        [fuel == 'gasoline', fuel == 'diesel', fuel == 'electric'],
        [combustion_lb, combustion_lb, electric_lb],
    )
    return df
    
def CO2_impact_lb(df,distance_miles):
    """Add ``CO2_Impact(lb)`` = replaced-mode CO2 minus confirmed-mode CO2.

    The replaced-mode footprint is always (re)computed; the confirmed-mode
    footprint is computed only when its column is missing. The difference is
    rounded to 3 decimals.
    """
    modes_to_compute = ["Replaced_mode"]
    if 'Mode_confirm_lb_CO2' not in df.columns:
        print("Mode confirm footprint not found, computing before impact")
        modes_to_compute.insert(0, "Mode_confirm")

    for mode_col in modes_to_compute:
        df = CO2_footprint_lb(df, distance_miles, mode_col)

    difference = df['Replaced_mode_lb_CO2'] - df['Mode_confirm_lb_CO2']
    df['CO2_Impact(lb)'] = difference.round(3)
    return df

In [None]:
# Path configuration for the Smart Commute dataset; the next cell reads
# analysis_confirmed_trip.csv from this folder
to_data_folder = "PaperVizualizations/Data/abby_ceo/sc" #data folder, where composite data was written from the TSDC_data file

In [None]:
# df = pd.read_csv('viz_scripts/abby_ceo/sc/analysis_confirmed_trip.csv')
# We are using Smart Commute data; the zip containing this CSV was taken from OneDrive.
df = pd.read_csv(to_data_folder + "/analysis_confirmed_trip.csv")
# expanded_ct_2=pd.read_csv(to_data_folder + "/tsdc_filtered_merged_trips.csv")

In [None]:
# List the raw column names of the Smart Commute trip table
print(df.columns)

In [None]:
# Inspect the raw-label -> cleaned-mode lookup
# (pprint is also imported in the notebook's import cell)
from pprint import pprint
pprint(dic_re)

In [None]:
def _prepare_trip_columns(passed_df: pd.DataFrame) -> pd.DataFrame:
    """Return a renamed copy of ``passed_df`` with cleaned labels, a date and unit columns."""
    # we dont have demographic questions.
    # `rename` without `inplace` returns a new frame, so the caller's frame is
    # left untouched (it used to have its columns renamed as a side effect).
    df_mapped = passed_df.rename(columns={
        'user_id_socio': 'user_id',
        'please_identify_which_category_represents_your_total_household_': 'HHINC',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        ' how_many_motor_vehicles_are_owned_leased_or_available_for_regul': 'VEH',
        'how_many_motor_vehicles_are_owned_leased_or_available_for_regul ': 'VEH',
        'in_which_year_were_you_born?': 'AGE',
        'including_yourself_how_many_people_live_in_your_home?': 'HHSIZE',
        'how_many_children_under_age_18_live_in_your_home?': 'CHILDREN',
        'what_is_your_gender?': 'GENDER',
        'if_you_were_unable_to_use_your_household_vehicles_which_of_the_': 'available_modes',
        'are_you_a_student?': 'STUDENT',
        'data_duration': 'duration',
        'data_distance': 'distance'
    }, errors='ignore')

    #first, add the cleaned mode
    df_mapped['Mode_confirm'] = df_mapped['data_user_input_mode_confirm'].map(dic_re)

    #second, add the cleaned replaced mode ASSUMES PROGRAM
    df_mapped['Replaced_mode'] = df_mapped['data_user_input_replaced_mode'].map(dic_re)

    #third, add the cleaned purpose
    df_mapped['Trip_purpose'] = df_mapped['data_user_input_purpose_confirm'].map(dic_pur)

    # Get timestamp from known year/month/day aggregated to days
    df_mapped = df_mapped.rename(columns={'data_start_local_dt_year':'year','data_start_local_dt_month':'month','data_start_local_dt_day':'day'})
    df_mapped['date_time'] = pd.to_datetime(df_mapped[['year','month','day']])

    # Duration in minutes: the raw value is divided by 60
    # (presumably it is stored in seconds -- TODO confirm against the data dictionary)
    df_mapped['duration'] = df_mapped['duration'] / 60

    # distance in miles (meters to miles)
    df_mapped['distance_miles'] = df_mapped['distance'] * 0.0006213712

    return df_mapped


def _store_energy_impact_sketch(data_eb: pd.DataFrame, quality_text: str) -> None:
    """Aggregate the naive energy impact by replaced mode and store the sketch's alt text."""
    # ebei : ebike energy impact
    plot_title_no_quality=f"Sketch of Energy Impact of {mode_of_interest} trips"
    file_name =f'sketch_energy_impact_{mode_of_interest}%s'

    ebei = data_eb.groupby('Replaced_mode').agg({'Energy_Impact(kWH)': ['sum', 'mean']})
    ebei.columns = ['Sketch of Total Energy_Impact(kWH)', 'Sketch of Average Energy_Impact(kWH)']
    ebei = ebei.reset_index()
    ebei = ebei.sort_values(by=['Sketch of Total Energy_Impact(kWH)'], ascending=False)
    net_energy_saved = round(sum(ebei['Sketch of Total Energy_Impact(kWH)']), 2)

    x = ebei['Sketch of Total Energy_Impact(kWH)']
    y = ebei['Replaced_mode']

    plot_title= plot_title_no_quality+f"\n Contribution by replaced mode towards a total of {net_energy_saved}(kWH)\n"+quality_text
    # Only the alt text is stored; the bar chart itself is not drawn here.
    store_alt_text_bar(pd.DataFrame(x.values,y), file_name, plot_title)


def _footprint_trip_input(row) -> dict:
    """Build the trip dict handed to ``emcfc.calc_footprint_for_trip`` from one trip row."""
    return {
        "_id": row['_id'],
        "distance": row['distance'],
        "start_fmt_time": row['data_start_fmt_time'],
        # long, lat -- the trip's start location is used
        "start_loc": {"coordinates": [row['data_start_loc_longitude'], row['data_start_loc_latitude']]},
        "user_input": {"mode_confirm": row['data_user_input_mode_confirm'],
                       "replaced_mode_confirm": row['data_user_input_replaced_mode']
                      }
    }


async def calculate_naive_and_cheer(passed_df: pd.DataFrame):
    """Attach naive and CHEER footprint estimates to a table of confirmed trips.

    Steps, in order:
      1. Copy ``passed_df`` with normalized column names, cleaned mode /
         replaced-mode / purpose labels (``dic_re``, ``dic_pur``), a
         ``date_time`` column, ``duration`` divided by 60 and ``distance_miles``.
      2. Add the naive energy / CO2 columns via ``add_energy_impact``
         (skipped for an empty table).
      3. Print the data-quality text and store the alt text of the
         energy-impact sketch for ``mode_of_interest`` trips.
      4. Drop trips without a mode or replaced-mode label and trips cleaned to
         'Not a Trip', then call ``emcfc.calc_footprint_for_trip`` for every
         remaining trip and write the results into ``cheer_*`` columns.

    Reads the notebook globals ``dic_re``, ``dic_pur``, ``dic_fuel``, ``df_ei``
    and ``mode_of_interest``.

    Parameters:
        passed_df (pd.DataFrame): confirmed-trip table with the ``data_*``
            columns used below. It is not modified.

    Returns:
        pd.DataFrame: the labelled trips, re-indexed from 0 (the previous index
        is kept as a column by ``reset_index``), with ``Mode_confirm_kg_CO2``
        and, where the footprint call returned them, the ``cheer_*`` columns.
    """
    expanded_ct = _prepare_trip_columns(passed_df)
    if len(expanded_ct) > 0:
        expanded_ct = add_energy_impact(expanded_ct, df_ei, dic_fuel)

    # Optional bookkeeping: only runs when the notebook defines a `debug_df`
    # global. Referencing it unconditionally raised NameError as soon as the
    # trips carried a `mode_confirm` column.
    debug_target = globals().get('debug_df')
    if debug_target is not None and 'mode_confirm' in expanded_ct.columns:
        mode_of_interest_df = expanded_ct.query(f"mode_confirm == '{mode_of_interest}'")
        debug_target.loc[f"{mode_of_interest}_trips"] = len(mode_of_interest_df)
        debug_target.loc[f"{mode_of_interest}_trips_with_replaced_mode"] = scaffolding.trip_label_count("Replaced_mode", mode_of_interest_df)

    # CASE 2 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867
    data_eb = expanded_ct.query(f"Mode_confirm == '{mode_of_interest}'") if "Mode_confirm" in expanded_ct.columns else expanded_ct

    quality_text = get_quality_text(expanded_ct, data_eb, mode_of_interest)

    # The impact column is absent when the table was empty and
    # add_energy_impact was skipped; there is nothing to sketch in that case.
    if 'Energy_Impact(kWH)' in data_eb.columns:
        _store_energy_impact_sketch(data_eb, quality_text)

    #
    # the default json does not use pilot_ebike, so manually go in and add it to that json.
    #
    default_json = await emcfu.read_json_resource('label-options.default.json')
    default_json['MODE'].append({"value":"pilot_ebike", "base_mode":"E_BIKE"})

    # Both labels are required for the footprint calls below.
    expanded_ct = expanded_ct.dropna(subset=['data_user_input_mode_confirm', 'data_user_input_replaced_mode'])
    expanded_ct = expanded_ct[expanded_ct['Mode_confirm'] != 'Not a Trip']

    print(expanded_ct['data_user_input_mode_confirm'].unique())

    # The per-row `.at[index, ...]` writes below need unique index labels
    # (frames concatenated from several programs repeat them), so re-index.
    print('unique or not?')
    expanded_ct = expanded_ct.reset_index()
    print(expanded_ct.index.is_unique)

    # Naive footprint converted from lb to kg, for the whole column at once.
    if 'Mode_confirm_lb_CO2' in expanded_ct.columns:
        expanded_ct['Mode_confirm_kg_CO2'] = expanded_ct['Mode_confirm_lb_CO2'] * 0.453592

    async def get_commute_data(row, labels):
        """Return the footprint results for the confirmed and the replaced mode of one trip."""
        trip_object = _footprint_trip_input(row)
        footprint = await emcfc.calc_footprint_for_trip(trip_object, labels)
        replaced_footprint = await emcfc.calc_footprint_for_trip(trip_object, labels, 'replaced_mode')
        return {
            'footprint': footprint,
            'replaced_footprint': replaced_footprint
        }

    # asyncio concurrency
    async def get_commute_data_task(row, index, expanded_ct, labels):
        """Compute one trip's footprints and write them into ``expanded_ct`` at ``index``."""
        commute_data = await get_commute_data(row, labels)
        footprint = commute_data['footprint'][0]

        # update fields with values from `commute_data` in expanded_ct directly
        if 'kg_co2' in footprint and 'kwh' in footprint:
            replaced_footprint = commute_data['replaced_footprint'][0]
            expanded_ct.at[index, 'cheer_kg_co2'] = footprint['kg_co2']
            expanded_ct.at[index, 'cheer_kwh'] = footprint['kwh']
            expanded_ct.at[index, 'cheer_replaced_kg_co2'] = replaced_footprint['kg_co2']
            expanded_ct.at[index, 'cheer_replaced_kwh'] = replaced_footprint['kwh']

            for uncertain_column in ['kg_co2_uncertain', 'kwh_uncertain']:
                if uncertain_column in replaced_footprint:
                    expanded_ct.at[index, f'cheer_replaced_{uncertain_column}'] = replaced_footprint[uncertain_column]

        # sanity check: a walking trip should not carry emissions. The freshly
        # computed footprint is checked; `row` is the snapshot taken before the
        # cheer_* values were written, so looking there could never fire.
        if row['Mode_confirm'] == 'Walk' and 'kg_co2' in footprint and footprint['kg_co2'] != 0:
            print('!!!!!!!!!!!!!!' * 2)

    # Create one task per trip and run all of them concurrently
    tasks = [
        get_commute_data_task(row, index, expanded_ct, default_json)
        for index, row in expanded_ct.iterrows()
    ]
    await asyncio.gather(*tasks)

    return expanded_ct



In [None]:

def plotter(passed_expanded_ct: pd.DataFrame, name_of_dataset: str):
    """Save bar charts comparing the naive and CHEER mean CO2 emissions per mode.

    Two charts are written, one with a linear y-axis ('normal') and one with a
    logarithmic y-axis ('log'), each as
    ``figures/<name_of_dataset>_<scale>.pdf`` and ``.png``.

    Parameters:
        passed_expanded_ct (pd.DataFrame): trips with 'Mode_confirm',
            'Mode_confirm_kg_CO2' and 'cheer_kg_co2' columns.
        name_of_dataset (str): used in the chart title and the file names.
    """
    # remove 'Air' and 'Other' from Mode_confirm,
    # because they really offset the others and made the others unreadable
    keep_rows = ~passed_expanded_ct['Mode_confirm'].isin(['Air', 'Other'])
    expanded_ct_filtered = passed_expanded_ct[keep_rows]

    # group by 'Mode_confirm' and calculate the mean for each mode
    grouped_data_filtered = expanded_ct_filtered.groupby('Mode_confirm')[['Mode_confirm_kg_CO2', 'cheer_kg_co2']].mean()

    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)

    # Define a function to create and save plots with specified y-axis scale
    def create_and_save_plot(scale: str):
        # One explicit figure per chart. Previously plt.figure() opened a
        # figure that pandas never drew on (plot() without `ax` makes its own),
        # and that blank figure was never closed.
        fig, ax = plt.subplots(figsize=(10, 6))

        # two bars for each mode: Mode_confirm_kg_CO2 (naive) and cheer_kg_co2
        grouped_data_filtered.plot(kind='bar', ax=ax, color=['lightblue', 'orange'], rot=45)

        # Title update based on scale
        ax.set_title(f'Comparison of CO2 Emissions for Each Mode of Transportation\n{name_of_dataset} Dataset ({scale.capitalize()} Scale)')
        ax.set_ylabel('Average CO2 Emissions (kg)')
        ax.set_xlabel('Mode of Transportation')

        # Apply scale settings
        if scale == 'log':
            ax.set_yscale('log')
            ax.set_ylim(0.01, 13)  # Set limit for log scale
            ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:g}'))
        else:
            ax.set_ylim(0, 13)  # Limit for normal scale

        # Set legend location to top left
        ax.legend(['Naive Calculation (Mode_confirm_kg_CO2)', 'CHEER Calculation (cheer_kg_co2)'], loc='upper left')

        fig.tight_layout()

        # Save the plot as PDF and PNG in the figures directory
        fig.savefig(f'figures/{name_of_dataset}_{scale}.pdf')
        fig.savefig(f'figures/{name_of_dataset}_{scale}.png')
        plt.close(fig)

    # Create and save normal scale plot
    create_and_save_plot(scale='normal')

    # Create and save logarithmic scale plot
    create_and_save_plot(scale='log')


In [None]:
# Output directory that plotter() saves its PDF/PNG charts into
os.makedirs('figures', exist_ok=True)

In [None]:
async def load_or_fetch_data(dataset_name: str,
                             original_df: pd.DataFrame):
    """
    Return the processed dataset, using ``<dataset_name>.pkl`` as an on-disk cache.

    When the pickle file exists it is loaded and returned as-is. Otherwise the
    dataset is computed with ``calculate_naive_and_cheer(original_df)``, written
    to the pickle file, and returned.

    Parameters:
        dataset_name (str): The name of the dataset; also the cache file's base name.
        original_df (pd.DataFrame): Raw trips to process; only used on a cache miss.

    Returns:
        The cached object, or the freshly computed DataFrame.
    """
    cache_path = f"{dataset_name}.pkl"

    # Cache hit: hand back whatever was pickled earlier.
    # NOTE: only load pickle files you produced yourself.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # Cache miss: compute asynchronously, then persist for the next run.
    data = await calculate_naive_and_cheer(original_df)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
    return data


# Smart Commute: computed on the first run, read back from smart_commute.pkl afterwards
smartcommute = await load_or_fetch_data("smart_commute",
                             df)

In [None]:
# Save the naive-vs-CHEER comparison charts for Smart Commute
plotter(smartcommute, "Smart Commute")

In [None]:
#path configuration
to_data_parent = "PaperVizualizations/Data/abby_ceo" #path to the parent folder, should contain program subfolders
to_mini_data = "PaperVizualizations/Data/mini_pilot/data" #path to the mini data folder, contains an analysis trips file
# to_data_folder = "PaperVizualizations" #data folder, where composite data files will be written/read

# survey column holding the participant id that the trips are joined on
survey_id_col = 'unique_user_id_autofilled_do_not_edit'

#loop over
folders = ['4c', 'cc', 'fc', 'pc', 'sc', 'vail_22']
datasets = []

for program in folders:
    print('\nstarting with ', program)

    #create dataset with surveys and trips
    trips = pd.read_csv(to_data_parent + '/' + program + '/analysis_confirmed_trip.csv')
    print(len(trips), 'trips')
    print(trips.perno.nunique(), 'people')

    surveys = pd.read_csv(to_data_parent + '/' + program + '/' + program + '_survey_household.csv')
    print(len(surveys), 'surveys')

    #drop any null ids
    socio_data = surveys.dropna(subset=[survey_id_col])
    print(len(socio_data), 'surveys after dropping null ids')

    #drop duplicates: after sorting by timestamp, keep the last row of each user
    socio_data = (
        socio_data
        .sort_values(by=[survey_id_col, 'timestamp'])
        .drop_duplicates(subset=[survey_id_col], keep='last')
    )
    print(len(socio_data),'surveys', socio_data[survey_id_col].nunique(), 'users after dropping duplicates')

    #prepare survey ids for merging (string ids, no leading or trailing whitespace!!)
    # `assign` returns a new frame, so nothing is written into a filtered slice
    socio_data = (
        socio_data
        .assign(user_id_socio=socio_data[survey_id_col].astype(str).str.strip())
        .drop(labels=survey_id_col, axis=1)
    )

    #prepare trip ids for merging: string ids, stripped, with all dashes removed
    trips['user_id_socio'] = (
        trips.perno.astype(str)
        .str.strip()
        .str.replace('-', '', regex=False)
    )

    #merge the data (inner join: trips without a matching survey are dropped)
    data = trips.merge(socio_data, on='user_id_socio')
    print(len(data), 'trips after merging')
    print(data.user_id_socio.nunique(), 'people after merging')

    # program label without any suffix, e.g. 'vail_22' -> 'vail'
    data['program'] = program.split('_')[0]

    #add to list of datasets
    datasets.append(data)

#merge them all together
# (index labels repeat across programs; calculate_naive_and_cheer re-indexes later)
full_data = pd.concat(datasets)
print(len(full_data), 'trips')
print(full_data.perno.nunique(), 'users')
# print(full_data.columns)

In [None]:
#
#
# TEMPORARY. REMOVE
#
# full_data = full_data[full_data['data_user_input_mode_confirm'] == 'walk'].head(55)
# full_data = full_data.head(20000)
# full_data

In [None]:
# Raw (uncleaned) mode labels present in the combined CanBikeCO trips
print(full_data['data_user_input_mode_confirm'].unique())

In [None]:
# CanBikeCO: computed on the first run, read back from canbikeco.pkl afterwards
canbikeco = await load_or_fetch_data("canbikeco",
                                     full_data)

In [None]:
# debug cheer

In [None]:
# Walk trips that nevertheless received a non-zero CHEER CO2 estimate
whatsthis = canbikeco[(canbikeco['Mode_confirm'] == 'Walk') & (canbikeco['cheer_kg_co2'] > 0)]
# Show the first such trip (the column drop below is commented out, so all columns are shown)
whatsthis.head(1)
# whatsthis_filtered = whatsthis.drop(columns=['cheer_kg_co2', 'cheer_kwh', 'cheer_replaced_kg_co2', 'cheer_replaced_kwh'])

# # Select the first row and convert it to a dictionary
# first_row_dict = whatsthis_filtered.iloc[0].to_dict()

# # Print the dictionary
# # print(first_row_dict)


In [None]:
# goodone = canbikeco[(canbikeco['Mode_confirm'] == 'Walk') & (canbikeco['cheer_kg_co2'] == 0)]
# print(goodone.head(1))

In [None]:

# # Concatenate the two DataFrames along rows (default)
# combined_df = pd.concat([whatsthis_filtered.iloc[0], goodone.head(1)], ignore_index=True)
# combined_df

In [None]:
# Save the naive-vs-CHEER comparison charts for CanBikeCO
plotter(canbikeco, "CanBikeCO")

In [None]:
# First two Walk trips, for eyeballing the computed columns
a_test = canbikeco[canbikeco['Mode_confirm'] == 'Walk'].head(2)
a_test
# a_test = pd.DataFrame(a_test
#                      )
# row = a_test.iloc[0]
# print(a_test.to_string())

In [None]:
# print(replaced_footprint)


In [None]:
# Bull Durham

In [None]:
# Extract the Bull archive once; an existing 'bull' directory means nothing to do.
if not os.path.isdir('bull'):
    # Fail early when the archive has not been downloaded yet.
    if not os.path.isfile('tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2.zip'):
        raise ValueError("i need the zip, ask TSDC")

    # Unzip the file to the 'bull' directory
    with zipfile.ZipFile('tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2.zip') as zip_ref:
        zip_ref.extractall('bull')
    print("Unzipped to 'bull' directory.")

In [None]:
# Confirmed trips of the Bull e-bike pilot, read from the extracted archive
bull = pd.read_csv('bull/tsdc-2022-bull-e-bike-pilot-program-study-full-survey-data 2/data/analysis_confirmed_trip.csv')

In [None]:
# Bull: computed on the first run, read back from bull-durham.pkl afterwards
bull_loaded = await load_or_fetch_data("bull-durham",
                                     bull)

In [None]:
# Save the naive-vs-CHEER comparison charts for Bull
# (the dataset name, including spaces and parentheses, becomes part of the file names)
plotter(bull_loaded, "Bull (Durham, NC) eBike")

In [None]:
# MassCEC

In [None]:
# Check if 'MassCEC' directory exists
# Extract the MassCEC archive once; an existing 'MassCEC' directory means nothing to do.
if not os.path.isdir('MassCEC'):
    # Check if the tar.gz file exists
    if not os.path.isfile('mass_jacques.tar.gz'):
        raise ValueError("I need the tar.gz file, ask TSDC")

    # Extract the tar.gz file to the 'MassCEC' directory
    with tarfile.open('mass_jacques.tar.gz', 'r:gz') as tar_ref:
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter rejects members that would land outside the
            # target directory (absolute paths, '..', escaping links).
            tar_ref.extractall('MassCEC', filter='data')
        else:
            # Older Python without extraction filters: unfiltered extraction,
            # so only use archives from a trusted source.
            tar_ref.extractall('MassCEC')
    print("Extracted to 'MassCEC' directory.")

In [None]:
# Confirmed trips of the MassCEC program, read from the extracted archive
mass = pd.read_csv('MassCEC/mass_jacques/analysis_confirmed_trip.csv')
# print(mass.columns)
# Raw (uncleaned) mode labels present in the data
print(mass['data_user_input_mode_confirm'].unique())

In [None]:
# mass[mass['data_user_input_mode_confirm'] == 'bus'].head(55)
# Number of raw trips, before calculate_naive_and_cheer drops unlabeled ones
print(len(mass))

In [None]:
# MassCEC: computed on the first run, read back from masscec.pkl afterwards
mass_loaded = await load_or_fetch_data("masscec",
                                       mass)


In [None]:
# mass_loaded[mass_loaded['Mode_confirm'] == 'bus'].head(55)
# Number of trips in the processed MassCEC table
print(len(mass_loaded))

In [None]:
# Cleaned mode bins present in the processed MassCEC table
print(mass_loaded['Mode_confirm'].unique())

In [None]:
# Save the naive-vs-CHEER comparison charts for MassCEC
plotter(mass_loaded, "MassCEC")

In [None]:
# expanded_ct.head(30)

In [None]:
# print(expanded_ct.head(3).to_string())