In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import sys
sys.path.append('/Users/mallen2/alternate_branches/eval-compatible-server/e-mission-server')

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.wrapper.user as ecwu

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import get_EC
import helper_functions as hf

import sklearn.model_selection as skm

from sklearn.model_selection import KFold
from sklearn import linear_model

# Conversion factor applied to every distance (stored in meters) before energy calculations.
METERS_TO_MILES = 0.000621371 # 1 meter = 0.000621371 miles
# Weight given to the e-car energy intensity when blending the sensed car energy intensity.
# Currently 0 (gas only); the commented-out alternative is 0.01 (~1% of cars on the road are electric).
ECAR_PROPORTION = 0 #0.01 #~1% of cars on the road are electric.

# Energy intensity table, read relative to the notebook's working directory.
df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv') # the r prefix makes this a raw string; it only matters for Windows-style paths

In [None]:
# run "Store_expanded_labeled_trips.ipynb" first.
%store -r expanded_labeled_trips 
unit_dist_MCS_df = pd.read_csv("unit_distance_MCS.csv").set_index("moment")
energy_dict = cm_handling.get_energy_dict(df_EI)

# TODO: maybe wrap the sensed-car energy intensity calculation below in a function.

# Drove-alone energy intensities for the two car types.
# NOTE: MODE_MAPPING_DICT (seen in confusion_matrix_handling.py) is currently mapping 'drove_alone'
# (from before the OpenPATH update that distinguished E-car and gas car) to 'Gas Car, drove alone.'
# MODE_MAPPING_DICT = {'drove_alone': 'Gas Car, drove alone', ...
gas_car_drove_alone_EI = energy_dict["Gas Car, drove alone"]
e_car_drove_alone_EI = energy_dict["E-car, drove alone"]

# car_load_factor is the number that the drove-alone energy intensity is divided by.
# With r = 1 it evaluates to (1+1)/(1+0.5) = 4/3.
r = 1
car_load_factor = (r+1)/(r+0.5)

# Blend e-car and gas car intensities according to the chance that a sensed car is electric.
sensed_car_drove_alone_EI = ECAR_PROPORTION*e_car_drove_alone_EI + (1-ECAR_PROPORTION)*gas_car_drove_alone_EI

# Account for the chance that a sensed car trip is a shared ride.
sensed_car_EI = sensed_car_drove_alone_EI/car_load_factor

# Register the blended value under its own key so it can be looked up like any other mode.
energy_dict.update({"Car, sensed": sensed_car_EI})

In [None]:
# Filter the labeled trips (drop_not_a_trip=False is passed to the helper) and then run the
# primary-mode helper, which presumably adds the primary_mode column used later in this notebook
# (confirm in helper_functions).
# NOTE(review): both calls overwrite expanded_labeled_trips, so re-running this cell feeds
# already-processed trips back into the helpers; reload with `%store -r` first for a clean re-run.
expanded_labeled_trips = hf.drop_unwanted_trips(expanded_labeled_trips,drop_not_a_trip=False)
expanded_labeled_trips = hf.get_primary_modes(expanded_labeled_trips,energy_dict,MODE_MAPPING_DICT)
print('Here are the number of labeled trips remaining in each program dataset:')
# Last expression of the cell: trip counts per program are displayed as the cell output.
expanded_labeled_trips.program.value_counts()

In [None]:
# Load the confusion matrices; the EI moments are derived from them in a later cell.
def _load_collapsed_confusion(csv_path):
    """Read a confusion matrix CSV indexed by 'gt_mode' and pass it through collapse_confusion_matrix."""
    raw_confusion = pd.read_csv(csv_path).set_index('gt_mode')
    return cm_handling.collapse_confusion_matrix(raw_confusion, rows_to_collapse={"Train": ["Train"]}, columns_to_collapse={})

android_confusion = _load_collapsed_confusion("android_confusion.csv")
ios_confusion = _load_collapsed_confusion("ios_confusion.csv")

# Trip distance is stored in meters; keep a miles version alongside it.
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips['distance']*METERS_TO_MILES
# Covariance between energy intensity and trip length, taken to be zero here.
EI_length_cov = 0

In [None]:
# Always recompute the EI moments here: if this step is skipped, the error for "expected" may differ,
# because a different saved version of the EI moments dataframe might be relied on instead.
android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(android_confusion, energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(ios_confusion, energy_dict)

# Lookup of EI moments by operating system, used by the plotting and variance helpers.
os_EI_moments_map = dict(ios=ios_EI_moments_df, android=android_EI_moments_df)

energy_consumption_df = get_EC.compute_all_EC_values(
    expanded_labeled_trips,
    unit_dist_MCS_df,
    energy_dict,
    android_EI_moments_df,
    ios_EI_moments_df,
    EI_length_cov,
    print_info=False,
)

In [None]:
# Build a one-row table of percent errors and print it as markdown.
# NOTE(review): presumably the map is keyed by program name with one percent error each — confirm
# against hf.get_program_percent_error_map.
program_percent_error_map = hf.get_program_percent_error_map(energy_consumption_df)
# index=[0] is required because the map holds scalars: it yields a single-row frame.
percent_error_df = pd.DataFrame(program_percent_error_map,index=[0])
percent_error_markdown = percent_error_df.round(2).to_markdown()
print(percent_error_markdown)

### Using aggregate distances when computing variance

In [None]:
# This cell plots the user labeled and expected aggregate energy consumptions on the left and right, respectively.
# The aggregate variance is requested with variance_method="aggregate_distance".
program_n_sd_map_aggregate_distance = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df,variance_method="aggregate_distance")
# The helper's return value is reported as the number of standard deviations from the mean.
print(f"number of standard deviations from mean: {program_n_sd_map_aggregate_distance}")

### Using the sum of individual variances

In [None]:
# This cell plots the user labeled and expected aggregate energy consumptions on the left and right, respectively.
# It uses the old method of getting aggregate variance: add up the individual trip variances,
# with no covariance term (variance_method="independent individual trips").
program_n_sd_map_individual_trips = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df,variance_method="independent individual trips")
print(f"number of standard deviations from mean: {program_n_sd_map_individual_trips}")

### Bar chart version!

In [None]:
# Bar chart of the aggregate energy consumption, drawn by the helper from the per-trip frame.
# This cell plots the user labeled and expected aggregate energy consumptions on the left and right, respectively.
hf.plot_aggregate_EC_bar_chart(energy_consumption_df)

## Now try with a Bayes update.

In [None]:
# Prior probabilities fixed by hand; every other ground-truth mode shares what is left equally.
# NOTE(review): an earlier cell adds the key "Car, sensed" to energy_dict, while this prior uses
# "Gas Car, sensed". If that label is not in android_confusion.index, n_other_modes is off by one
# and the prior over the matrix rows will not sum to 1 — confirm the row labels.
prior_probs_prespecified = {"Gas Car, sensed": 0.85, "Pilot ebike": 0.05}
n_other_modes = len(android_confusion.index) - len(prior_probs_prespecified)
probability_remaining = 1 - sum(prior_probs_prespecified.values())

prior_probs = dict(prior_probs_prespecified)
for gt_mode in android_confusion.index:
    if gt_mode not in prior_probs_prespecified:
        prior_probs[gt_mode] = probability_remaining/n_other_modes
#prior_probs = {x: 1/len(android_confusion.index) for x in android_confusion.index} # if you want a uniform prior.

# EI moments under the Bayes update, per operating system.
android_EI_moments_with_Bayes_update_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(android_confusion, energy_dict, prior_probs)
ios_EI_moments_with_Bayes_update_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(ios_confusion, energy_dict, prior_probs)
os_EI_moments_with_Bayes_update_map = dict(
    ios=ios_EI_moments_with_Bayes_update_df,
    android=android_EI_moments_with_Bayes_update_df,
)

energy_consumption_with_Bayes_update_df = get_EC.compute_all_EC_values(
    expanded_labeled_trips,
    unit_dist_MCS_df,
    energy_dict,
    android_EI_moments_with_Bayes_update_df,
    ios_EI_moments_with_Bayes_update_df,
    EI_length_cov,
    print_info=False,
)

In [None]:
# Aggregate distance method, applied to the Bayes-updated estimates and Bayes-updated EI moments.
# This cell plots the user labeled and expected aggregate energy consumptions on the left and right, respectively.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='aggregate_distance')
print(f"number of standard deviations from mean: {program_n_sd_map}")

In [None]:
# Spatial covariance method
# NOTE(review): despite the label above, this call passes variance_method="aggregate_distance" with
# the non-Bayes frame and moments, i.e. the same arguments as the earlier aggregate-distance cell.
# Confirm which variance_method value was intended here.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df,variance_method="aggregate_distance")
print(f"number of standard deviations from mean: {program_n_sd_map}")

## What are the proportions of each mode in mobilitynet?

In [None]:
# Element-wise sum of the android and iOS confusion matrices (pandas aligns on row/column labels).
all_mobilitynet_trips = android_confusion + ios_confusion
# Row totals: one value per ground-truth mode, since the matrices are indexed by gt_mode.
# NOTE(review): the name suggests the matrix entries are durations — confirm the units of the CSVs.
durations_in_modes = all_mobilitynet_trips.sum(axis=1)
mobility_net_mode_proportions = durations_in_modes/all_mobilitynet_trips.sum().sum() # share of the grand total held by each ground-truth mode in mobilitynet
# Emit the rounded proportions as a LaTeX table.
print(mobility_net_mode_proportions.round(2).to_latex())

In [None]:
# Display the unrounded mode proportions computed in the previous cell.
mobility_net_mode_proportions

In [None]:
# Demonstration that dividing each android confusion column by its column sum is
# equivalent to assuming that the data has the same prior mode distribution as the android trips in mobility net
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')#+ ios_confusion

# Prior over ground-truth modes taken from the android matrix alone: row totals over the android
# grand total. (Dividing by the combined android + iOS total left over from an earlier cell would
# give a "prior" that does not sum to 1 and would tie this cell to that cell's state; the posterior
# below is unaffected because the constant cancels in the normalization.)
durations_in_modes = android_confusion.sum(axis=1)
prior_mode_probs = durations_in_modes/android_confusion.sum().sum()

# Likelihood P(predicted | actual): each row divided by its row total.
p_predicted_given_actual = android_confusion.divide(android_confusion.sum(axis=1), axis='rows')

# Bayes rule: scale each row by its prior, then normalize every predicted-mode column to sum to 1.
likelihood_times_priors = p_predicted_given_actual.multiply(prior_mode_probs, axis='rows')
normalizing_constants = likelihood_times_priors.sum(axis='rows')
prob_actual_given_predicted_df = likelihood_times_priors.divide(normalizing_constants, axis='columns')
prob_actual_given_predicted_df

In [None]:
# Same Bayes-rule calculation for the android matrix, but with the prior over ground-truth modes
# taken from the combined android + iOS matrices.
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')#+ ios_confusion

# Prior: combined row totals over the combined grand total.
all_mobilitynet_trips = android_confusion + ios_confusion
durations_in_modes = all_mobilitynet_trips.sum(axis=1)
prior_mode_probs = durations_in_modes/all_mobilitynet_trips.sum().sum()

# Likelihood P(predicted | actual): android rows divided by their row totals.
android_row_totals = android_confusion.sum(axis=1)
p_predicted_given_actual = android_confusion.div(android_row_totals, axis=0)

# Weight each row by its prior, then normalize each predicted-mode column.
likelihood_times_priors = p_predicted_given_actual.mul(pd.Series(prior_mode_probs), axis=0)
normalizing_constants = likelihood_times_priors.sum(axis=0)
prob_actual_given_predicted_df = likelihood_times_priors.div(normalizing_constants, axis=1).copy()
prob_actual_given_predicted_df

In [None]:
def plot_energy_consumption_by_mode(energy_consumption_df,program_name, main_mode_labels = ('drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle')):
    """
    Draw a horizontal bar chart of expected vs. user labeled energy consumption per user labeled mode.

    energy_consumption_df:  trips dataframe with 'mode_confirm', 'expected' and 'user_labeled' columns.
    program_name:           name shown in the plot title.
    main_mode_labels:       mode_confirm values to show, in plotting order. Labels that do not occur
                            in the dataframe are skipped; all other (custom) labels are not shown.

    Returns nothing; the chart is drawn on a new matplotlib figure.
    """
    df = energy_consumption_df
    labels_present = set(df.mode_confirm.unique())
    program_main_mode_labels = [x for x in main_mode_labels if x in labels_present] # 4c doesn't have train before May 2022.

    # Select the two energy columns before summing, so the aggregation never touches the
    # non-numeric columns of the trips frame (ids, section lists, ...).
    program_main_modes_EC = df.groupby('mode_confirm')[['expected','user_labeled']].sum().loc[program_main_mode_labels]

    program_main_modes_EC.plot(kind='barh')
    # The percent error in the title covers every trip in df, not only the modes that are plotted.
    program_percent_error_expected = 100*hf.relative_error(df.expected.sum(),df.user_labeled.sum())
    plt.xlabel('Energy consumption (kWH)')
    plt.ylabel('user labeled mode')
    plt.title(f"Energy consumption estimates by user labeled mode for {program_name}\nCustom mode labels not shown\n(full % error for expected: {program_percent_error_expected:.2f})")

plot_energy_consumption_by_mode(energy_consumption_df,'all CEO + stage', main_mode_labels = ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle'])

In [None]:
# What fraction of the total distance across all CEO trips was labeled as pilot e-bike?
# (This is a share of distance, not of trip count, and a fraction rather than a percent.)
# Only the 'distance' column is summed, so non-numeric columns never enter the aggregation.
energy_consumption_df.groupby('mode_confirm')['distance'].sum()['pilot_ebike']/energy_consumption_df.distance.sum()

In [None]:
# this version of show_bootstrap shows the distribution of errors rather than expected values.
def show_bootstrap(df,program,os_EI_moments_map,unit_dist_MCS_df, print_results=True):
    """
    Bootstrap the trips in df and compare each resample's aggregate error with its estimated sd.

    df:                 trips dataframe with '_id', 'expected' and 'user_labeled' columns, plus
                        whatever get_EC.get_totals_and_errors needs.
    program:            label printed at the start of the run.
    os_EI_moments_map:  dictionary by operating system of energy intensity moments dataframes.
    unit_dist_MCS_df:   mean and variance estimates for unit distance trips.
    print_results:      if truthy, plot a histogram of the bootstrap errors and print a summary.

    Returns an array with one entry per bootstrap resample: |expected - user labeled| for that
    resample divided by the aggregate sd estimated for that same resample.
    """
    print(program)
    NB = 300  # number of bootstrap resamples
    df = df.set_index("_id")  # set_index returns a new frame; the caller's frame is untouched
    aggregate_EC_estimates = []
    aggregate_EC_actual = []
    sd_list = []
    for _ in range(NB):
        bootstrap_idx = np.random.choice(df.index,len(df),replace=True)
        bootstrap_sample = df.loc[bootstrap_idx]
        aggregate_EC_estimates.append(sum(bootstrap_sample.expected))
        aggregate_EC_actual.append(sum(bootstrap_sample.user_labeled))
        # The sd has to come from the resample itself: the question asked at the end is how the
        # error compares to the standard deviation each time. Computing it on the full frame
        # would append the same number NB times.
        sample_totals = get_EC.get_totals_and_errors(bootstrap_sample, os_EI_moments_map, unit_dist_MCS_df, include_autocovariance=False)
        sd_list.append(sample_totals['aggregate_sd'])

    aggregate_EC_estimates = np.array(aggregate_EC_estimates)
    aggregate_EC_actual = np.array(aggregate_EC_actual)
    errors = aggregate_EC_estimates - aggregate_EC_actual

    if print_results:
        # Totals and sd for the original (non-resampled) trips, for comparison with the bootstrap.
        totals_and_errors = get_EC.get_totals_and_errors(df, os_EI_moments_map, unit_dist_MCS_df, include_autocovariance=False)
        total_expected = totals_and_errors['total_expected']
        sd = totals_and_errors["aggregate_sd"]
        boot_mean = np.mean(aggregate_EC_estimates)
        boot_sd = np.sqrt(np.var(aggregate_EC_estimates))

        plt.hist(errors)

        print(f'our estimate: {total_expected:.2f}\nTrue value: {totals_and_errors["total_user_labeled"]:.2f}\nMean of bootstrap estimates: {boot_mean:.2f}')
        # Use the frame passed in (one program's trips), not the notebook-level dataframe.
        print(f'our error: {sum(df.expected - df.user_labeled):.2f}')

        print(f'our 1 sd interval: {total_expected - sd:.2f},{total_expected + sd:.2f}')
        print(f'bootstrap 1 sd interval: {boot_mean - boot_sd:.2f},{boot_mean + boot_sd:.2f}')
        print(f'bootstrap 2 sd interval: {boot_mean - 2*boot_sd:.2f},{boot_mean + 2*boot_sd:.2f}')

    # I want to know: how does the error compare to the standard deviation each time?
    return np.abs(errors)/np.array(sd_list)

#show_bootstrap(energy_consumption_df,'all', os_EI_moments_map, unit_dist_MCS_df)

In [None]:
all_errors_over_sd = np.array()
for program in energy_consumption_df.program.unique():
    error_over_sd = show_bootstrap(energy_consumption_df[energy_consumption_df.program == program],program, os_EI_moments_map, unit_dist_MCS_df, print_results= False)
    all_errors_over_sd = np.append(all_errors_over_sd, error_over_sd)

In [None]:
plt.hist(error_over_sd); plt.show()

In [None]:
# Calculate the mean and sd for all user labeled and for all sensed:
# NOTE(review): elt_with_errors_outliers_removed is not defined in any cell of this notebook as
# shown; it needs 'expected', 'user_labeled', 'confusion_var' and 'user_var' columns. Confirm
# where it is meant to come from before relying on a Restart & Run All.
mean_EC_all_sensing = sum(elt_with_errors_outliers_removed['expected'])
mean_EC_all_user_labeled = sum(elt_with_errors_outliers_removed['user_labeled'])

sd_sensed = np.sqrt(sum(elt_with_errors_outliers_removed['confusion_var']))
sd_users = np.sqrt(sum(elt_with_errors_outliers_removed['user_var']))

# Now calculate for various random splits of the data
# 10^3 NMC takes 10 seconds on vail to create all 4 splits.
proportion_sensed = [0.2,0.4,0.6,0.8]
NMC = 100#**2#**3

summary_df_map = {}
for ps in proportion_sensed:

    mean_EC_agg = []
    var_EC_agg = []
    error_EC_agg = []
    for j in range(0,NMC):
        # One fixed seed per iteration, so the same splits are reused for every proportion.
        rand_state = np.random.RandomState(1+j)

        # Split the labeled trips into a user labeled dataframe and a sensed dataframe
        user_labeled,sensed  = skm.train_test_split(elt_with_errors_outliers_removed ,
                                                    test_size = ps, # sensed
                                                    train_size = 1-ps,  # user_labeled
                                                    random_state= rand_state)
        mean_EC_sensed, var_EC_sensed = sum(sensed['expected']), sum(sensed['confusion_var'])

        mean_EC_user_labeled, var_EC_user_labeled = sum(user_labeled['user_labeled']), sum(user_labeled['user_var'])

        # Get the total mean and variance for the current iteration and add it to a list.
        current_aggregate_EC = mean_EC_sensed + mean_EC_user_labeled
        mean_EC_agg.append(current_aggregate_EC)
        var_EC_agg.append(var_EC_sensed + var_EC_user_labeled)
        error_EC_agg.append(current_aggregate_EC - mean_EC_all_user_labeled)

    # Convert the collected variances to standard deviations once per proportion, after the
    # loop, instead of rebuilding the whole array on every iteration.
    sd_EC_agg = np.sqrt(np.array(var_EC_agg))

    summary_df_map[ps] = pd.DataFrame({"mean": mean_EC_agg, "sd": sd_EC_agg, 'error': error_EC_agg})

# Not computed yet:
        # prop var sensed
        # prop var user labeled
# Average the per-split means and sds for each proportion of sensed trips.
average_summaries = {}
for ps in proportion_sensed:
    average_across_splits_mean = np.mean(summary_df_map[ps]["mean"])
    average_across_splits_sd = np.mean(summary_df_map[ps]["sd"])
    average_summaries[ps] = {"mean": average_across_splits_mean, "sd": average_across_splits_sd}

def get_interval(mean,sd):
    """Return [mean - sd, mean, mean + sd]: the center value with one sd on either side."""
    lower_bound = mean - sd
    upper_bound = mean + sd
    return [lower_bound, mean, upper_bound]

# One-sd intervals [mean - sd, mean, mean + sd] around the all-sensed and all-user-labeled totals.
# NOTE(review): the "_vail" suffix presumably names the dataset these totals were computed on — confirm.
interval_sensed_vail = get_interval(mean_EC_all_sensing,sd_sensed)
interval_users_vail = get_interval(mean_EC_all_user_labeled,sd_users)

In [None]:
# Open questions:
# - On some datasets we can be more certain than on others, and we might also be less biased,
#   but we don't know which datasets that is the case for.
# - How does the spatial covariance method do on the largely e-bike dataset?
# - How does the spatial covariance method do with my uniform prior?

### Calculate variance with sections or calculate mean with primary mode.

In [None]:
def compute_aggregate_variance_with_total_distance_from_sections(df, os_EI_moments_map, unit_dist_MCS_df):
    '''
    Finds total distances in each predicted mode and uses those totals in the final aggregate variance calculation.
    Distances are accumulated section by section, separately for each operating system.

    df: trips dataframe with 'os', 'section_modes' and 'section_distances' (meters) columns.
    os_EI_moments_map: dictionary by operating system of energy intensity moments dataframes, which store mean and variance of energy intensity
        for each predicted mode (columns "mean(EI)" and "variance(EI)").
    unit_dist_MCS_df: mean and variance estimates for unit distance trips, with one column per
        operating system and rows "mean" and "var".

    Returns the aggregate variance (var_total)
    '''
    var_total = 0

    for os_name in df.os.unique():
        single_os_trips = df[df.os == os_name]

        # Get OS specific trip length info.
        mean_for_unit_L = unit_dist_MCS_df[os_name]["mean"]
        var_for_unit_L = unit_dist_MCS_df[os_name]["var"]

        # Total distance (miles) per sensed mode, over this operating system's trips only.
        # Iterating over any other frame here would charge every OS with all trips.
        sensed_mode_distance_map = {}
        for _,ct in single_os_trips.iterrows():
            sections_lengths = np.array(ct["section_distances"])*METERS_TO_MILES
            for i, mode in enumerate(ct["section_modes"]):
                # Add to the total distance traveled in this mode.
                sensed_mode_distance_map[mode] = sensed_mode_distance_map.get(mode, 0) + sections_lengths[i]

        for mode, total_distance in sensed_mode_distance_map.items():
            mean_L = total_distance*mean_for_unit_L
            var_L = total_distance**2 * var_for_unit_L
            # The EI moments are looked up under 'train' for air_or_hsr sections.
            EI_mode = 'train' if mode == 'air_or_hsr' else mode

            mean_EI = os_EI_moments_map[os_name]["mean(EI)"][EI_mode]
            var_EI = os_EI_moments_map[os_name]["variance(EI)"][EI_mode]

            var_total += var_EI*mean_L**2 + var_L*mean_EI**2 #+ 2*cov(EI,L)*mean_EI*mean_L  if including covariance

    return var_total

In [None]:
# Aggregate variance with distances totalled per sensed mode from the individual sections.
var_based_on_sections = compute_aggregate_variance_with_total_distance_from_sections(expanded_labeled_trips, os_EI_moments_map, unit_dist_MCS_df)

# the version that I've been using takes the total distance in miles for the trip and groups by primary mode.
var_based_on_primary_modes = get_EC.compute_aggregate_variance(expanded_labeled_trips, os_EI_moments_map, unit_dist_MCS_df)
# Displayed as a pair of standard deviations: (section based, primary-mode based).
np.sqrt(var_based_on_sections), np.sqrt(var_based_on_primary_modes) # bigger difference than I expected.

In [None]:
def get_expected_EC_for_one_trip(ct, unit_dist_MCS_df,android_EI_moments, ios_EI_moments, EI_length_covariance):
    '''
    Finds the expected mean energy consumption and variance for a single trip.
    The variance is calculated with variance propagation of the energy intensity variance and the trip length variance.

    ct:                     confirmed trip. A row of a labeled trips dataframe, with 'os', 'section_modes'
                            and 'section_distances' (meters) entries.
    unit_dist_MCS_df:       dataframe containing the mean and variance of trip length for a 1 unit long trip, for both operating systems.
    android_EI_moments:     dataframe of energy intensity mean and variance for each mode sensed with android.
    ios_EI_moments:         dataframe of energy intensity mean and variance for each mode sensed with ios.

    EI_length_covariance:   (assumed to be 0). covariance between trip energy intensity and trip length.
        To use this, we would need to either find a value based on past user labels or estimate this with sensed energy consumption.
        I'm not sure whether this should be different for different sensed modes (ie, use a covariance conditional on sensed mode),
        since knowing the sensed mode tells us more information about the energy consumption than if we had no knowledge.

        With all CEO + stage user labels, I estimated EI_length covariance as 1.29.
        You might also need to add the covariance to each trip energy consumption estimate since E[XY] = E[X]E[Y] + cov(X,Y),
        but this could drastically overestimate energy consumption if we use a covariance of 1.2 for every trip,
        which would be similar to assigning every trip to drove alone or a higher intensity mode.

    Returns the expected energy consumption mean and variance as a tuple of floats: trip_mean_EC, trip_var_EC.
    A trip without sections returns (0, 0).
    '''
    #Initialize trip energy consumption
    trip_mean_EC = 0
    trip_var_EC = 0

    # Get operating system
    os = ct['os']

    # Get OS specific trip length info.
    mean_for_unit_L = unit_dist_MCS_df[os]["mean"]
    var_for_unit_L = unit_dist_MCS_df[os]["var"]

    # Get the sections of the trip: one sensed mode and one length (converted to miles) each.
    section_modes = ct["section_modes"]
    sections_lengths = np.array(ct["section_distances"])*METERS_TO_MILES   # 1 meter = 0.000621371 miles

    mean_L = sections_lengths*mean_for_unit_L
    var_L = sections_lengths**2 * var_for_unit_L

    for current_section in range(len(section_modes)):
        # EI mean and variance for the sensed mode of this section. The lookup lives in the
        # get_EC module, like the call in the primary-mode version of this calculation.
        # Later: switch to a map style function.
        mean_EI, var_EI = get_EC.get_EI_moments_for_trip(section_modes[current_section],os,android_EI_moments,ios_EI_moments)

        # Propagate variance for the section
        section_mean_L = mean_L[current_section]
        mean_EC = section_mean_L*mean_EI
        var_EC = var_EI*section_mean_L**2 + var_L[current_section]*mean_EI**2 + 2*EI_length_covariance*mean_EI*section_mean_L

        # Add to total - follows from assumed independence of section errors.  # Might want to consider dependence between sections.
        trip_mean_EC += mean_EC
        trip_var_EC += var_EC

    return trip_mean_EC, trip_var_EC

In [None]:
def get_expected_based_on_primary_mode_for_one_trip(ct, unit_dist_MCS_df, android_EI_moments, ios_EI_moments):
    '''
    Finds the expected energy consumption of one trip from its primary mode only.
    The primary mode is the sensed mode of the longest section; the whole trip distance is
    then treated as travelled in that mode.

    ct:                     confirmed trip. A row of a labeled trips dataframe, with 'os', 'section_modes',
                            'section_distances' and 'distance_miles' entries.
    unit_dist_MCS_df:       dataframe containing the mean and variance of trip length for a 1 unit long trip, for both operating systems.
    android_EI_moments:     dataframe of energy intensity mean and variance for each mode sensed with android.
    ios_EI_moments:         dataframe of energy intensity mean and variance for each mode sensed with ios.

    Returns the expected energy consumption for the trip (mean_EC).
    Raises ValueError if the trip has no sections.
    '''
    # Get operating system
    os = ct['os']

    # Get OS specific trip length info.
    mean_for_unit_L = unit_dist_MCS_df[os]["mean"]

    # Get primary mode: compare section distances with each other in their own units, and
    # collect the mode of every section that reaches the maximum.
    section_distances = np.asarray(ct["section_distances"], dtype=float)
    longest_section_distance = section_distances.max()
    longest_section_modes = [
        mode for mode, distance in zip(ct["section_modes"], section_distances)
        if distance == longest_section_distance
    ]
    primary_mode = longest_section_modes[0]

    # in case there are ever tied longest sections with different modes,
    # pick the most energy intensive mode.
    if len(set(longest_section_modes)) > 1:
        mini_energy_dict = {x:energy_dict[MODE_MAPPING_DICT[x]] for x in longest_section_modes}
        primary_mode = max(mini_energy_dict, key=mini_energy_dict.get)
        print(f"found a tie for longest section. Choosing {primary_mode}")

    mean_EI, var_EI = get_EC.get_EI_moments_for_trip(primary_mode,os,android_EI_moments,ios_EI_moments)

    # use longest section distance or use trip distance?
    # mean_EC = longest_section_distance*METERS_TO_MILES*mean_for_unit_L*mean_EI
    mean_EC = ct["distance_miles"]*mean_for_unit_L*mean_EI

    return mean_EC
    
def compute_all_EC_values_from_primary_mode(df, unit_dist_MCS_df,energy_dict, android_EI_moments_df,ios_EI_moments_df):
    """
    Compute the primary-mode based expected energy consumption for every trip in df.

    df:                     trips dataframe; each row is passed to
                            get_expected_based_on_primary_mode_for_one_trip.
    unit_dist_MCS_df:       forwarded to the per-trip calculation.
    energy_dict:            accepted but not used by this function.
    android_EI_moments_df:  forwarded to the per-trip calculation.
    ios_EI_moments_df:      forwarded to the per-trip calculation.

    Returns a copy of df with an 'expected' column holding the per-trip values; df itself is not modified.
    """
    print("Computing energy consumption for each trip.")

    # One expected energy consumption value per row, in row order.
    per_trip_expected = [
        get_expected_based_on_primary_mode_for_one_trip(trip, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df)
        for _, trip in df.iterrows()
    ]

    # Attach the values to a copy so the caller's dataframe stays unchanged.
    trips_with_expected = df.copy()
    trips_with_expected['expected'] = per_trip_expected
    return trips_with_expected

In [None]:
# Expected energy consumption per trip, based only on each trip's primary (longest-section) mode.
primary_mode_energy_df = compute_all_EC_values_from_primary_mode(expanded_labeled_trips,unit_dist_MCS_df,energy_dict, android_EI_moments_df,ios_EI_moments_df)

In [None]:
# Compare the aggregate expected energy consumption: (primary-mode based, from energy_consumption_df).
primary_mode_energy_df.expected.sum(), energy_consumption_df.expected.sum()

In [None]:
# NOTE(review): this cell repeats the comparison from the preceding cell verbatim; it can
# probably be deleted.
primary_mode_energy_df.expected.sum(), energy_consumption_df.expected.sum()

In [None]:
# Put the primary-mode estimate next to the existing per-trip values for a side-by-side look.
# The assignment aligns on the dataframe index, and it adds a column to energy_consumption_df in place.
energy_consumption_df['primary_expected'] = primary_mode_energy_df.expected
# NOTE(review): this displays the whole frame; consider .head() if it is large.
energy_consumption_df[['user_labeled', 'primary_expected', 'section_modes', 'mode_confirm','section_distances', 'distance' ]]

### What happens with a modeshare approach?
The resulting variance is very large.

In [None]:
#def calculate_EC_by_mode_share(df,android_confusion,ios_confusion)

# 1. split into android and ios dataframes
# 2. compute for each.
import itertools

# Mean trip length for a unit distance trip; 1.04 is from unit dist MCS.
unit_distance_mean = 1.04

# find a matrix of prob predicted given actual.
collapsed_confusion_matrix = cm_handling.collapse_confusion_matrix(android_confusion, rows_to_collapse={"Train": ["Train"]}, columns_to_collapse={})
# Move 40% of the 'Pilot ebike' row total from the 'bicycling' column to the 'car' column.
# NOTE(review): this can make the 'bicycling' entry negative if it holds less than 40% of the row — confirm.
duration_sensed_as_car_given_actual_ebike = 0.4*collapsed_confusion_matrix.loc['Pilot ebike'].sum()
collapsed_confusion_matrix.at['Pilot ebike','bicycling'] -=duration_sensed_as_car_given_actual_ebike
collapsed_confusion_matrix.at['Pilot ebike','car'] += duration_sensed_as_car_given_actual_ebike
# Normalize every predicted-mode column so it sums to 1: P(actual | predicted).
prob_actual_given_predicted_df = collapsed_confusion_matrix/collapsed_confusion_matrix.sum(axis=0)

# Total distance (miles) per sensed primary mode; only the distance column is summed.
sensed_mode_distances = energy_consumption_df.groupby("primary_mode")["distance_miles"].sum()

expected_EC = 0
var_EC = 0
primary_mode_distance_estimates = {}
primary_mode_dist_var_estimates = {}
for sensed_mode, sensed_distance in sensed_mode_distances.items():
    # air_or_hsr has no column of its own in the confusion matrix: its probabilities are looked up
    # under 'train', while its distance is still the one recorded for air_or_hsr. Estimates are
    # accumulated, so 'train' and 'air_or_hsr' trips both contribute to the 'train' entry.
    primary_mode = 'train' if sensed_mode == 'air_or_hsr' else sensed_mode
    primary_mode_distance_estimates.setdefault(primary_mode, 0)
    primary_mode_dist_var_estimates.setdefault(primary_mode, 0)
    for gt_mode in prob_actual_given_predicted_df.index:
        prob_gt_mode = prob_actual_given_predicted_df.loc[gt_mode][primary_mode]
        expected_distance = prob_gt_mode * sensed_distance * unit_distance_mean

        primary_mode_distance_estimates[primary_mode] += expected_distance

        # n = len(expanded_labeled_trips[expanded_labeled_trips.primary_mode == primary_mode])*
        var_in_mode_distance = prob_gt_mode*(1 - prob_gt_mode)*sensed_distance**2

        primary_mode_dist_var_estimates[primary_mode] += var_in_mode_distance
        # NOTE(review): the energy intensity used here is the one of the sensed mode, not of
        # gt_mode, so summing over gt_mode only rescales by the column total — confirm intent.
        expected_EC += energy_dict[MODE_MAPPING_DICT[primary_mode]]*expected_distance

        var_EC += var_in_mode_distance #*energy_dict[MODE_MAPPING_DICT[primary_mode]]**2

primary_mode_dist_sd_estimates = {mode: np.sqrt(variance) for mode, variance in primary_mode_dist_var_estimates.items()}
print(f"Expected, user labeled {expected_EC:.2f}, {energy_consumption_df.user_labeled.sum():.2f}")
print(f"sd: {np.sqrt(var_EC):.2f}")
# Based on this, using mode share by distance for EC is not great.