In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import sys
sys.path.append('/Users/mallen2/alternate_branches/eval-compatible-server/e-mission-server')

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.wrapper.user as ecwu

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import get_EC
import helper_functions as hf

import sklearn.model_selection as skm

from sklearn.model_selection import KFold
from sklearn import linear_model

# Unit conversion applied to trip distances throughout this notebook.
METERS_TO_MILES = 0.000621371 # 1 meter = 0.000621371 miles

# Energy intensity table; it is converted to `energy_dict` with cm_handling.get_energy_dict further down.
# NOTE(review): the path is relative, so the kernel's working directory presumably has to contain Public_Dashboard/ - confirm.
df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv') # r stands for raw string, only matters if the path is on Windows

In [None]:
import emission.core.get_database as edb

# Group user UUIDs by program and record each user's phone platform in os_map.
chosen_program = 'all'
all_user_list = []
programs_all = {}
for u in edb.get_uuid_db().find():         # add users to proper locations in programs
    # The program name is the part of the user email before the first underscore.
    program = u["user_email"].split("_")[0]    # This info is in the Stage_uuids collection of the database
    uuid = u["uuid"]
    programs_all.setdefault(program, []).append(uuid)
    all_user_list.append(uuid)

# Compare strings by value: `is not` tests object identity, which only works by accident of
# string interning and raises a SyntaxWarning on Python 3.8+.
user_list = programs_all[chosen_program] if chosen_program != 'all' else all_user_list
os_map = {}

# Iterate over a snapshot of the list: removing from user_list while iterating over it directly
# would skip the user that follows every removed one.
for u in list(user_list):
    profile = ecwu.User(u).getProfile()
    if 'curr_platform' in profile:
        os_map[u] = profile['curr_platform']
    else:
        print("Removed a user who had no OS information.")
        # user_list is the same list object as programs_all[chosen_program] (or all_user_list
        # when chosen_program == 'all'), so the removal is visible there as well.
        user_list.remove(u)
        no_os_user = u

In [None]:
# Collect all trips in the program specified earlier
# Then expand user inputs.
# You could instead load the file that "place_all_trips_in_pkl.py" generates
# NOTE(review): a later cell rebinds expanded_labeled_trips from a pickle file, so this result is
# only used by the cells in between - confirm that is intended.
expanded_labeled_trips = hf.get_expanded_labeled_trips(user_list)

In [None]:
# When we sense air_or_hsr, what are the trips like?
# Keep every trip that has a sensed air_or_hsr section but was not labeled 'air' by the user.
sensed_modes_list = []
user_modes_list = []
section_lengths_list = []
air_lengths_list = []

for i,ct in expanded_labeled_trips.iterrows():
    trip_section_modes = ct['section_modes']
    if len(trip_section_modes) == 0:
        continue

    sensed_air = "air_or_hsr" in trip_section_modes
    if not (sensed_air and ct['mode_confirm'] != 'air'):
        continue

    sensed_modes_list.append(trip_section_modes)
    user_modes_list.append(ct['mode_confirm'])
    section_lengths_list.append(ct['section_distances'])

    # Length of the first section that was sensed as air_or_hsr.
    air_position = trip_section_modes.index('air_or_hsr')
    air_lengths_list.append(ct['section_distances'][air_position])

sensed_air_df = pd.DataFrame({
    "sensed_sections": sensed_modes_list,
    "mode_confirm": user_modes_list,
    "air_length": air_lengths_list,
    "section_lengths": section_lengths_list,
})


In [None]:
# Summary statistics of the air_or_hsr section lengths collected in sensed_air_df.
sensed_air_df['air_length'].describe()

### What modes can we properly sense without substituting a "close enough" energy intensity?
Drove alone, walk, bike, bus, and train.

In [None]:
# Number of rows for each user-confirmed mode label.
expanded_labeled_trips.mode_confirm.value_counts()

In [None]:
# Base mode map for GIS. Not directly used in this notebook but nice to see.
# Keys are integer mode codes, values are the sensed-mode names used in this analysis;
# the trailing comment on each entry is the original enum name for that code.
gis_sensed_modes = {0 : 'no_sensed',    # UNKNOWN  #NOTE: this is important info to mention.
        1 : 'walking',    # WALKING
        2 : 'bicycling',    # BICYCLING
        3 : 'bus',        # BUS
        4 : 'train',      # TRAIN
        5 : 'car',        # CAR
        6 : 'air_or_hsr', # AIR_OR_HSR
        7 : 'subway',      # SUBWAY
        8 : 'train',      # TRAM
        9 : 'train',      # LIGHT_RAIL
}

# Get error related info
# NOTE(review): relative path - presumably resolved against the notebook's working directory.
unit_dist_MCS_df = pd.read_csv("unit_distance_MCS.csv").set_index("moment")
#android_EI_moments_df = pd.read_csv("android_EI_moments.csv").set_index("mode")
#ios_EI_moments_df = pd.read_csv("ios_EI_moments.csv").set_index("mode")

# Dictionary of energy intensities in kWH/PMT
energy_dict = cm_handling.get_energy_dict(df_EI)
#%store -r energy_consumption_df # to save time

# sensed_car (maps via MODE_MAPPING_DICT) -> “Gas Car, sensed” in energy dict, 
# which is used for the ground truth car intensity in get_conditional_EI_expectation_and_variance(). 
# Then the sensed mode will show car, but the EI used will be based on a car with a 1.5 person load factor.
#drove_alone_EI = energy_dict["Gas Car, drove alone"]
#load_factor = 1#1.5
#energy_dict.update({"Gas Car, sensed": drove_alone_EI/load_factor})

In [None]:
# This dataframe was generated in place_all_trips_in_pkl.py
# NOTE(review): absolute, user-specific path; read_pickle should only be used on files you trust.
df = pd.read_pickle("/Users/mallen2/OpenPATH_Data/Sensing_sensitivity_analysis/expanded_labeled_trips.pickle")
# Work on a copy so `df` keeps the unmodified trips; the commented-out filter restricts to one program.
expanded_labeled_trips = df.copy()#df[df['program'] == 'vail'].copy()

In [None]:
# Filter the trips, keeping trips labeled 'not_a_trip' (drop_not_a_trip=False).
# NOTE(review): this rebinds expanded_labeled_trips, so re-running the cell filters already-filtered data.
expanded_labeled_trips = hf.drop_unwanted_trips(expanded_labeled_trips,drop_not_a_trip=False)

# to double check when you're working later: 'not_a_trip' in expanded_labeled_trips.mode_confirm.unique()

# Later cells read the `primary_mode` column; presumably it is added by this helper - confirm.
expanded_labeled_trips = hf.get_primary_modes(expanded_labeled_trips,energy_dict,MODE_MAPPING_DICT)

In [None]:
# find out mode distance proportions for each program
proportion_columns = ['program','r', 'drove_alone_distance', 'shared_ride_distance','car_proportion', 'ebike_proportion', 'walk_proportion', 'drove_alone_proportion', 'shared_ride_proportion']

# Collect one record per program and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
proportion_records = []
for program in expanded_labeled_trips['program'].unique():
    program_df = expanded_labeled_trips[expanded_labeled_trips['program'] == program].copy()
    proportions = hf.get_ratios_for_dataset(program_df)
    proportions.update({'program': program})
    proportion_records.append(dict(proportions))

# Get the proportions for the full dataset
proportions = hf.get_ratios_for_dataset(expanded_labeled_trips)
proportions.update({'program': 'all'})
proportion_records.append(dict(proportions))

program_proportions = pd.DataFrame(proportion_records)
# Listed columns first (created as NaN if missing), then any extra keys the helper returned,
# which is the column layout that appending dicts to the pre-declared frame produced.
extra_columns = [c for c in program_proportions.columns if c not in proportion_columns]
program_proportions = program_proportions.reindex(columns=proportion_columns + extra_columns)

program_proportions = program_proportions.set_index("program")
#print(program_proportions.round(3).to_markdown())  # pip install tabulate
program_proportions

#### Use the confusion matrix to compute energy intensity expected values and variances given the predicted mode
New approach for using the confusion matrix: update the probability of mode x based on what we predict.
To use the old approach, use `get_conditional_EI_expectation_and_variance` instead. That function directly uses the columns for the conditional distributions.

In [None]:
# What is the covariance between EI and length?
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips['distance'] * METERS_TO_MILES

# Keep only trips that have a user-confirmed mode.
has_mode_label = expanded_labeled_trips['mode_confirm'].notna()
no_nan = expanded_labeled_trips[has_mode_label].copy()

no_nan['EI_friendly_mode'] = no_nan['mode_confirm'].map(MODE_MAPPING_DICT)

# Series.map with a dict yields NaN for labels that are not keys of the dict,
# so the mapped column can still contain NaN and is filtered a second time.
has_EI_mode = no_nan['EI_friendly_mode'].notna()
no_nan = no_nan[has_EI_mode].copy()
no_nan['EI'] = no_nan['EI_friendly_mode'].map(energy_dict)

# 2x2 covariance matrix of (EI, distance); the off-diagonal entry is the covariance.
EI_length_cov_matrix = np.cov(no_nan[['EI','distance_miles']].T)
EI_length_cov = EI_length_cov_matrix[0, 1]

EI_series, length_series = no_nan['EI'], no_nan['distance_miles']
mixed_var_covar_EI_length = get_EC.get_mixed_variance_covariance_term_for_nonlinear_variance_propagation(EI_series,length_series)
mixed_var_covar_length_EI = get_EC.get_mixed_variance_covariance_term_for_nonlinear_variance_propagation(length_series,EI_series)

EI_length_cov, mixed_var_covar_EI_length, mixed_var_covar_length_EI

In [None]:
def get_covariance(df):
    """Return the covariance between energy intensity and trip length (miles) for the trips in df.

    df needs the columns `distance` and `mode_confirm`. The input frame is not modified.
    Trips without a mode_confirm, or whose mode_confirm is not a key of MODE_MAPPING_DICT,
    are left out. Reads the notebook globals METERS_TO_MILES, MODE_MAPPING_DICT and energy_dict.
    """
    trips = df.copy()
    trips['distance_miles'] = trips['distance'] * METERS_TO_MILES

    labeled = trips[trips['mode_confirm'].notna()].copy()
    labeled['EI_friendly_mode'] = labeled['mode_confirm'].map(MODE_MAPPING_DICT)

    # Series.map with a dict yields NaN for unmapped labels, so filter again after mapping.
    labeled = labeled[labeled['EI_friendly_mode'].notna()].copy()
    labeled['EI'] = labeled['EI_friendly_mode'].map(energy_dict)

    # Off-diagonal entry of the 2x2 covariance matrix of (EI, distance_miles).
    covariance_matrix = np.cov(labeled[['EI','distance_miles']].T)
    return covariance_matrix[0, 1]

# This shows that the covariance is not consistent across programs.
# NOTE(review): this loop leaves `program_df` bound to the last program's subset of expanded_labeled_trips.
for program in expanded_labeled_trips.program.unique():
    program_df = expanded_labeled_trips[expanded_labeled_trips.program == program].copy()
    print(f"program, covariance: {program}, {get_covariance(program_df):.3f}")



In [None]:
# Same EI/length covariance, but split by the sensed primary mode instead of by program.
for primary_mode in expanded_labeled_trips.primary_mode.unique():
    primary_mode_df = expanded_labeled_trips[expanded_labeled_trips.primary_mode == primary_mode].copy()
    print(f"primary mode, covariance: {primary_mode}, {get_covariance(primary_mode_df):.3f}")

In [None]:
# TODO idea: add prior mode distribution as input to get_Bayesian_conditional_EI_expectation_and_variance
# Get the confusion matrices and then the EI moments from those.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

# NOTE(review): presumably r is the 'r' ratio reported by hf.get_ratios_for_dataset for the dataset - confirm.
r = 1  # 0.91 for vail, 0.71 for pc.
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
# Mutates energy_dict in place: the sensed-car intensity is the drove-alone intensity divided by the load factor.
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

# if you forget this step, the error for expected may be different, 
# since you might be relying on a different saved version of the EI_moments_dataframe
Bayesian_android_EI_moments_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
Bayesian_ios_EI_moments_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)

# EI_length_cov and the two mixed_var_covar terms are the globals computed in the covariance cell above.
energy_consumption_df = get_EC.compute_all_EC_values(expanded_labeled_trips,unit_dist_MCS_df,energy_dict,Bayesian_android_EI_moments_df,Bayesian_ios_EI_moments_df, \
    EI_length_cov, mixed_var_covar_EI_length, mixed_var_covar_length_EI, print_info=False)
    
energy_consumption_df['distance_miles'] = energy_consumption_df.distance*METERS_TO_MILES
# %store energy_consumption_df

# using the old method with all ceo, including not a trip: Percent errors for expected and for predicted, including outliers: 7.08, 12.75

# using the old method with all ceo, including not a trip: Percent errors for expected and for predicted, including outliers: 7.08, 12.75

In [None]:
# Autocovariance contribution to an aggregate variance: 2*(n_trips - k)*autocov_k summed over lags k = 1, 2.
# NOTE(review): `include_autocovariance`, `n_trips` and a bare `lagged_auto_cov` are not defined in the
# visible notebook (other cells call get_EC.lagged_auto_cov), and `df` here is the pickle-loaded frame.
# This looks like a fragment pasted from a function body and presumably fails on a fresh kernel - confirm.
cov_sum = 0
if include_autocovariance == True:
    if n_trips >= 50: # not calculating autocov if we do not have a large timeseries sample
        for k in range(1,3):
            expected_autocov_k = lagged_auto_cov(df.expected,k)
            cov_sum += (n_trips - k)*2*expected_autocov_k
    # used median values of the autocovariance across users that I found. Maybe I should just ignore autocovariance for small sets of trips.
    # (The else below pairs with `if n_trips >= 50`; 5.5 and 0.95 are those lag-1 and lag-2 medians.)
    else:
        cov_sum = (n_trips - 1)*2*5.5 + (n_trips - 2)*2*0.95  

In [None]:
# Lag-1 and lag-2 autocovariance of the expected energy consumption for each user,
# stored on every trip row of that user as a two-element list in the `auto_cov` column.
# NOTE(review): trips are not sorted by end_ts here, unlike the autocov_df cell further down - confirm.
user_auto_cov_map = {}
for user in energy_consumption_df.user_id.unique():
    # Deliberately not named `df`: that global holds the trips loaded from the pickle, and
    # rebinding it here would make a later re-run of the `df.copy()` cell pick up one user's trips.
    user_trips_df = energy_consumption_df[energy_consumption_df.user_id == user].copy()

    user_auto_cov_map[user] = [get_EC.lagged_auto_cov(user_trips_df.expected, k) for k in range(1,3)]

energy_consumption_df['auto_cov'] = energy_consumption_df.user_id.map(user_auto_cov_map)
        

In [None]:
# Inspect the [lag 1, lag 2] autocovariance list stored for the row with index label 0.
energy_consumption_df.auto_cov[0]

#### Look at how the covariance between EI and length impacts the variance estimates.
In the fold results dataframe, `trip_level_within_2_sd_proportion` tells you what proportion of the trip EC estimates in the test set for that fold are within 2 standard deviations of the user labeled value.
`error_over_sd` tells you how many standard deviations from the truth the aggregate EC estimate is.

In [None]:
# More folds leads to smaller test sets and smaller aggregate energy consumption errors relative to the standard deviation.
kf = KFold(5, shuffle=True, random_state=2) 

# Get the confusion matrices and then the EI moments from those.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

r = 1  # 0.91 for vail, 0.71 for pc.
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

# if you forget this step, the error for expected may be different, 
# since you might be relying on a different saved version of the EI_moments_dataframe
Bayesian_android_EI_moments_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
Bayesian_ios_EI_moments_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)

# One record per fold; the frame is built once after the loop. DataFrame.append was removed in
# pandas 2.0. 'error_over_sd' is listed explicitly because every record carries it.
fold_result_columns = ['trip_level_within_2_sd_proportion', 'total_expected_EC', 'total_user_labeled_EC','percent_error', 'signed_error', 'aggregate_sd', 'error_over_sd']
fold_records = []

for train_index, test_index in kf.split(expanded_labeled_trips):
    training_set = expanded_labeled_trips.iloc[train_index].copy()
    test_set = expanded_labeled_trips.iloc[test_index].copy()

    # Estimate the EI/length covariance terms from the training fold only.
    training_set['distance_miles'] = training_set.distance*METERS_TO_MILES
    no_nan = training_set[training_set['mode_confirm'].notna()].copy()  

    no_nan['EI_friendly_mode'] = no_nan.mode_confirm.map(MODE_MAPPING_DICT)

    # Series.map with a dict yields NaN for labels missing from the dict, so drop NaN again.
    no_nan = no_nan[no_nan['EI_friendly_mode'].notna()].copy()
    no_nan['EI'] = no_nan.EI_friendly_mode.map(energy_dict)

    EI_length_cov_matrix = np.cov(no_nan[['EI','distance_miles']].transpose())
    EI_length_cov = EI_length_cov_matrix[0][1]

    mixed_var_covar_EI_length = get_EC.get_mixed_variance_covariance_term_for_nonlinear_variance_propagation(no_nan['EI'],no_nan['distance_miles'])
    mixed_var_covar_length_EI = get_EC.get_mixed_variance_covariance_term_for_nonlinear_variance_propagation(no_nan['distance_miles'],no_nan['EI'])
    
    # Apply those terms to the held-out fold.
    test_energy_consumption_df = get_EC.compute_all_EC_values(test_set,unit_dist_MCS_df,energy_dict,Bayesian_android_EI_moments_df,Bayesian_ios_EI_moments_df, 
                                                            EI_length_cov, mixed_var_covar_EI_length, mixed_var_covar_length_EI,  print_info = False)

    expected, predicted, actual = sum(test_energy_consumption_df['expected']), sum(test_energy_consumption_df['predicted']), sum(test_energy_consumption_df['user_labeled'])
    # Aggregate standard deviation: square root of the summed per-trip variances.
    sd = np.sqrt(test_energy_consumption_df.confusion_var.sum())
    signed_error = expected - actual
    error_over_sd = abs(signed_error/sd)

    # k here is number of standard deviations.
    k = 2
    # count the number of times that the error magnitude is less than k times the standard deviation
    within_2_sd_proportion = sum(k*test_energy_consumption_df['confusion_sd'] > abs(test_energy_consumption_df['error_for_confusion']))/len(test_energy_consumption_df)
    percent_error = hf.relative_error(expected,actual)*100

    fold_records.append({'trip_level_within_2_sd_proportion': within_2_sd_proportion, 'total_expected_EC': expected, 
                'total_user_labeled_EC': actual, 'percent_error': percent_error, 'signed_error': signed_error, 
                'aggregate_sd': sd, 'error_over_sd': error_over_sd})

# This dataframe stores the results calculated for each split of the dataset.
fold_results = pd.DataFrame(fold_records, columns=fold_result_columns)

In [None]:
# One row per cross-validation fold.
fold_results

In [None]:
# The first value is a fraction in [0, 1], so it is labeled as a proportion rather than a percent.
print(f"Proportion of folds in which the error is above two standard deviations: {sum(fold_results.error_over_sd > 2)/len(fold_results)}")
print(f"Maximum error relative to standard deviation: {max(fold_results.error_over_sd):.2f}")

In [None]:
# NOTE(review): on a top-to-bottom run `program_df` is whatever an earlier loop left behind (the last
# program's subset of expanded_labeled_trips, not of energy_consumption_df) - confirm this is intended.
get_EC.get_totals_and_errors(program_df, include_autocovariance = True)

In [None]:
# How many standard deviations are in the error for each program if we include covariance between EI and length?
n_standard_devs = []
program_size = []

for program in energy_consumption_df.program.unique():
    program_df = energy_consumption_df[energy_consumption_df.program == program]

    print(f"program: {program}")
    # TODO: change this so that only the autocov for each user is used?
    n_standard_devs.append(get_EC.get_totals_and_errors(program_df, include_autocovariance = True)['error_over_sd'])

    program_size.append(len(program_df))

    print(f"{program}, {n_standard_devs[-1]:.2f}, {len(program_df)}")

# Label the axes so the figure can be read without the code.
plt.scatter(program_size,n_standard_devs)
plt.xlabel("Number of trips in program")
plt.ylabel("Number of standard deviations from truth")

In [None]:
# Degree-1 least-squares fit; np.polyfit returns [slope, intercept] of error_over_sd against program size.
np.polyfit(program_size,n_standard_devs, 1)

In [None]:
# How does the error change as we increase the number of trips? 
# It looks like the error relative to the standard deviation increases.
# I think we can safely say that trips between different users will not be strongly correlated.

n_standard_devs = []
n_trips_by_user = []
percent_error = []

no_sensed_proportion = []
walking_proportion = []

for u in energy_consumption_df.user_id.unique():
    
    user_df = energy_consumption_df[energy_consumption_df.user_id == u]

    totals_and_errors = get_EC.get_totals_and_errors(user_df, include_autocovariance= True)
    n_standard_devs.append(totals_and_errors['error_over_sd'])

    if totals_and_errors['autocov_sum'] < 0: 
        print(f"autocov sum, n_trips, error_over_sd: {totals_and_errors['autocov_sum']:.2f}, {len(user_df)}, {totals_and_errors['error_over_sd']:.3f}")
    n_trips_by_user.append(len(user_df))
    percent_error.append(totals_and_errors["percent_error_for_expected"])

    # Share of this user's distance per sensed primary mode. Select `distance` before summing:
    # summing every column also tries to add the object columns, which pandas 2 no longer
    # drops silently.
    primary_mode_proportion = user_df.groupby('primary_mode')['distance'].sum()/user_df['distance'].sum()
    no_sensed_proportion.append(primary_mode_proportion.get('no_sensed', 0))
    walking_proportion.append(primary_mode_proportion.get('walking', 0))

    # Keep the trips of a user with a very large error for the inspection cells below.
    # If several users exceed the threshold the last one wins; if none do, the names stay undefined.
    if totals_and_errors['error_over_sd'] > 15:
        large_error_user_df = user_df.copy()
        large_error = totals_and_errors['error_over_sd']

plt.scatter(n_trips_by_user,n_standard_devs)
plt.xlabel("Number of trips for a user")
plt.ylabel("Number of standard deviations from truth")
plt.title("Energy consumption error relative to estimated standard deviation")

In [None]:
# Distribution of the per-user error_over_sd values.
plt.hist(n_standard_devs, bins=50)

In [None]:
# Per-user walking distance share (x) against error_over_sd (y).
plt.scatter(walking_proportion,n_standard_devs)
fig = plt.gcf()
fig.set_size_inches(10,10)

In [None]:
# how does error over sd vary with no sensed and walking?
user_sd_info = pd.DataFrame({"no_sensed_proportion":no_sensed_proportion, "walking_proportion":walking_proportion, "n_trips":n_trips_by_user, "n_standard_devs":n_standard_devs})
# Only users with more than 50 trips.
enough_trips = user_sd_info.query('n_trips > 50').copy()

# Both series share one set of axes, so label them; otherwise they differ only by color.
plt.scatter(enough_trips['no_sensed_proportion'],enough_trips['n_standard_devs'], label="no_sensed proportion")
plt.scatter(enough_trips['walking_proportion'],enough_trips['n_standard_devs'], label="walking proportion")
plt.xlabel("Proportion of user distance with this primary mode")
plt.ylabel("Number of standard deviations from truth")
plt.legend()
fig = plt.gcf()
fig.set_size_inches(10,10)

In [None]:
# Distribution of the no_sensed distance share among users with more than 50 trips.
enough_trips.hist("no_sensed_proportion")

In [None]:
from sklearn import linear_model

# Regress error_over_sd on the no_sensed and walking distance shares (users with more than 50 trips).
LR = linear_model.LinearRegression()
# fit() returns the estimator itself, so lr_fit and LR refer to the same object.
lr_fit = LR.fit(enough_trips[['no_sensed_proportion','walking_proportion']],enough_trips['n_standard_devs'])
# to do one at a time: LR.fit(np.array(enough_trips['no_sensed_proportion']).reshape(-1, 1),enough_trips['n_standard_devs']).coef_

lr_fit.coef_,lr_fit.intercept_

In [None]:
# Slope of error_over_sd against the number of trips, over all users.
# Fit a fresh estimator: LinearRegression.fit returns the estimator itself, so refitting `LR` would
# silently replace the coefficients that `lr_fit` (the same object) reports for the two-feature fit.
linear_model.LinearRegression().fit(np.array(user_sd_info['n_trips']).reshape(-1, 1),user_sd_info['n_standard_devs']).coef_

In [None]:
# To generate n_sd_no_auto_cov_no_nonlinear, I ran compute_all_EC_values without the mixed_var_covar terms,
# and I ran get_totals_and_errors with include_autocovar = False.
# NOTE(review): on a plain top-to-bottom run n_standard_devs is the very list user_sd_info was built from,
# so both columns are identical; the comparison needs the manual re-run described above.
before_after_n_sd = pd.DataFrame({"n_sd_no_auto_cov_no_nonlinear": n_standard_devs, "n_sd_with_auto_cov_and_nonlinear": user_sd_info['n_standard_devs']})
before_after_n_sd.plot()#.scatter(x = "n_sd_no_auto_cov_no_nonlinear",y = "n_sd_with_auto_cov_and_nonlinear")

In [None]:
# inspecting the modes used by a user with a large error over sd.
# Only the last expression of a cell is rendered, so the table is shown explicitly with display().
# The two columns are selected before summing so object columns are never added up.
display(large_error_user_df.groupby('mode_confirm')[['error_for_confusion', 'distance_miles']].sum())

# up next: look at the shared rides for this user. why is shared ride error negative?. Do the mode distance prediction thing.
# Looks like a large percent of shared ride trips (and for other modes) for this user was no sensed and walk.
#user label: shared_ride
#    primary_mode
#    car          0.044047
#    no_sensed    0.655563
#    walking      0.300390

# look at percent no sensed for all users?
large_error_user_df.primary_mode.hist()

In [None]:
def print_prediction_percentages(df):
    """Print, for each main user-labeled mode, the share of distance per sensed primary mode.

    df needs the columns `mode_confirm`, `primary_mode` and `distance`. For every mode in the
    fixed list below, the mode name is printed followed by a Series indexed by primary_mode
    whose values are that primary mode's fraction of the distance of trips labeled with the mode.
    A mode with no trips prints an empty Series. Returns None.
    """
    for mode in ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike']:
        mode_df = df[df['mode_confirm'] == mode]

        print(mode)
        #print(mode_df.primary_mode.value_counts(normalize=True)) # prediction percentages by mode count

        # Sum only the distance column: summing every column also tries to add object columns.
        distance_by_primary_mode = mode_df.groupby('primary_mode')['distance'].sum()
        print(distance_by_primary_mode/mode_df['distance'].sum()) # prediction percentages by distance

# NOTE(review): user_df is whatever the per-user loop above left behind, i.e. the last user iterated - confirm.
print_prediction_percentages(user_df)

In [None]:
# Sensed primary modes of shared_ride trips: the user left in user_df vs. the large-error user.
# NOTE(review): both histograms are presumably drawn onto the same axes - confirm that overlay is wanted.
user_df.query('mode_confirm == "shared_ride"').primary_mode.hist()
large_error_user_df.query('mode_confirm == "shared_ride"').primary_mode.hist()

In [None]:
# Sensed primary modes for the shared_ride trips of the large-error user.
shared_ride_large_user_error_df = large_error_user_df[large_error_user_df['mode_confirm'] == 'shared_ride']
# Call hist(); without the parentheses the cell only shows the bound method and draws nothing.
shared_ride_large_user_error_df.primary_mode.hist()

In [None]:
# Totals and errors for the large-error user, this time without the autocovariance term.
get_EC.get_totals_and_errors(large_error_user_df,include_autocovariance=False)

In [None]:
# Plot the large-error user's error split by primary mode.
ratios_for_user = hf.get_ratios_for_dataset(large_error_user_df)
r_for_dataset = ratios_for_user['r']
percent_error_expected = hf.relative_error(sum(large_error_user_df['expected']),sum(large_error_user_df['user_labeled']))*100
percent_error_predicted = hf.relative_error(sum(large_error_user_df['predicted']),sum(large_error_user_df['user_labeled']))*100
# NOTE(review): despite the name this is a sum, not a mean - confirm which one the plot helper expects.
mean_EC_all_user_labeled = sum(large_error_user_df['user_labeled'])
# NOTE(review): absolute, user-specific output directory.
output_path = "/Users/mallen2/OpenPATH_Data/Sensing_sensitivity_analysis/"+"vail"+"_"+"mode_error_share"+"/" # might not actually be a vail user
# `r` is the notebook global set in the energy consumption cells above.
hf.plot_error_by_primary_mode(large_error_user_df,'large_error_user', r_for_dataset, r, percent_error_expected,percent_error_predicted, mean_EC_all_user_labeled, output_path)

In [None]:
# Share of users whose aggregate error is below 1, 2, 3 and 7 standard deviations.
n_samples = len(n_standard_devs) # also == the number of users
n_sd_array = np.array(n_standard_devs)
below_1, below_2, below_3, below_7 = (
    sum(n_sd_array < threshold)/n_samples for threshold in (1, 2, 3, 7)
)

print(f"Proportion within:\n1 sd: {below_1:.4f}\n2 sd: {below_2:.4f}\n3 sd: {below_3:.4f}\n7 sd: {below_7:.4f}")

In [None]:
# What percent of primary mode predictions is correct?

main_mode_confirms = ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle', 'not_a_trip']
main_modes_df = expanded_labeled_trips[expanded_labeled_trips.mode_confirm.isin(main_mode_confirms)].copy()
main_modes_df = main_modes_df[main_modes_df.mode_confirm.notna()]

# User labels that count as a correct prediction for a sensed primary mode even though
# MODE_MAPPING_DICT maps the two to different entries.
labels_accepted_for_primary_mode = {
    'car': ['shared_ride', 'taxi'],
    'bicycling': ['pilot_ebike'],
    'bus': ['free_shuttle'],
}

match_count = 0
for _,ct in main_modes_df.iterrows():
    sensed_mode, labeled_mode = ct['primary_mode'], ct['mode_confirm']
    # The dictionary comparison is only evaluated when none of the special pairs applies.
    if labeled_mode in labels_accepted_for_primary_mode.get(sensed_mode, []) \
            or MODE_MAPPING_DICT[sensed_mode] == MODE_MAPPING_DICT[labeled_mode]:
        match_count += 1

# The version below doesn't count a car prediction as correct for shared ride.
#sum(main_modes_df.mode_confirm.map(MODE_MAPPING_DICT)== main_modes_df.primary_mode.map(MODE_MAPPING_DICT))/len(main_modes_df)

print(match_count/len(main_modes_df)*100)  # 65.75% if we exclude not_a_trip, 63.50% if we include not_a_trip

In [None]:
# What fraction of the distance are we correctly predicting?

# Note: MODE_MAPPING_DICT["no_sensed"] == MODE_MAPPING_DICT["not_a_trip"]   # both give 'Not a Trip'

# Sum the length of every sensed section whose mode agrees with the user label,
# then divide by the total trip distance.
# NOTE(review): assumes section_distances and distance use the same unit - confirm.
match_distance = 0
for _,ct in main_modes_df.iterrows():
    if len(ct['section_modes']) == 0:
        print(f"No sections sensed for a {ct['mode_confirm']} trip.")
    for i,s in enumerate(ct['section_modes']):
        if (s == 'car') and (ct['mode_confirm'] in ['shared_ride', 'taxi']):
            match_distance += ct['section_distances'][i]
        elif (s == 'bicycling') and (ct['mode_confirm'] == 'pilot_ebike'):
            match_distance += ct['section_distances'][i]
        elif (s == 'bus') and (ct['mode_confirm'] == 'free_shuttle'):
            # Add the section length here too. Incrementing match_count (the trip counter of the
            # previous cell) dropped these sections from the distance total.
            match_distance += ct['section_distances'][i]
        elif MODE_MAPPING_DICT[s] == MODE_MAPPING_DICT[ct['mode_confirm']]:
            match_distance += ct['section_distances'][i]


print(100*match_distance/main_modes_df.distance.sum()) 

In [None]:
# Make the same plot but this time select random samples of trips rather than all trips from the same user.
# What do these show? They show that the relationship between number of standard deviations and the number of trips becomes less prominent when trips sets are not split by user.
# This suggests that there is more dependence between trip energy consumptions for the same user than for trips taken by different users.
def plot_n_trips_vs_fraction_of_error(energy_consumption_df, random_state_for_sampling):
    """Scatter error_over_sd against sample size for random trip samples.

    For each user in energy_consumption_df, a random sample with as many trips as that user has is
    drawn from the whole frame (seeded with random_state_for_sampling) and passed to
    get_EC.get_totals_and_errors with the autocovariance term enabled. A new figure is created;
    nothing is returned.
    """
    n_standard_devs, n_trips_by_user, percent_error = [], [], []

    for user_id in energy_consumption_df.user_id.unique():
        # Sample size = number of trips recorded for this user.
        n_trips = int((energy_consumption_df.user_id == user_id).sum())
        random_trips = energy_consumption_df.sample(n_trips, random_state=random_state_for_sampling)

        sample_totals = get_EC.get_totals_and_errors(random_trips, include_autocovariance=True)
        n_standard_devs.append(sample_totals['error_over_sd'])
        n_trips_by_user.append(n_trips)
        percent_error.append(sample_totals["percent_error_for_expected"])

    plt.figure()
    plt.scatter(n_trips_by_user, n_standard_devs)
    plt.xlabel("Number of trips in random set of trips")
    plt.ylabel("Number of standard deviations from truth")
    plt.title(f"Energy consumption error relative to estimated standard deviation, random_state = {random_state_for_sampling}")

# Repeat the plot for four different sampling seeds (1 to 4).
for j in range(1,5):
    plot_n_trips_vs_fraction_of_error(energy_consumption_df,random_state_for_sampling=j)

In [None]:
# TODO: read up about autocorrelation

def acf(x, length=20):  # https://stackoverflow.com/questions/643699/how-can-i-use-numpy-correlate-to-do-autocorrelation
    """Return the autocorrelation of x at lags 0 .. length-1 as a numpy array.

    Lag 0 is 1 by definition; the value at lag i is the Pearson correlation between
    x without its last i elements and x without its first i elements.
    """
    coefficients = [1]
    for lag in range(1, length):
        leading, trailing = x[:-lag], x[lag:]
        coefficients.append(np.corrcoef(leading, trailing)[0, 1])
    return np.array(coefficients)

# Plot user labeled vs. expected trip energy consumption over time for the first six users.
for u in energy_consumption_df.user_id.unique()[0:6]:
    
    user_df = energy_consumption_df[energy_consumption_df.user_id == u]
    # Order the trips chronologically by end timestamp.
    user_df = user_df.sort_values(by='end_ts', ascending=True)

    # These totals are computed but not used further in this cell.
    total_labeled, total_expected = sum(user_df.user_labeled), sum(user_df.expected)

    fig = plt.figure()
    fig.set_size_inches(20,4)
    plt.plot(user_df.end_ts,user_df.user_labeled)
    plt.plot(user_df.end_ts,user_df.expected)
    plt.legend(["user labeled EC", "expected EC"])
    plt.ylabel("Trip energy consumption (kWH)")
    plt.xlabel("Trip end timestamp")


In [None]:
# A histogram of trip energy consumption.
# Distribution of the per-trip energy consumption computed from the user labels.
energy_consumption_df.user_labeled.hist(bins=50)

#### Look at the autocovariance at a few lags. Plots of info in autocov_df come after the cell below.

In [None]:
# Autocovariance of the user labeled and the expected energy consumption at lags 1, 2 and 10, per user.
# Records are collected in a list and the frame is built once: DataFrame.append was removed in
# pandas 2.0 and appending row by row is quadratic.
autocov_records = []
for u in energy_consumption_df.user_id.unique():
    
    user_df = energy_consumption_df[energy_consumption_df.user_id == u]
    # The lags are only meaningful on chronologically ordered trips.
    user_df = user_df.sort_values(by='end_ts', ascending=True)
    if len(user_df)<2: 
        print(f"Skipping a user with {len(user_df)} trips.") 
        continue
    # NOTE(review): lag 10 is also requested for users with fewer than 11 trips - confirm how
    # get_EC.lagged_auto_cov behaves in that case.
    labeled_autocov_1, expected_autocov_1 = get_EC.lagged_auto_cov(user_df.user_labeled,1), get_EC.lagged_auto_cov(user_df.expected,1)
    labeled_autocov_2, expected_autocov_2 = get_EC.lagged_auto_cov(user_df.user_labeled,2), get_EC.lagged_auto_cov(user_df.expected,2)
    labeled_autocov_10, expected_autocov_10 = get_EC.lagged_auto_cov(user_df.user_labeled,10), get_EC.lagged_auto_cov(user_df.expected,10)

    autocov_records.append({ "user_id": u,
                             "n_trips": len(user_df),
                             "labeled_EC_autocov_lag_1": labeled_autocov_1, 
                             "labeled_EC_autocov_lag_2":labeled_autocov_2,
                             "expected_EC_autocov_lag_1": expected_autocov_1,
                             "expected_EC_autocov_lag_2": expected_autocov_2,
                             "expected_EC_autocov_lag_10": expected_autocov_10,
                             "labeled_EC_autocov_lag_10": labeled_autocov_10
                             })

autocov_df = pd.DataFrame(autocov_records)


In [None]:
# Lag-1 autocovariance of the user labeled EC (x) against that of the expected EC (y), one point per user.
plt.figure()
plt.scatter(autocov_df.labeled_EC_autocov_lag_1,autocov_df.expected_EC_autocov_lag_1)

# with more recorded trips, our autocov estimate gets closer to the truth.
plt.figure()
# The plotted difference is labeled minus expected.
plt.scatter(autocov_df.n_trips, autocov_df.labeled_EC_autocov_lag_1 - autocov_df.expected_EC_autocov_lag_1)
plt.xlabel("Number of trips")
plt.ylabel("Difference between expected EC autocov and user labeled EC autocov at lag = 1")

In [None]:
# Summary statistics of the lag-1 autocovariance of expected EC across users.
autocov_df.expected_EC_autocov_lag_1.describe()

In [None]:
# Summary statistics of the lag-2 autocovariance of expected EC across users.
autocov_df.expected_EC_autocov_lag_2.describe()

In [None]:
# Users whose lag-1 autocovariance of expected EC exceeds 400 in magnitude.
autocov_df[abs(autocov_df.expected_EC_autocov_lag_1) > 400]

In [None]:
# Find out what the trips are like for users with 5000% error. 
# Answer: those users have very few trips.
# percent_error and n_trips_by_user are the per-user lists filled by the per-user loop further up.
plt.scatter(percent_error,n_trips_by_user) 
plt.xlabel("percent error for expected for each user")
plt.ylabel("Number of trips associated with that user")

In [None]:
# Aggregate error of the full dataset relative to its standard deviation.
# The totals are computed from energy_consumption_df here. The globals `expected`, `actual` and
# `predicted` are leftovers from the last cross-validation fold and do not match this sd.
total_expected = sum(energy_consumption_df['expected'])
total_predicted = sum(energy_consumption_df['predicted'])
total_actual = sum(energy_consumption_df['user_labeled'])

sd = np.sqrt(energy_consumption_df.confusion_var.sum())
print(sd)
print((total_expected - total_actual)/sd)
hf.relative_error(total_expected,total_actual)*100, hf.relative_error(total_predicted,total_actual)*100

In [None]:
# Trips of the '4c' program, with the energy consumption columns.
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()

In [None]:
# MobilityNet estimate of P(predicted| actual)
collapsed_confusion_matrix = ios_confusion.copy()
# Uniform prior over the ground-truth modes; it is not used further in this cell.
prior_probs = [1/len(collapsed_confusion_matrix.index)]* len(collapsed_confusion_matrix.index) # later try p_car = 0.4, everthing else is (1-0.4)/(n_non_car)

# Divide every row (indexed by gt_mode) by its row total, so each row sums to 1.
p_predicted_given_actual = collapsed_confusion_matrix.divide(collapsed_confusion_matrix.sum(axis=1), axis='rows')

p_predicted_given_actual

In [None]:
# Display the EI moments derived from the android confusion matrix.
cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(android_confusion,energy_dict)

#### Distribution of predictions given user labeled mode
May want to look at primary mode normalized by distance rather than value counts.

In [None]:
# For each main user-labeled mode, print the share of distance per sensed primary mode over all trips.
all_trip_modes = expanded_labeled_trips[['mode_confirm','section_modes','primary_mode','distance']].copy()

for mode in ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike']:
    mode_df = all_trip_modes[all_trip_modes['mode_confirm'] == mode]

    print(mode)
    #print(mode_df.primary_mode.value_counts(normalize=True)) # prediction percentages by mode count

    # Sum only `distance`: summing every column would also try to add the section_modes lists.
    distance_by_primary_mode = mode_df.groupby('primary_mode')['distance'].sum()
    print(distance_by_primary_mode/mode_df['distance'].sum()) # prediction percentages by distance

### Errors for drove alone and shared ride

In [None]:
# Drove-alone trips of the '4c' program and their outliers in error_for_confusion.
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()
drove_alone_4c_df = program_df[program_df['mode_confirm'] == 'drove_alone']
# NOTE(review): the meaning of the 100 and 15 arguments is defined by hf.get_outliers; a later plot
# title describes this selection as "below the 15th percentile" - confirm.
drove_alone_outliers = hf.get_outliers(drove_alone_4c_df,'error_for_confusion',100,15)[['distance','mode_confirm','section_modes','section_distances','primary_mode','primary_length','error_for_confusion','error_for_prediction','expected','predicted', 'user_labeled','os']]
#drove_alone_outliers

In [None]:
shared_ride_4c_df = program_df[program_df['mode_confirm'] == 'shared_ride']

# Columns kept for both outlier tables.
shared_ride_outlier_columns = ['distance','distance_miles','mode_confirm','section_modes','section_distances','primary_mode','primary_length','error_for_confusion','error_for_prediction','expected','predicted', 'user_labeled','os']
shared_ride_outliers_low = hf.get_outliers(shared_ride_4c_df,'error_for_confusion',100,15)[shared_ride_outlier_columns]
shared_ride_outliers_high = hf.get_outliers(shared_ride_4c_df,'error_for_confusion',85,0)[shared_ride_outlier_columns]

# Left panel: overestimates, right panel: underestimates.
fig,axs = plt.subplots(1,2)
fig.set_figwidth(15)
outlier_panels = [
    (axs[0], shared_ride_outliers_high, "4c shared ride overestimates primary modes (above 85th percentile)"),
    (axs[1], shared_ride_outliers_low, "4c shared ride underestimates primary modes(below 15th percentile)"),
]
for outlier_ax, outlier_df, outlier_title in outlier_panels:
    outlier_df.primary_mode.hist(ax = outlier_ax)
    outlier_ax.set_title(outlier_title)

# most of the overestimates are car. (blue)
# most of the underestimates are walking, bicycling, and no sensed.

In [None]:
# Histogram of trip distance (miles) for the shared ride overestimate outliers.
shared_ride_outliers_high.distance_miles.hist()

In [None]:
# Histogram of the sensed primary mode for the drove alone outlier trips.
drove_alone_outliers.primary_mode.hist()
plt.title("4c drove alone outlier primary modes (below the 15th percentile)")

In [None]:
# Naming convention below: <user label>_<primary mode>
drove_alone_car = drove_alone_4c_df[drove_alone_4c_df.primary_mode == 'car']
shared_ride_car = shared_ride_4c_df[shared_ride_4c_df.primary_mode == 'car']

# Hard-coded energy intensities used in the printed comparisons below.
# NOTE(review): these are copied-in constants; confirm they still match energy_dict / the EI moments dataframes.
EI_used_for_android_sensed_car = 1.189540
EI_used_for_android_walking = 0.010464
EI_for_drove_alone = 1.51517707
EI_for_shared_ride = 0.757588535
# Total distance (converted from meters to miles) of trips sensed as car, per user label.
drove_alone_car_distance = drove_alone_car.distance.sum()*METERS_TO_MILES
shared_ride_car_distance = shared_ride_car.distance.sum()*METERS_TO_MILES

# the outliers below 15% account for -2390 kWh
drove_alone_outlier_error = drove_alone_outliers.error_for_confusion.sum()
shared_ride_outliers_high_error = shared_ride_outliers_high.error_for_confusion.sum()
shared_ride_outliers_low_error = shared_ride_outliers_low.error_for_confusion.sum()

# the drove alone trips in 4c where the primary mode is car account for -1754 kWh of error.
print(f"Errors for drove alone and shared ride when we predict car: {drove_alone_car.error_for_confusion.sum():.2f}, {shared_ride_car.error_for_confusion.sum():.2f}")
print(f"Drove alone outlier errors sum: {drove_alone_outlier_error:.2f}")
print(f"Shared ride outlier error for upper outliers, lower outliers: {shared_ride_outliers_high_error:.2f}, {shared_ride_outliers_low_error:.2f}")

print("\nMost of the outlier error for drove alone is from walking.")
print(f"Difference between sensed walking and drove alone EI: {EI_used_for_android_walking - EI_for_drove_alone:.4f}")

print("\nMost of the overestimation outlier error for shared ride is from sensed car.")
print(f"Difference between sensed car and shared ride EI: {EI_used_for_android_sensed_car - EI_for_shared_ride:.4f}")

print("\nMost of the underestimation outlier error for shared ride is from no_sensed and walking.")
# Unlike the constants above, the no_sensed EI is read from the android EI moments dataframe.
print(f"Difference between no_sensed and shared ride EI: {android_EI_moments_df['mean(EI)']['no_sensed'] - EI_for_shared_ride:.4f}")
print(f"Difference between sensed walking and shared ride EI: {EI_used_for_android_walking - EI_for_shared_ride:.4f}")
print("In either case, when we mispredict drove alone, we are guaranteed to have a higher error than for a similar shared ride trip.")


### Energy consumption estimates by user labeled mode

In [None]:
# Plot energy consumption by mode for the full dataset, labeled 'all'.
hf.plot_energy_consumption_by_mode(energy_consumption_df, program_name= 'all')

### Error magnitudes compared to standard deviations

In [None]:
# How many standard deviations does it take to reach the size of the error?
main_mode_confirms = ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle']
# Signed error expressed in units of the trip's confusion-based standard deviation.
# Note: this adds a column to energy_consumption_df in place.
energy_consumption_df['n_standard_devs'] = energy_consumption_df['error_for_confusion']/energy_consumption_df['confusion_sd']

# get standard deviation outliers?
energy_consumption_df.n_standard_devs.hist(bins=100)

# Subset for the 'pc' program; taken after the column above was added, so it includes n_standard_devs.
pc_df = energy_consumption_df[energy_consumption_df['program']=='pc']

In [None]:
# Trips whose confusion-based error is more than 2 standard deviations away from zero.
high_errors = energy_consumption_df[abs(energy_consumption_df.n_standard_devs) > 2]
# Total distance (miles) of those trips per user-labeled mode.
# Select the column before summing so non-numeric columns are never aggregated, and use
# reindex so a mode with no high-error trips shows up as NaN instead of raising KeyError.
high_errors.groupby('mode_confirm').distance_miles.sum().reindex(main_mode_confirms)

In [None]:
# Distribution of n_standard_devs for trips labeled shared_ride.
energy_consumption_df[energy_consumption_df.mode_confirm== 'shared_ride'].n_standard_devs.hist(bins=100)

In [None]:
# Distribution of n_standard_devs for trips labeled pilot_ebike.
energy_consumption_df[energy_consumption_df.mode_confirm== 'pilot_ebike'].n_standard_devs.hist(bins=100)

In [None]:
# One horizontal bar per 'pc' trip showing its n_standard_devs value.
pc_df.n_standard_devs.plot(kind='barh')

In [None]:
# List the columns currently available in energy_consumption_df.
energy_consumption_df.columns

In [None]:
# Total confusion-based variance per user-labeled mode, and the corresponding standard deviation.
# Only the confusion_var column is aggregated (summing the whole frame would also aggregate
# non-numeric columns); reindex keeps the rows in main_mode_confirms order and yields NaN
# for a mode that is absent instead of raising KeyError.
variances_by_user_mode = energy_consumption_df.groupby("mode_confirm")[['confusion_var']].sum().reindex(main_mode_confirms)
variances_by_user_mode['sd'] = np.sqrt(variances_by_user_mode['confusion_var'])
variances_by_user_mode

### Mode vs distance

In [None]:
# Reload energy_consumption_df from IPython's %store storage (it must have been stored earlier).
%store -r energy_consumption_df
# Check whether any trip is labeled 'not_a_trip' in the restored dataframe.
'not_a_trip' in energy_consumption_df.mode_confirm.unique()

In [None]:
# Compute the covariance between energy intensity and trip length.
# Note: the three assignments below add columns to expanded_labeled_trips in place.
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips.distance*METERS_TO_MILES
# Map the user label to the mode name used as a key in energy_dict, then to its energy intensity.
expanded_labeled_trips['EI_friendly_mode'] = expanded_labeled_trips.mode_confirm.map(MODE_MAPPING_DICT)
expanded_labeled_trips['EI'] = expanded_labeled_trips.EI_friendly_mode.map(energy_dict)
# Drop trips without a user label.
# NOTE(review): labels missing from MODE_MAPPING_DICT / energy_dict would still leave NaN in 'EI' - confirm.
no_nan = expanded_labeled_trips[expanded_labeled_trips['mode_confirm'].notna()]
# np.cov treats each row of its input as a variable, hence the transpose; the result is a 2x2 matrix.
np.cov(no_nan[['EI','distance_miles']].transpose())  # = 1105.462 for all CEO

In [None]:
# Scratch calculation with hand-entered numbers.
# NOTE(review): 1105 looks like the covariance noted in the previous cell; the origin of 1742, 54953
# and the 2.5 multiplier is not shown in this notebook section - document or replace with variables.
np.sqrt(1742**2 + 54953*1105)*2.5

In [None]:
# Inspect the 2 x n_trips input that was passed to np.cov.
no_nan[['EI','distance_miles']].transpose()

In [None]:
# Inspect the energy intensity each trip gets from energy_dict (NaN where the mode has no entry).
expanded_labeled_trips.EI_friendly_mode.map(energy_dict)

In [None]:
# Distinct mapped mode names present in the dataset.
expanded_labeled_trips['EI_friendly_mode'].unique()

In [None]:
# Inspect the user-labeled mode column.
expanded_labeled_trips.mode_confirm

In [None]:
# Make boxplots of distance and user labeled mode.
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips.distance*METERS_TO_MILES

# Note: this redefines main_mode_confirms with 'not_a_trip' added, which affects any cell run afterwards.
main_mode_confirms = ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle', 'not_a_trip']
main_modes_only = expanded_labeled_trips[expanded_labeled_trips.mode_confirm.isin(main_mode_confirms)].copy()


fig,ax = plt.subplots(1,1)
fig.set_figwidth(15)
fig.set_figheight(10)
ax.set_ylabel("distance in miles")
# The y axis is clipped at 50 miles, so longer trips are not visible in the plot.
ax.set_ylim([0,50])  # keep in mind that ceo has plenty of outliers above 50 and even above 300 miles
# One box per user-labeled mode, drawn on the axes configured above.
main_modes_only.boxplot(column='distance_miles', by='mode_confirm', ax=ax)

In [None]:
# Print the median trip distance (miles) for every main user-labeled mode.
for mode in main_mode_confirms:
    is_mode = expanded_labeled_trips.mode_confirm == mode
    mode_df = expanded_labeled_trips[is_mode]
    median_distance_miles = mode_df.distance_miles.median()
    print(f"{mode}: {median_distance_miles}")

In [None]:
# Median trip distance over all trips, in the raw 'distance' units (converted with METERS_TO_MILES elsewhere).
expanded_labeled_trips.distance.median()

### Modeling the energy consumption percent error as a function of dataset characteristics
Make sure you've calculated program proportions and energy consumption for the full dataset first.
Before analysis, keep track of whether you dropped the "not a trip" entries in the `helper_functions.drop_unwanted_trips()` call.

In [None]:
# Fit linear models that predict the percent error of the expected energy consumption
# from dataset characteristics, using the folds of a shuffled KFold split as sample datasets.
lasso = linear_model.Lasso(alpha=0.05)
linreg = linear_model.LinearRegression()
ridge = linear_model.Ridge(alpha= 0.05)

# splitting without shuffling leads to some larger car to other ratios
kf = KFold(n_splits=100, shuffle=True, random_state=2)  # some splits might not have any ebike

# Columns that are always present in the result, even if the ratio helper does not return them.
fold_base_columns = ['r', 'drove_alone_distance', 'shared_ride_distance','car_proportion',
                     'ebike_proportion', 'walk_proportion', 'drove_alone_proportion', 'shared_ride_proportion', 'car_to_other',
                     'percent_error','error']

# Collect one record per fold and build the dataframe once at the end.
# (DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.)
fold_records = []
for train_index, test_index in kf.split(energy_consumption_df):
    # The test part of each split is the fold whose ratios and error we record.
    df_subset = energy_consumption_df.iloc[test_index]

    ratios = hf.get_ratios_for_dataset(df_subset)

    user_labeled = df_subset.user_labeled.sum()
    total_error = df_subset.error_for_confusion.sum()
    percent_error = 100*total_error/user_labeled

    ratios.update({'error': total_error, 'percent_error': percent_error})

    fold_records.append(dict(ratios))

fold_proportions_and_errors = pd.DataFrame(fold_records)
# Base columns first, followed by any extra keys returned by the ratio helper.
fold_proportions_and_errors = fold_proportions_and_errors.reindex(
    columns=list(dict.fromkeys(fold_base_columns + list(fold_proportions_and_errors.columns)))
)

features = ['r','non_moto_to_moto','car_to_other', 'walk_proportion', 'drove_alone_proportion']
X = fold_proportions_and_errors[features]
y = fold_proportions_and_errors['percent_error']

lin_reg_fit = linreg.fit(X,y)
print("Linear regression model coefficients:")
print({label:coef for label, coef in zip(features,lin_reg_fit.coef_)})

ridge_fit = ridge.fit(X,y)
print("Ridge regression model coefficients:")
print({label:coef for label, coef in zip(features,ridge_fit.coef_)})

lasso_fit = lasso.fit(X,y)
print("LASSO model coefficients:")
print({label:coef for label, coef in zip(features,lasso_fit.coef_)})

# Scatter plot of percent error against each feature of interest, one panel per feature.
features_of_interest = ['r','non_moto_to_moto','car_to_other']
n_features = len(features_of_interest)
fig, axs = plt.subplots(nrows=n_features,ncols=1)
fig.set_figheight(5*n_features)
fig.set_figwidth(8)

for feature_ax, feature in zip(axs, features_of_interest):
    feature_ax.scatter(fold_proportions_and_errors[feature],fold_proportions_and_errors['percent_error'])
    feature_ax.set_xlabel(feature)
    feature_ax.set_ylabel("Percent error for expected")

In [None]:
# Display the predicted values and actual values for the non cross validated models produced above.
features_of_interest = ['r','non_moto_to_moto','car_to_other']
# The models in the previous cell were fit on `features`, so the prediction inputs must use
# `features` as well; `features_of_interest` is not used in this cell.
program_features = program_proportions[features]

# Calculate percent errors for each program.
program_percent_error_map = {}
for program in energy_consumption_df['program'].unique():
    program_df = energy_consumption_df[energy_consumption_df['program'] == program].copy()
    percent_error_expected = hf.relative_error(sum(program_df['expected']),sum(program_df['user_labeled']))*100
    program_percent_error_map[program] = percent_error_expected
# Extra entry for the full dataset.
program_percent_error_map['all'] = hf.relative_error(sum(energy_consumption_df['expected']),sum(energy_consumption_df['user_labeled']))*100

# Calculate the predictions for each program.
ridge_predictions = ridge_fit.predict(program_features)
lasso_predictions = lasso_fit.predict(program_features)
LR_predictions = lin_reg_fit.predict(program_features)

# Observed percent errors, in the same order as the rows of program_features.
program_percent_error_list = [program_percent_error_map[x] for x in program_features.index]

# Residuals are observed minus predicted (list minus numpy array, evaluated elementwise by numpy).
error_model_df = pd.DataFrame({"program": program_features.index, "linear regression": LR_predictions, "ridge": ridge_predictions, "LASSO": lasso_predictions, 
            "observed percent error": program_percent_error_list,
            "linreg residuals": program_percent_error_list - LR_predictions,
            "ridge residuals": program_percent_error_list - ridge_predictions,
            "lasso residuals": program_percent_error_list - lasso_predictions})
print(error_model_df.round(3).to_markdown()) 

In [None]:
# Percent error of each fold against its non_moto_to_moto ratio, with a least-squares line.
x_vals, y_vals = fold_proportions_and_errors['non_moto_to_moto'],fold_proportions_and_errors['percent_error']

# Degree-1 polynomial fit: a is the slope, b the intercept.
a, b = np.polyfit(x_vals, y_vals, 1)

plt.scatter(x_vals, y_vals)

plt.plot(x_vals, a*x_vals+b)

In [None]:
# Try the naturalistic splits validation.
def build_percent_error_models(training_set_df, n_splits, features):
    """Fit percent-error models on folds of a training dataset.

    The training set is split into `n_splits` shuffled folds. For each fold the dataset
    ratios (from `hf.get_ratios_for_dataset`) and the percent error of the expected energy
    consumption are computed; the folds then serve as the samples for three regressions.

    Parameters
    ----------
    training_set_df : pandas.DataFrame
        Trips to build the folds from. Must contain the `user_labeled` and
        `error_for_confusion` columns plus whatever `hf.get_ratios_for_dataset` reads.
    n_splits : int
        Number of folds (samples for the regressions).
    features : list of str
        Names of the ratio columns used as regression inputs.

    Returns
    -------
    dict
        {"LR": ..., "Ridge": ..., "LASSO": ...} holding the fitted estimators.

    Notes
    -----
    The estimators are the notebook-level `linreg`, `ridge` and `lasso` objects, which are
    refit in place. The returned objects are therefore overwritten by the next call; use
    them before calling this function again.
    """
    # The test set in each KFold.split is the fold of interest.
    kf = KFold(n_splits, shuffle=True, random_state=2)

    # Columns that are always present in the per-fold table.
    base_columns = ['r', 'drove_alone_distance', 'shared_ride_distance','car_proportion',
                    'ebike_proportion', 'walk_proportion', 'drove_alone_proportion', 'shared_ride_proportion', 'car_to_other',
                    'percent_error','error']

    # One record per fold; the dataframe is built once after the loop.
    # (DataFrame.append was removed in pandas 2.0 and is quadratic when used in a loop.)
    fold_records = []
    for _, test_index in kf.split(training_set_df):
        # The indices are positions within training_set_df, so they must be applied to
        # training_set_df. Indexing the full notebook-level dataframe instead would pull in
        # rows from the data that the caller deliberately left out of the training set.
        df_subset = training_set_df.iloc[test_index]

        ratios = hf.get_ratios_for_dataset(df_subset)

        user_labeled = df_subset.user_labeled.sum()
        total_error = df_subset.error_for_confusion.sum()
        percent_error = 100*total_error/user_labeled

        ratios.update({'error': total_error, 'percent_error': percent_error})

        fold_records.append(dict(ratios))

    fold_proportions_and_errors = pd.DataFrame(fold_records)
    # Base columns first, followed by any extra keys returned by the ratio helper.
    fold_proportions_and_errors = fold_proportions_and_errors.reindex(
        columns=list(dict.fromkeys(base_columns + list(fold_proportions_and_errors.columns)))
    )

    X = fold_proportions_and_errors[features]
    y = fold_proportions_and_errors['percent_error']

    lin_reg_fit = linreg.fit(X,y)
    print("Linear regression model coefficients:")
    print({label:coef for label, coef in zip(features,lin_reg_fit.coef_)})

    ridge_fit = ridge.fit(X,y)
    print("Ridge regression model coefficients:")
    print({label:coef for label, coef in zip(features,ridge_fit.coef_)})

    lasso_fit = lasso.fit(X,y)
    print("LASSO model coefficients:")
    print({label:coef for label, coef in zip(features,lasso_fit.coef_)})

    return {"LR":lin_reg_fit, "Ridge": ridge_fit, "LASSO": lasso_fit}

#########
# Leave-one-program-out validation: for each program, fit the models on all other programs
# and predict the held-out program's percent error from its dataset ratios.
features_of_interest = ['r','non_moto_to_moto','car_to_other']
program_features = program_proportions[features_of_interest]

lasso = linear_model.Lasso(alpha=0.05)
linreg = linear_model.LinearRegression()
ridge = linear_model.Ridge(alpha= 0.05)

prediction_columns = ['program','LR ppe','Ridge ppe', 'LASSO ppe']#, 'observed percent error']
# One record per program; the dataframe is built once after the loop
# (DataFrame.append was removed in pandas 2.0).
prediction_records = []

for program in energy_consumption_df.program.unique():

    # The training set is all data excluding the current program. 
    # I calculated relevant info from the program/test set in the cell that generates program_proportions. 
    training_set = energy_consumption_df[energy_consumption_df.program != program].copy()

    print(program)
    error_models = build_percent_error_models(training_set, n_splits=100, features= features_of_interest)

    # Find the appropriate index to look for within the model predicted values
    program_index = list(program_features.index).index(program)

    # Predict once per model and keep only the row of the held-out program.
    # The predictions must be taken now: the estimators are refit on the next iteration.
    # To predict for 1 program only, could use np.dot(program_features.loc[program],error_models["LR"].coef_) + error_models["LR"].intercept_
    lr_prediction = error_models["LR"].predict(program_features)[program_index]
    ridge_prediction = error_models["Ridge"].predict(program_features)[program_index]
    lasso_prediction = error_models["LASSO"].predict(program_features)[program_index]

    print(lr_prediction)

    prediction_records.append(
        {   "program": program,
            "LR ppe": lr_prediction,
            "Ridge ppe": ridge_prediction,
            "LASSO ppe": lasso_prediction
        }
    )

model_predictions_df = pd.DataFrame(prediction_records, columns=prediction_columns)

In [None]:
# Display the model predictions for each program after training on the rest of the data.

# Observed percent error of the expected energy consumption, per program and for the full dataset.
program_percent_error_map = {}
for program in energy_consumption_df['program'].unique():
    program_df = energy_consumption_df[energy_consumption_df['program'] == program].copy()
    percent_error_expected = hf.relative_error(sum(program_df['expected']),sum(program_df['user_labeled']))*100
    program_percent_error_map[program] = percent_error_expected
program_percent_error_map['all'] = hf.relative_error(sum(energy_consumption_df['expected']),sum(energy_consumption_df['user_labeled']))*100

# Note: these assignments add columns to model_predictions_df in place.
# Residuals are observed minus predicted.
model_predictions_df['observed percent error'] = model_predictions_df.program.map(program_percent_error_map)
model_predictions_df["linreg residuals"] =  model_predictions_df['observed percent error'] - model_predictions_df['LR ppe']
model_predictions_df["ridge residuals"] =  model_predictions_df['observed percent error'] - model_predictions_df['Ridge ppe']
model_predictions_df["LASSO residuals"] =  model_predictions_df['observed percent error'] - model_predictions_df['LASSO ppe']

print(model_predictions_df.round(3).to_markdown()) 

In [None]:
# Observed percent error of the expected energy consumption, per program, plus an 'all' entry
# for the full dataset. The same computation also appears in earlier cells of this section.
program_percent_error_map = {}
for program in energy_consumption_df['program'].unique():
    program_df = energy_consumption_df[energy_consumption_df['program'] == program].copy()
    percent_error_expected = hf.relative_error(sum(program_df['expected']),sum(program_df['user_labeled']))*100
    program_percent_error_map[program] = percent_error_expected
program_percent_error_map['all'] = hf.relative_error(sum(energy_consumption_df['expected']),sum(energy_consumption_df['user_labeled']))*100

program_percent_error_map

### Plot energy consumption by user labeled mode

In [None]:
# Energy consumption by user-labeled mode: first for the '4c' program, then for the full dataset.
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()

hf.plot_energy_consumption_by_mode(program_df,'4c')
hf.plot_energy_consumption_by_mode(energy_consumption_df,'all CEO')

In [None]:
# One horizontal bar per '4c' drove_alone trip showing its confusion-based error.
program_df = energy_consumption_df[energy_consumption_df['program'] == '4c'].copy()
mode_df = program_df[program_df['mode_confirm'] == 'drove_alone']
mode_df.error_for_confusion.plot(kind="barh")

In [None]:
# Same plot for shared_ride trips; relies on program_df from the previous cell.
mode_df = program_df[program_df['mode_confirm'] == 'shared_ride']
mode_df.error_for_confusion.plot(kind="barh")

In [None]:
def plot_error_by_primary_mode(df,chosen_program, r_for_dataset, r, percent_error_expected,percent_error_predicted, mean_EC_all_user_labeled, output_path):
    """Save a two-panel bar chart of total energy consumption error per sensed primary mode.

    The left panel shows sum(expected) - sum(user_labeled) per primary mode, the right panel
    sum(predicted) - sum(user_labeled); both panels also contain a 'Total' bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with `primary_mode`, `user_labeled`, `expected` and `predicted` columns.
    chosen_program : str
        Program name; used in the figure title and the output file name.
    r_for_dataset, r : float
        Values shown in the title as "Dataset r" and "used r".
    percent_error_expected, percent_error_predicted : float
        Percent errors shown in the title.
    mean_EC_all_user_labeled : float
        Value shown in the title as "user labeled EC".
        NOTE(review): callers in this notebook pass a sum, not a mean.
    output_path : str
        Prefix of the output file; the file name is appended by plain string concatenation,
        so a directory path must end with a path separator.

    Returns
    -------
    None. The figure is written to disk and closed.
    """
    from pathlib import Path  # local import: only needed to create the output directory

    # Plot error totals by mode:
    mode_expected_errors = {}
    mode_predicted_errors = {}

    for mode in df.primary_mode.unique():
        # Skip float entries, i.e. NaN for trips without a primary mode.
        if isinstance(mode, float): continue
        # Filter once per mode instead of once per column.
        mode_trips = df[df.primary_mode == mode]
        user_labeled_total = sum(mode_trips['user_labeled'])

        mode_expected_errors[mode] = sum(mode_trips['expected']) - user_labeled_total
        mode_predicted_errors[mode] = sum(mode_trips['predicted']) - user_labeled_total

    mode_expected_errors['Total'] = sum(mode_expected_errors.values())
    mode_predicted_errors['Total'] = sum(mode_predicted_errors.values())
    all_modes = list(mode_expected_errors.keys())

    fig,axs = plt.subplots(1,2)
    fig.set_figwidth(15)
    fig.set_figheight(8)

    title = f"Total energy consumption errors by mode for {chosen_program}. Dataset r = {r_for_dataset:.2f}, used r = {r:.2f}, percent errors: expected: {percent_error_expected:.2f} predicted: {percent_error_predicted:.2f}\
    \nuser labeled EC: {mean_EC_all_user_labeled:.2f}"
    fig.suptitle(title)

    axs[0].grid(axis='x')
    axs[1].grid(axis='x')

    axs[0].barh(all_modes,[mode_expected_errors[x] for x in all_modes],height=0.5)
    axs[0].set_title("Confusion based error share by primary mode")
    axs[1].barh(all_modes,[mode_predicted_errors[x] for x in all_modes],height=0.5)
    axs[1].set_title("Prediction error share by primary mode")

    fig_file = output_path+chosen_program+"_EC_primary_mode_total_errors_"+"Mobilitynet_precision"+"r_from_dataset"+"keep_outliers.png"
    try:
        # savefig does not create missing directories, so make sure the target directory exists.
        Path(fig_file).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(fig_file)
    finally:
        # Always release the figure, even if saving failed, so figures do not pile up in memory.
        plt.close(fig)

In [None]:
# plot the error share by primary mode for each program
# NOTE(review): `r` is not defined in this cell; it is read from the notebook's global state.
for program in energy_consumption_df['program'].unique():
    program_df = energy_consumption_df[energy_consumption_df['program'] == program].copy()
    # Note: this overwrites the notebook-level chosen_program.
    chosen_program = program
    r_for_dataset = program_proportions.loc[program]['r']
    percent_error_expected = hf.relative_error(sum(program_df['expected']),sum(program_df['user_labeled']))*100
    percent_error_predicted = hf.relative_error(sum(program_df['predicted']),sum(program_df['user_labeled']))*100
    # Despite the name, this is the total (sum) of the user labeled energy consumption.
    mean_EC_all_user_labeled = sum(program_df['user_labeled'])
    # Machine-specific absolute path; one output directory per program.
    output_path = "/Users/mallen2/OpenPATH_Data/Sensing_sensitivity_analysis/"+chosen_program+"_"+"mode_error_share"+"/"
    plot_error_by_primary_mode(program_df,chosen_program, r_for_dataset, r, percent_error_expected,percent_error_predicted, mean_EC_all_user_labeled, output_path)
    

In [None]:
# Make the primary mode error plot for the full dataset.
# NOTE(review): `r` is not defined in this cell; it is read from the notebook's global state.
program_df = energy_consumption_df.copy()
chosen_program = 'all'
r_for_dataset = program_proportions.loc[chosen_program]['r']
percent_error_expected = hf.relative_error(sum(program_df['expected']),sum(program_df['user_labeled']))*100
percent_error_predicted = hf.relative_error(sum(program_df['predicted']),sum(program_df['user_labeled']))*100
# Despite the name, this is the total (sum) of the user labeled energy consumption.
mean_EC_all_user_labeled = sum(program_df['user_labeled'])
# Machine-specific absolute path.
output_path = "/Users/mallen2/OpenPATH_Data/Sensing_sensitivity_analysis/"+chosen_program+"_"+"mode_error_share"+"/"
plot_error_by_primary_mode(program_df,chosen_program, r_for_dataset, r, percent_error_expected,percent_error_predicted, mean_EC_all_user_labeled, output_path)

In [None]:
# 1) get set of trips for which user mode = drove alone, primary mode = car for 4c
# 2) calculate error.
# 3) compare with the full error for user mode = drove alone.
program_df = expanded_labeled_trips[expanded_labeled_trips['program'] == '4c'].copy()
# Remove the trips labeled 'air' before computing energy consumption.
program_df = program_df.drop(
    program_df[program_df.mode_confirm == 'air'].index
    )

# Recompute the energy consumption values for the 4c trips with the current energy_dict and EI moments.
EC_4c = get_EC.compute_all_EC_values(program_df,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)

for mode in ['shared_ride','drove_alone']:
    mode_df = EC_4c[EC_4c.mode_confirm == mode].copy()

    # Get the total error for all the trips for which we predicted car and the actual mode was <mode>
    error_for_expected = sum(mode_df[mode_df.primary_mode == 'car']['error_for_confusion'])
    print(mode,error_for_expected)

    # Values recorded from an earlier run:
    # this implies that the large difference in error magnitude for 4c between shared ride and drove alone is not from load factor.
    # shared_ride 1491.5667796053235
    # drove_alone -1924.5901433665767

In [None]:
# Share of trips per user-labeled mode (fractions by trip count).
expanded_labeled_trips.mode_confirm.value_counts(normalize=True)

### Double checking that I calculated the energy consumption error for pc correctly

In [None]:
# Recompute the expected and user labeled energy consumption by hand for a small sample of pc
# trips and compare the totals with get_EC.compute_all_EC_values.

# df here is from expanded_labeled_trips.pickle, generated by place_all_trips_in_pkl.py
pc_trips = df[df['program'] == 'pc'].copy()

# Fixed seed so the same 5 trips are drawn on every run; .copy() so the columns added below
# go into an independent dataframe.
pc_test_sample = pc_trips.sample(n = 5, random_state= np.random.RandomState(1))[['_id','distance','mode_confirm','section_modes','section_distances','os']].copy()

# Get the confusion matrices and then the EI moments from those.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

# Energy intensity used for a sensed car trip, derived from the drove alone EI and r.
r = 1  # 0.91 for vail, 0.71 for pc.
car_EI_load_divider = (r+1)/(r+0.5)  # aka Michael's definition of load factor.
car_load_factor = car_EI_load_divider  # same quantity; both names are kept for other cells
drove_alone_EI = energy_dict["Gas Car, drove alone"]
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)

# Test to see if you calculated expected EC error correctly.
pc_test_sample['EI'] = [energy_dict[MODE_MAPPING_DICT[mode]] for mode in pc_test_sample['mode_confirm']]
pc_test_sample['unit_length_mean'] = [unit_dist_MCS_df[trip_os]["mean"] for trip_os in pc_test_sample['os']]

EI_moments_by_os = {"android": android_EI_moments_df, "ios": ios_EI_moments_df}
confusion_EC_list = []
for _,ct in pc_test_sample.iterrows():
    # Named trip_os rather than os so the standard library module name is not shadowed.
    trip_os = ct['os']
    if trip_os not in EI_moments_by_os:
        # Fail loudly instead of silently reusing the EI of the previous trip.
        raise ValueError(f"Unexpected os {trip_os!r}; expected 'android' or 'ios'.")
    mean_EI_by_mode = EI_moments_by_os[trip_os]["mean(EI)"]  # mean EI given the inferred mode

    EC = 0
    # add each section EC to the EC for the trip.
    for i,mode in enumerate(ct['section_modes']):
        EC += mean_EI_by_mode[mode]*ct['unit_length_mean']*ct['section_distances'][i]*METERS_TO_MILES

    confusion_EC_list.append(EC)


pc_test_sample['confusion_EC'] = confusion_EC_list

pc_test_sample['user_EC'] = pc_test_sample['EI']*pc_test_sample['unit_length_mean']*pc_test_sample['distance']*METERS_TO_MILES

total_expected, total_user_labeled = pc_test_sample['confusion_EC'].sum(), pc_test_sample['user_EC'].sum()

print(f"expected, user labeled, percent error: {total_expected, total_user_labeled, 100*hf.relative_error(total_expected,total_user_labeled)}")

# expected, user labeled, percent error: (25.386297055076234, 21.601249126657052, 17.52235672217788)

all_EC = get_EC.compute_all_EC_values(pc_test_sample,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)
# Compare with a tolerance: the two totals are accumulated in different orders, so exact
# floating point equality can fail even when the calculation is right.
# NOTE(review): these columns are the ones added by hand above; confirm whether the check should
# instead use the columns computed by compute_all_EC_values.
assert np.isclose(sum(all_EC['confusion_EC']), total_expected)
assert np.isclose(sum(all_EC['user_EC']), total_user_labeled)

In [None]:
def write_mongodump_command(text_file_name, db_name, collection, object_id_list, dump_name):
    """Write a docker exec command that mongodumps a list of object ids for the selected collection.

    Parameters
    ----------
    text_file_name : str
        Path of the text file to write; an existing file is overwritten.
    db_name : str
        Name of the docker container the command is executed in.
    collection : str
        Collection of Stage_database to dump.
    object_id_list : sequence
        Object ids to select; each one is converted with str(). An empty sequence produces
        a complete command with an empty "$in" list.
    dump_name : str
        File the generated command redirects the dump archive into.

    Returns
    -------
    None. The command is written to `text_file_name`.
    """
    # when printed to the file, the query should look like this with an oid entry for each object_id_list element:
    #query="{\"_id\": {\"\$in\": [{\"\$oid\":\"<objectId>\"}]} }"' > db_testing.dump
    oid_entries = ",".join(
        f"{{\\\"\\$oid\\\":\\\"{str(object_id)}\\\"}}" for object_id in object_id_list
    )

    # The with block closes the file even if one of the writes fails.
    with open(f"{text_file_name}","w") as query_file:
        query_file.write(f"docker exec {db_name} sh -c 'mongodump --archive --db=Stage_database --collection={collection} --query=\"")
        query_file.write("{\\\"_id\\\":{\\\"\\$in\\\":[")
        query_file.write(oid_entries)
        # The closing part is written unconditionally so the command is terminated for empty lists too.
        query_file.write(f" ] }} }}\"\' > {dump_name}")
# Write the mongodump command for the sampled pc trips (selected by their _id) to pc_trip_query.txt.
write_mongodump_command('pc_trip_query.txt', 'all-ceo-db', 'Stage_analysis_timeseries', list(pc_test_sample['_id']), 'pc_test_trips.dump')

# I would also need to dump the stage uuids into my test database
# #docker exec all-ceo-db sh -c 'mongodump --archive --db=Stage_database --collection=Stage_uuids --query="{}"'  > pc_Stage_uuids.dump

In [None]:
# Look at the percent error for car trips only.
only_car_trips = df[df['mode_confirm'].isin(['drove_alone','shared_ride'])].copy()
only_car_trips = hf.drop_unwanted_trips(only_car_trips) # double check whether you want to include not a trips.

android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

# The following values were found by running sensing sensitivity analysis.py for each program and recording the percent error, 
# as seen in the github issue "Estimate mean and variance of energy consumption"
r_value_map = {"vail": 0.833, "pc": 0.730, "fc": 0.713, "cc": 0.591, "4c": 0.513, "sc": 0.566, "stage": 0.667}

# Energy intensity used for a sensed car trip, derived from the drove alone EI and r.
r = 1  # 0.91 for vail, 0.71 for pc.
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)

# One record per program; the dataframe is built once after the loop
# (DataFrame.append was removed in pandas 2.0).
percent_error_records = []
for program in only_car_trips['program'].unique(): # runs a while if you forget to specify unique
    program_df = only_car_trips[only_car_trips['program'] == program]
    df_with_EC = get_EC.compute_all_EC_values(program_df,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)

    # Sum only the two columns that are needed instead of every column of the dataframe.
    error_and_actual = df_with_EC[['error_for_confusion','user_labeled']].sum()

    percent_error_records.append({
        "program": program,
        # Multiply by 100 so the value really is a percent, like the other percent errors in this notebook.
        'percent_error_for_expected': 100*error_and_actual['error_for_confusion']/error_and_actual['user_labeled'],
        "r_value": r_value_map[program],
    })

percent_error_only_car = pd.DataFrame(percent_error_records, columns= ['program','percent_error_for_expected','r_value'])

In [None]:
# find percent of distance in ebike, after dropping not a trip.
# drop_unwanted_trips lives in helper_functions (imported as hf); the bare name is not defined here.
no_not_a_trips = hf.drop_unwanted_trips(df)
program_ebike_percent_map = {}
for program in no_not_a_trips['program'].unique():
    program_df = no_not_a_trips[no_not_a_trips['program'] == program]
    # Total distance per user-labeled mode, computed once and only for the distance column.
    distance_by_mode = program_df.groupby('mode_confirm')['distance'].sum()

    # get the ebike ratio; a program without any ebike trips gets a ratio of 0 instead of a KeyError
    ebike_distance = distance_by_mode.get('pilot_ebike', 0.0)
    total_distance = distance_by_mode.sum()

    # store it for that program
    program_ebike_percent_map[program] = ebike_distance/total_distance

In [None]:
# plot percent error vs r and ebike ratio
# The following values were found by running sensing sensitivity analysis.py for each program and recording the percent error, 
# as seen in the github issue "Estimate mean and variance of energy consumption"
percent_error_for_expected_map_with_not_a_trip = {"vail": -12.86, "pc": 21.7, "fc": 11.77, "cc": 5.25, "4c": -6.92, "sc": 13.21, "stage": -1.13}
r_value_map = {"vail": 0.833, "pc": 0.730, "fc": 0.713, "cc": 0.591, "4c": 0.513, "sc": 0.566, "stage": 0.667}
percent_error_for_expected_map_without_not_a_trip = {"vail": -15.99, "pc": 11.14, "fc": 2.47, "cc": 4.22, "4c": -7.55, "sc": 12.29, "stage": -1.66}

# Evaluate the program order once so the three coordinate lists are guaranteed to line up.
plotted_programs = list(no_not_a_trips['program'].unique())
ebike_list = [program_ebike_percent_map[program] for program in plotted_programs]
r_list = [r_value_map[program] for program in plotted_programs]
percent_error_list = [percent_error_for_expected_map_without_not_a_trip[program] for program in plotted_programs]

fig = plt.figure()
fig.set_figwidth(10)
fig.set_figheight(10)
ax = fig.add_subplot(projection='3d')
ax.scatter(ebike_list,r_list,percent_error_list)
ax.set_xlabel("ratio of ebike distance to total distance")
ax.set_ylabel("r value")
ax.set_zlabel("percent error for expected")

# Label every point with its program name. (Calling ax.annotate() without arguments raises a
# TypeError; on 3D axes, text placed at data coordinates is the way to label points.)
for point_label, x_pos, y_pos, z_pos in zip(plotted_programs, ebike_list, r_list, percent_error_list):
    ax.text(x_pos, y_pos, z_pos, point_label)

In [None]:
########## (With only car trips) Plot r value vs percent error for each program.
# NOTE(review): percent_error_only_car is built outside this cell; it is read here as a frame with
# 'program', 'r_value' and 'percent_error_for_expected' columns.

plt.scatter(percent_error_only_car['r_value'], percent_error_only_car['percent_error_for_expected'])
plt.xlabel("ratio of drove alone distance to shared ride distance")
plt.ylabel("percent error for expected energy consumption, with only car trips")

# Label each point with its program, nudged right so the text does not sit on the marker.
# The label is passed positionally: the `s=` keyword was renamed to `text=` in matplotlib 3.3 and
# later removed, while the first positional argument works on every version.
for _,p in percent_error_only_car.iterrows():
    plt.annotate(p['program'], xy = (p['r_value'] + 0.01,p['percent_error_for_expected']))

In [None]:
########## Plot r value vs percent error for each program.
# Also plot distance in shared ride vs percent error for each program dataset.
shared_ride_trips = expanded_labeled_trips[expanded_labeled_trips['mode_confirm'] == 'shared_ride'].copy()
# One row per program (the program name is the index); summed columns of the shared ride trips.
program_shared_rides = shared_ride_trips.groupby('program').sum()

# The following values were found by running sensing sensitivity analysis.py for each program and recording the percent error, 
# as seen in the github issue "Estimate mean and variance of energy consumption"
percent_error_for_expected_map_with_not_a_trip = {"vail": -12.86, "pc": 21.7, "fc": 11.77, "cc": 5.25, "4c": -6.92, "sc": 13.21, "stage": -1.13}
r_value_map = {"vail": 0.833, "pc": 0.730, "fc": 0.713, "cc": 0.591, "4c": 0.513, "sc": 0.566, "stage": 0.667}
percent_error_for_expected_map_without_not_a_trip = {"vail": -15.99, "pc": 11.14, "fc": 2.47, "cc": 4.22, "4c": -7.55, "sc": 12.29, "stage": -1.66}


# Attach the recorded percent error and r value to each program row; programs missing from the maps get NaN.
program_shared_rides['percent_error_for_expected_without_not_a_trip'] = program_shared_rides.index.map(percent_error_for_expected_map_without_not_a_trip)
program_shared_rides['r_value'] = program_shared_rides.index.map(r_value_map)

plt.scatter(program_shared_rides['r_value'], program_shared_rides['percent_error_for_expected_without_not_a_trip'])
plt.xlabel("ratio of drove alone distance to shared ride distance")
plt.ylabel("percent error for expected energy consumption, after dropping not a trip")

# Label each point with its program name (the row's index label), offset slightly from the marker.
# The label is passed positionally: the `s=` keyword was renamed to `text=` in matplotlib 3.3 and
# later removed, while the first positional argument works on every version.
for _,p in program_shared_rides.iterrows():
    plt.annotate(p.name, xy = (p['r_value'] + 0.01,p['percent_error_for_expected_without_not_a_trip']-1))

In [None]:
# Disabled code kept as a string literal: the same two scatter plots (r value and shared ride distance
# vs percent error) using the percent errors recorded *with* "not a trip" entries. Nothing in this cell
# executes; the string is only echoed as the cell's output.
'''program_shared_rides['percent_error_for_expected'] = program_shared_rides.index.map(percent_error_for_expected_map_with_not_a_trip)
program_shared_rides['r_value'] = program_shared_rides.index.map(r_value_map)

plt.scatter(program_shared_rides['r_value'], program_shared_rides['percent_error_for_expected'])
plt.xlabel("ratio of drove alone distance to shared ride distance")
plt.ylabel("percent error for expected energy consumption")

for _,p in program_shared_rides.iterrows():
    plt.annotate(s = p.name, xy = (p['r_value'] + 0.01,p['percent_error_for_expected']-1))

plt.scatter(program_shared_rides['distance'], program_shared_rides['percent_error_for_expected'])
plt.xlabel("shared ride distance in dataset (m)")
plt.ylabel("percent error for expected energy consumption")

for _,p in program_shared_rides.iterrows():
    plt.annotate(s = p.name, xy = (p['distance']+10**6,p['percent_error_for_expected']-1))'''

In [None]:
# Add primary mode and length columns to expanded labeled trips.
# The primary mode of a trip is the sensed mode of its longest section.
primary_modes = []
primary_lengths = []
trips_without_sections = []

for i,ct in expanded_labeled_trips.iterrows():
    # Trips with no sensed sections cannot have a primary mode; remember them and drop them after the loop.
    if len(ct["section_distances"]) == 0: # for data up to 5-9-2022, there are 63 stage trips with no sensed sections.
        trips_without_sections.append(i)
        print("dropped")
        continue

    longest_section = max(ct["section_distances"])

    # Collect the mode of every section whose distance equals the longest one.
    # The previous expression, section_modes[section_distances == longest_section], compared a list
    # to a number (always False) and therefore always picked index 0, i.e. the FIRST section's mode.
    tied_modes = [mode for mode, dist in zip(ct["section_modes"], ct["section_distances"]) if dist == longest_section]

    # in case there are ever tied longest sections.
    # pick the most energy intensive mode.
    if len(tied_modes) > 1:
        primary_mode = max(tied_modes, key=lambda mode: energy_dict[MODE_MAPPING_DICT[mode]])
    else:
        primary_mode = tied_modes[0]

    primary_modes.append(primary_mode)
    primary_lengths.append(longest_section)

# Drop the section-less trips in one go so the remaining rows line up with the two lists.
expanded_labeled_trips = expanded_labeled_trips.drop(index = trips_without_sections)

expanded_labeled_trips['primary_mode'] = primary_modes
expanded_labeled_trips['primary_length'] = primary_lengths

In [None]:
#################
#################
#################
# I changed this cell to not use the dataset car precision and use a drove alone divider of 1 
# so I could look at just drove alone/bike/bus/walk trips.

# for each trip, predict energy consumption with either the expectation or the prediction. compare it to the actual energy consumption.
# NOTE(review): energy_dict and unit_dist_MCS_df are not assigned in this cell; they must already exist in the kernel.

#android_EI_moments_df = pd.read_csv("android_EI_moments_corrected_load.csv").set_index("mode")
#ios_EI_moments_df = pd.read_csv("ios_EI_moments_corrected_load.csv").set_index("mode")

print("Computing trip level energy consumptions")
# Only consumed by the commented-out change_precision calls below, so currently unused.
new_car_precision = 0.83  # 0.739 for pc, 0.83 for vail.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

#new_android_cm = cm_handling.change_precision(android_confusion,'car',new_car_precision)
#new_ios_cm = cm_handling.change_precision(ios_confusion,'car',new_car_precision)

# The confusion matrices are used exactly as loaded; the row/column dropping is disabled.
new_android_cm = android_confusion #cm_handling.drop_rows_and_columns(android_confusion,['Train','Pilot ebike','Scooter share'],['subway','train'])
new_ios_cm = ios_confusion #cm_handling.drop_rows_and_columns(ios_confusion,['Train','Pilot ebike','Scooter share'],['subway','train'])

# NOTE(review): r looks like the drove alone : shared ride ratio used to derive an average car load factor -- confirm.
r = 1  # 0.91 for vail, 0.71 for pc.
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
# Mutates the shared energy_dict: the sensed-car intensity is the drove alone intensity divided by the load factor.
energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(new_android_cm,energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(new_ios_cm,energy_dict)

# Per-trip energy consumption under each of the three labelings, in trip order.
expected = []
predicted = []
user_labeled = []

confusion_based_variance = []
user_based_variance = []

# Errors grouped by the user's confirmed mode; only trips with |expected error| < 100 are added (see below).
predicted_dict = {}
expected_dict = {}

# Errors for every trip, large ones included.
expected_error_list = []
prediction_error_list = []

for i,ct in expanded_labeled_trips.iterrows():

    # Calculate expected energy consumption
    trip_expected, trip_confusion_based_variance = get_EC.get_expected_EC_for_one_trip(ct,unit_dist_MCS_df,android_EI_moments_df,ios_EI_moments_df)

    # Calculate predicted energy consumption
    trip_predicted = get_EC.get_predicted_EC_for_one_trip(ct,unit_dist_MCS_df,energy_dict)[0]
    
    # Calculate user labeled energy consumption
    trip_user_labeled, trip_user_based_variance = get_EC.get_user_labeled_EC_for_one_trip(ct,unit_dist_MCS_df,energy_dict)

    expected.append(trip_expected)
    predicted.append(trip_predicted)
    user_labeled.append(trip_user_labeled)

    confusion_based_variance.append(trip_confusion_based_variance)
    user_based_variance.append(trip_user_based_variance)

    user_mode = ct['mode_confirm']
    if user_mode not in predicted_dict: predicted_dict[user_mode] = []
    if user_mode not in expected_dict: expected_dict[user_mode] = []

    # Both errors are measured against the user labeled value.
    prediction_error = trip_predicted - trip_user_labeled
    expected_error = trip_expected - trip_user_labeled

    expected_error_list.append(expected_error)
    prediction_error_list.append(prediction_error)

    # Trips whose expected error is 100 or more in magnitude are reported instead of being added to the per-mode lists.
    if abs(expected_error) < 100: 

        predicted_dict[user_mode].append(prediction_error)
        expected_dict[user_mode].append(expected_error)
    else:
        print(f"Large EC error: EC user labeled, EC expected: {trip_user_labeled, trip_expected}")
        print(f"\tTrip info: mode_confirm,sensed,distance (mi): {ct['mode_confirm'],ct['section_modes'],ct['distance']*METERS_TO_MILES}")

def relative_error(m,t):
    """Return the signed error of m relative to t, i.e. (m - t) / t.

    m is the value being judged and t the reference value it is compared against.
    The result is a fraction; multiply by 100 for a percentage. No guard is applied for t == 0.
    """
    signed_difference = m - t
    return signed_difference/t
# Aggregate the per-trip values over the whole dataset and report them.
total_expected, total_predicted, total_user_labeled = (
    sum(per_trip_values) for per_trip_values in (expected, predicted, user_labeled)
)
print(f"Total EC: expected, predicted, user labeled {total_expected:.2f},{total_predicted:.2f},{total_user_labeled:.2f}")

# The aggregate variance is the sum of the per-trip variances; its square root is the aggregate sd.
aggregate_confusion_variance = sum(confusion_based_variance)
print(f"standard deviation for expected: {np.sqrt(aggregate_confusion_variance):.2f}")

# Error of the expected total relative to the user labeled total, as a percentage.
print(f"Percent error: {relative_error(total_expected,total_user_labeled)*100:.3f}")


In [None]:
# Build the per-trip frame used by the cells below via get_EC.compute_all_EC_values.
# NOTE(review): presumably this adds the 'expected', 'predicted', 'user_labeled', 'error_for_confusion',
# 'error_for_prediction', 'confusion_var', 'user_var' and 'confusion_sd' columns read later -- confirm in get_EC.
elt_with_errors = get_EC.compute_all_EC_values(expanded_labeled_trips,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df)

In [None]:
# Find the trips whose confusion based error lies in either extreme tail
# (below the 0.1 percentile or above the 99.9 percentile) and build a copy without them.

quantiles = [99.9,0.1]
upper, lower = np.percentile(elt_with_errors['error_for_confusion'], quantiles)
print(f"{quantiles[0]} and {quantiles[1]} percentiles: {upper,lower}")

# Despite the "xlow" name, the selection covers BOTH tails.
in_lower_tail = elt_with_errors.error_for_confusion < lower
in_upper_tail = elt_with_errors.error_for_confusion > upper
xlow_outliers = elt_with_errors[in_lower_tail | in_upper_tail]

# Column view of the outliers; not displayed because it is not the cell's last expression.
xlow_outliers[['mode_confirm','distance','error_for_confusion','primary_mode','section_modes','section_distances']]

# Drop the outliers in both tails.
elt_with_errors_outliers_removed = elt_with_errors.drop(xlow_outliers.index)

In [None]:
def plot_error_by_mode(df,chosen_program):
    """Plot the summed energy consumption error per user-confirmed mode as two horizontal bar charts.

    The left panel shows the confusion based (expected) error, the right panel the prediction error.
    Each panel also gets a 'Total' bar, the sum over all plotted modes, and every bar is annotated
    with its value.

    Parameters
    ----------
    df : frame with 'mode_confirm', 'error_for_confusion' and 'error_for_prediction' columns.
    chosen_program : name shown in the figure title.
    """
    # Plot error totals by mode:
    mode_expected_errors = {}
    mode_predicted_errors = {}

    for mode in df.mode_confirm.unique():
        # One free-text label is deliberately left out of the charts.
        if mode == 'combination_football game, dinner, drove friend home': continue

        mode_trips = df[df.mode_confirm == mode]
        mode_expected_errors[mode] = mode_trips['error_for_confusion'].sum()
        mode_predicted_errors[mode] = mode_trips['error_for_prediction'].sum()

    # Each total is computed from its OWN dictionary, before the 'Total' key is inserted into it.
    # Previously the predicted total was summed from the expected dictionary, which by then already
    # contained its own 'Total' entry.
    mode_expected_errors['Total'] = sum(mode_expected_errors.values())
    mode_predicted_errors['Total'] = sum(mode_predicted_errors.values())
    all_modes = list(mode_expected_errors.keys())

    fig,axs = plt.subplots(1,2)
    fig.set_figwidth(20)
    # Figure height grows with the number of bars.
    fig.set_figheight(int(len(all_modes)/4) + 1)
    fig.suptitle(f"Total energy consumption errors by mode for {chosen_program}")

    axs[0].grid(axis='x')
    axs[1].grid(axis='x')

    mode_expected_error_list = [mode_expected_errors[x] for x in all_modes]
    mode_predicted_error_list = [mode_predicted_errors[x] for x in all_modes]
    axs[0].barh(all_modes,mode_expected_error_list)

    # Write each value on the opposite side of zero from its bar.
    for i, v in enumerate(mode_expected_error_list):
        axs[0].text(-np.sign(v)*500, i + 0.5, f"{v:.2f}", color='blue', fontweight='bold')

    axs[0].set_title("Confusion based error share by mode")
    axs[1].barh(all_modes,mode_predicted_error_list)

    # The right panel is annotated with the prediction errors it actually plots
    # (it used to repeat the expected errors).
    for i, v in enumerate(mode_predicted_error_list):
        axs[1].text(-np.sign(v)*50, i, f"{v:.2f}", color='red', fontweight='bold')

    axs[1].set_title("Prediction error share by mode")

# Plot the per-mode error totals for all trips (outliers included). The second argument is only
# used in the figure title.
plot_error_by_mode(elt_with_errors, 'vail')

In [None]:
# Flag the trips whose mode_confirm is a float rather than a string (presumably NaN, i.e. a missing label -- confirm).
elt_with_errors_outliers_removed['isfloat'] = elt_with_errors_outliers_removed['mode_confirm'].map(lambda x: type(x) == float)
nan_mode_confirms = elt_with_errors_outliers_removed[elt_with_errors_outliers_removed.isfloat == True]
# NOTE(review): this rebinds the notebook-level name `df`, which an earlier cell passes to drop_unwanted_trips.
df = nan_mode_confirms[['mode_confirm','section_modes','distance']]

In [None]:
# Replace float (missing) mode_confirm values with the string 'nan' and print the affected trips.
for i,ct in elt_with_errors_outliers_removed.iterrows():
    # iterrows yields a COPY of each row, so ct does not see the .at assignment below; remember the
    # decision in a flag so the trip is printed on the first run as well as on re-runs.
    label_was_missing = type(ct['mode_confirm']) is float
    if label_was_missing:
        elt_with_errors_outliers_removed.at[i,'mode_confirm'] = 'nan'
    if label_was_missing or ct['mode_confirm'] == 'nan':
        print(ct[['mode_confirm','section_modes','distance','expected']])

### Sensitivity Analysis

In [None]:
# Calculate the mean and sd for all user labeled and for all sensed:
mean_EC_all_sensing = sum(elt_with_errors_outliers_removed['expected'])
mean_EC_all_user_labeled = sum(elt_with_errors_outliers_removed['user_labeled'])

# Aggregate sd = sqrt of the summed per-trip variances.
sd_sensed = np.sqrt(sum(elt_with_errors_outliers_removed['confusion_var']))
sd_users = np.sqrt(sum(elt_with_errors_outliers_removed['user_var']))

# Now calculate for various random splits of the data
# 10^3 NMC takes 10 seconds on vail to create all 4 splits.
proportion_sensed = [0.2,0.4,0.6,0.8]
NMC = 100#**2#**3

# Maps each sensed proportion to a frame with one row per random split: aggregate mean, sd and error.
summary_df_map = {}
for ps in proportion_sensed:

    mean_EC_agg = []
    var_EC_agg = []
    error_EC_agg = []
    for j in range(0,NMC):
        # Seeded per iteration, so split j is the same for every proportion and on every re-run.
        rand_state = np.random.RandomState(1+j)

        # Split the labeled trips into a user labeled dataframe and a sensed dataframe
        user_labeled,sensed  = skm.train_test_split(elt_with_errors_outliers_removed ,
                                                    test_size = ps, # sensed
                                                    train_size = 1-ps,  # user_labeled
                                                    random_state= rand_state)
        mean_EC_sensed, var_EC_sensed = sum(sensed['expected']), sum(sensed['confusion_var'])

        mean_EC_user_labeled, var_EC_user_labeled = sum(user_labeled['user_labeled']), sum(user_labeled['user_var'])

        # Get the total mean and variance for the current iteration and add it to a list.
        current_aggregate_EC = mean_EC_sensed + mean_EC_user_labeled
        mean_EC_agg.append(current_aggregate_EC)
        var_EC_agg.append(var_EC_sensed + var_EC_user_labeled)
        # Error relative to labeling every trip by the user.
        error_EC_agg.append(current_aggregate_EC - mean_EC_all_user_labeled)

    # Convert the variances to standard deviations once per proportion; this used to be recomputed
    # over the whole list on every inner iteration.
    sd_EC_agg = np.sqrt(np.array(var_EC_agg))

    summary_df_map[ps] = pd.DataFrame({"mean": mean_EC_agg, "sd": sd_EC_agg, 'error': error_EC_agg})

        # prop var sensed
        # prop var user labeled
# Average the per-split mean and sd for each sensed proportion.
average_summaries = {}
for ps in proportion_sensed:
    average_across_splits_mean = np.mean(summary_df_map[ps]["mean"])
    average_across_splits_sd = np.mean(summary_df_map[ps]["sd"])
    average_summaries[ps] = {"mean": average_across_splits_mean, "sd": average_across_splits_sd}

def get_interval(mean,sd):
    """Return the three points [mean - sd, mean, mean + sd] of a one-standard-deviation interval."""
    lower_bound = mean - sd
    upper_bound = mean + sd
    return [lower_bound, mean, upper_bound]

# Mean +- 1 sd intervals for the two extremes: every trip sensed, and every trip user labeled.
# NOTE(review): the "_vail" suffix is only a name; the values come from whichever dataset is loaded.
interval_sensed_vail = get_interval(mean_EC_all_sensing,sd_sensed)
interval_users_vail = get_interval(mean_EC_all_user_labeled,sd_users)

In [None]:
# Plot mean - sd, mean, mean + sd of the aggregate energy consumption against the proportion of sensed trips.
fig,ax = plt.subplots()
fig.set_figheight(6)

print(f"Prop = {0}: mean, sd: {mean_EC_all_user_labeled,sd_users}")

# Proportion 0: every trip uses the user label.
ax.plot([0]*3,interval_users_vail,'bo') 
# NOTE(review): j is incremented in the loop but never read in this cell.
j = 1
for ps in proportion_sensed:
    summary = average_summaries[ps]

    print(f"Prop = {ps}: mean, sd: {summary['mean'] ,summary['sd']}")
    # Three points stacked at the same x: lower bound, mean, upper bound.
    x = [ps]*3
    y = get_interval(summary["mean"],summary["sd"])
    ax.plot(x,y,'bo')
    j+=1

# Proportion 1: every trip uses sensing.
print(f"Prop = {1}: mean, sd: {mean_EC_all_sensing,sd_sensed}")
ax.plot([1]*3,interval_sensed_vail,'bo')
# NOTE(review): the y-limits are hard-coded; the trailing comment lists ranges used for other runs.
ax.set_ylim([7000,11000])#([7000,11000]) [40000,70000]
ax.set_xlabel("Proportion of trips using sensing as opposed to user labels")
ax.set_ylabel("Energy consumption (kWH)")

# NOTE(review): the title says "PC" while the interval variables are suffixed "_vail" -- confirm which program is plotted.
fig.suptitle("PC energy consumption mean +- 1 sd as percent of sensed trips increases")

In [None]:
# For each sensed proportion, report the share of random splits whose aggregate error
# is smaller in magnitude than z standard deviations.
z = 2
for ps in proportion_sensed:
    split_summary = summary_df_map[ps]
    error_within_z_sd = z*split_summary['sd'] > abs(split_summary['error'])
    n_splits = len(split_summary)
    print(f"proportion sensed = {ps}: {sum(error_within_z_sd)/n_splits}")

In [None]:
# Summary statistics of the per-trip confusion based error, with the outlier trips still included.
elt_with_errors.error_for_confusion.describe()

In [None]:
# Look at which confirmed modes dominate the trips below the 5th percentile of expected error.
quantiles = [90,5]
# NOTE(review): the percentiles come from expected_error_list (built in the per-trip loop cell) while the
# filter below is applied to elt_with_errors.error_for_confusion -- confirm both hold the same errors.
upper, lower = np.percentile(expected_error_list, quantiles)
print(f"{quantiles[0]} and {quantiles[1]} percentiles: {upper,lower}")

# Only the lower bound is applied; `upper` is printed but not used for filtering.
# | (elt_with_errors.errors_from_confusion > upper)
low_outliers = elt_with_errors[(elt_with_errors.error_for_confusion < lower)]
# [["mode_confirm","distance","section_modes","section_distances","errors_from_confusion"]]

# Number of low-error trips per confirmed mode.
low_outliers.mode_confirm.value_counts()

In [None]:
# Overall errors when we have no outliers.
# xlow_outliers holds the trips in both extreme tails of the confusion based error.
remove_low = elt_with_errors.drop(xlow_outliers.index)

# Select the numeric columns BEFORE summing. Summing the whole frame first also "sums" the
# list/string columns (section_modes, mode_confirm, ...), which is slow and can raise on mixed types.
summary = remove_low[['expected','predicted','user_labeled','confusion_sd']].sum()
# Despite the name this is a fraction, not multiplied by 100.
percent_error = (summary['expected'] - summary['user_labeled'])/summary['user_labeled']

totals = summary[['expected','predicted','user_labeled']]

print(totals)
print("\nDifferences (look at the \"user_labeled\" value and compare with sd):")
# diff() gives predicted - expected and user_labeled - predicted.
print(totals.diff())
# Aggregate sd: square root of the summed per-trip variances.
print(f"sd: {np.sqrt(sum(remove_low['confusion_sd']**2))}")
print(f"percent error between expected and user labeled: {percent_error}")

In [None]:
# Per-mode totals of the energy consumption columns, then a closer look at the two car labels.
# numeric_only=True keeps every numeric column but skips the list/string columns, which a plain
# .sum() would try to concatenate (slow, and an error on mixed types in recent pandas).
mode_grouped_values = remove_low.groupby('mode_confirm').sum(numeric_only=True)

car_label_columns = ['expected','predicted','user_labeled','error_for_prediction','error_for_confusion']
# .loc raises KeyError if the dataset has no trips with the given label.
drove_alone = mode_grouped_values.loc['drove_alone'][car_label_columns]
shared_ride = mode_grouped_values.loc['shared_ride'][car_label_columns]
print('Drove alone:')
print(drove_alone)
print('\n')
print('Shared ride:')
print(shared_ride)
print('\nSum:')
print(drove_alone + shared_ride)

In [None]:
# Show the per-trip energy consumption columns side by side.
# NOTE(review): the 'os' column is only added by the later `remove_low['os'] = os` cell, so on a fresh
# kernel this cell raises KeyError unless that cell has been run first.
remove_low[['mode_confirm','section_modes','expected','predicted','user_labeled','error_for_prediction','error_for_confusion','confusion_sd','os','distance']]

In [None]:
# EC errors
program = 'vail'
def plot_error_hists_by_mode(df):
    """Draw, for every user-confirmed mode, histograms of the two per-trip energy consumption errors.

    One row of two panels per mode: the confusion based error on the left and the prediction error
    on the right. The figure title is the notebook-level `program` name.

    Parameters
    ----------
    df : frame with 'mode_confirm', 'error_for_confusion' and 'error_for_prediction' columns.
    """
    skipped_mode = 'combination_football game, dinner, drove friend home'
    # Decide up front which modes get a row, so the skipped label no longer leaves an empty row.
    modes_to_plot = [mode for mode in df.mode_confirm.unique() if mode != skipped_mode]
    n_plots = len(modes_to_plot)
    if n_plots == 0:
        return

    # squeeze=False keeps axs two-dimensional even when only one mode is plotted.
    fig,axs = plt.subplots(n_plots,2, squeeze=False)
    fig.set_figwidth(15)
    fig.set_figheight(4*n_plots)
    fig.suptitle(f"{program}")

    for i, mode in enumerate(modes_to_plot):
        # A missing label (NaN) never compares equal to itself, so select those rows with isna();
        # the equality test used before always produced empty histograms for them.
        if pd.isna(mode):
            mode_rows = df[df.mode_confirm.isna()]
        else:
            mode_rows = df[df.mode_confirm == mode]

        mode_expected_error = mode_rows['error_for_confusion']
        mode_prediction_error = mode_rows['error_for_prediction']

        # str() turns a NaN label into 'nan' and leaves string labels unchanged.
        mode_label = str(mode)
        axs[i,0].hist(mode_expected_error,bins=30)
        axs[i,0].set_xlabel(mode_label + ' EC confusion based error')

        axs[i,1].hist(mode_prediction_error,bins=30)
        axs[i,1].set_xlabel(mode_label + ' EC prediction based error')

# Per-mode error histograms for the trips left after outlier removal; the trailing ';' suppresses the cell's echoed output.
plot_error_hists_by_mode(remove_low);

In [None]:
# Distinct user-confirmed modes among the trips left after outlier removal.
remove_low.mode_confirm.unique()

In [None]:
# Numeric histogram (counts and bin edges, 30 bins) of the confusion based error, without plotting.
np.histogram(remove_low['error_for_confusion'],bins=30)

In [None]:
# Collect the operating system (the profile's "curr_platform") of the user behind every trip.
# The profile is fetched once per user and reused for all of that user's trips, instead of
# fetching it again for every single trip.
platform_by_user = {}
# NOTE: the list name `os` is kept because the next cell reads it.
os = []
for _,ct in remove_low.iterrows():
    if ct.user_id not in platform_by_user:
        platform_by_user[ct.user_id] = ecwu.User(ct.user_id).getProfile()["curr_platform"]
    os.append(platform_by_user[ct.user_id])

In [None]:
# Attach the per-trip operating system list built in the previous cell as a column (one entry per row, in row order).
remove_low['os'] = os

In [None]:
# Estimate how much energy consumption is added if every section of a primary-car, drove-alone
# trip is counted as car.
mean_EC_increase_if_primary_car = 0
distance_sum = 0

primary_mode_EC = 0
for _,ct in remove_low.iterrows():

    # Look only at the trips where primary mode was car and mode confirm was drove alone.
    # ignore the two outlier drove alone trips below the 0.1 percentile.
    if  (ct['primary_mode'] == 'car') & (ct['mode_confirm']=='drove_alone'):
        # Get operating system
        # NOTE(review): this rebinds `os` from the per-trip list built earlier to a single string, so
        # re-running the `remove_low['os'] = os` cell after this one would write one value to every row.
        u = ecwu.User(ct.user_id)
        os = u.getProfile()["curr_platform"]

        # Get OS specific trip length info.
        mean_for_unit_L = unit_dist_MCS_df[os]["mean"]

        section_modes = ct["section_modes"]
        n_sections = len(section_modes)
        sections_lengths = np.array(ct["section_distances"])*METERS_TO_MILES   # 1 meter = 0.000621371 miles

        # Section lengths in miles, scaled by the OS specific "mean" value read above.
        mean_L = sections_lengths*mean_for_unit_L
            
        non_car_dist = 0
        for s in range(0,n_sections):
            # EI mean and variance.

            # Add up the non car section energy consumptions as if they were car.
            if section_modes[s]== 'car': continue
            non_car_dist += mean_L[s]

        distance_sum += non_car_dist

        # var_EI is returned but not used below.
        mean_EI, var_EI = get_EC.get_EI_moments_for_trip('car',os,android_EI_moments_df,ios_EI_moments_df)
        # Energy the non-car sections would consume at the car energy intensity (no variance is propagated here).
        mean_EC = non_car_dist*mean_EI

        # Energy of the whole trip if every section is treated as car.
        primary_mode_EC += sum(mean_L)*mean_EI

        # Add to total - follows from assumed independence of section errors.
        mean_EC_increase_if_primary_car += mean_EC

    

# Displayed: the added energy, the all-car energy, and the non-car distance over the selected trips.
mean_EC_increase_if_primary_car, primary_mode_EC, distance_sum

In [None]:
# Trips whose sensed primary mode is car ...
primary_car = remove_low[remove_low['primary_mode'] == 'car']
# ... and, among those, the ones the user confirmed as drove alone. The filter is applied to
# primary_car; it used to be applied to remove_low, which ignored the primary-mode condition
# the variable name promises.
primary_car_drove_alone = primary_car[primary_car['mode_confirm'] == 'drove_alone']

# Number of non-null entries per column, split by operating system.
primary_car_drove_alone.groupby(primary_car_drove_alone.os).count()

In [None]:
# Number of trips per user-confirmed mode after outlier removal.
remove_low['mode_confirm'].value_counts()

In [None]:
# Distinct user-confirmed modes in the full frame (outliers included).
elt_with_errors.mode_confirm.unique()

In [None]:
# Find the distance in each mode.
mode_distance_df = remove_low[['mode_confirm','distance']].groupby("mode_confirm").sum()
# Distance labeled car or drove_alone minus distance labeled shared_ride.
# .loc raises KeyError if any of the three labels is absent from the dataset.
mode_distance_df.loc['car'] + mode_distance_df.loc['drove_alone'] - mode_distance_df.loc["shared_ride"]

# More distance in shared ride than in drove alone!

#### Load factor estimate

#### Find how our estimate changes when we have a different assumed ratio of drove alone to shared ride

In [None]:
# Compare the energy estimate that uses each car label's own load factor with the estimate that
# uses one averaged load factor for both labels.
x = 1 # drove_alone_load_factor
y = 2 # shared_ride_load_factor
# NOTE(review): r is assigned but only referenced in the commented-out alternative for `avg` below;
# it also rebinds the notebook-level `r` set in an earlier cell.
r = 0.91

# VMT
# NOTE(review): drove_alone_dist and shared_ride_dist are not assigned in this cell; the 1.04 factor is
# unexplained here -- confirm its meaning.
v_miles_in_drove_alone = drove_alone_dist*METERS_TO_MILES*1.04 #9000 #   # drove alone mean distance in miles.  
v_miles_in_shared_ride = shared_ride_dist*METERS_TO_MILES*1.04 #11000 #   # shared ride mean distance in miles.   # 100 -> off by 20, 1000 -> off by 200
avg = (x+y)/2#(r+1)/(r+0.5)#(x+y)/2

drove_alone_EI = 1.51517707
# shared_ride_EI is not used below; shared rides are computed as drove_alone_EI divided by y.
shared_ride_EI = 0.757588535

no_average = 1/x*drove_alone_EI*v_miles_in_drove_alone + 1/y*drove_alone_EI*v_miles_in_shared_ride
with_average = 1/avg*drove_alone_EI*v_miles_in_drove_alone + 1/avg*drove_alone_EI*v_miles_in_shared_ride


# without average means that we use the correct energy intensity of each mode. For "with average", we use the average value for both.
print(f"drove_alone without average, with average: {1/x*drove_alone_EI*v_miles_in_drove_alone,1/avg*drove_alone_EI*v_miles_in_drove_alone}")   # with avg underestimates
print(f"shared_ride without average, with average: {1/y*drove_alone_EI*v_miles_in_shared_ride,1/avg*drove_alone_EI*v_miles_in_shared_ride}")   # with avg overestimates
# Difference between the two totals.
print(no_average-with_average)

#### Find the precision for car based on user labels in the current dataset.

In [None]:
# A look at what the car primary mode trips are like.
# Number of trips per sensed primary mode.
sensed_mode_dict = {}
# Trips sensed as primary car that the user also labeled with a car mode.
car_user_sensing_match = 0

car_walking_cases = []
car_trip_n_sections = []
car_biking_cases = []

walks = []
# Trips sensed as primary car, whatever the user label.
cars = 0

multi_section_cars = []

# get all cases where car is not the only section.

for _,ct in expanded_labeled_trips.iterrows():
    longest_section = max(ct["section_distances"])

    # Modes of every section whose distance equals the longest one.
    # The previous expression, section_modes[section_distances == longest_section], compared a list
    # to a number (always False) and therefore always picked the FIRST section's mode.
    tied_modes = [mode for mode, dist in zip(ct["section_modes"], ct["section_distances"]) if dist == longest_section]

    # in case there are ever tied longest sections.
    # pick the most energy intensive mode.
    if len(tied_modes) > 1:
        primary_mode = max(tied_modes, key=lambda mode: energy_dict[MODE_MAPPING_DICT[mode]])
    else:
        primary_mode = tied_modes[0]

    if primary_mode == 'car': cars += 1

    if primary_mode == "car" and ct["mode_confirm"] in ["drove_alone","shared_ride","car"]:

        modes = ct["section_modes"]
        dists = ct["section_distances"]

        if len(modes) > 1:
            multi_section_cars.append(ct)

        if ct["section_modes"] == ['car', 'walking']:
            car_walking_cases.append(ct["section_distances"])
        if ct["section_modes"] == ['car', 'bicycling']:
            car_biking_cases.append(ct["section_distances"])
        # NOTE(review): this tests whether the LIST ['car','walking'] is an element of section_modes,
        # which cannot be true for a list of mode strings -- confirm the intended check.
        elif ['car','walking'] in ct["section_modes"]:
            print(ct["section_modes"])

        #print(ct["section_modes"])
        #print(ct["section_distances"])
        car_user_sensing_match +=1

        car_trip_n_sections.append(len(ct["section_distances"]))


    # Count the trip under its primary mode. The first trip of a mode used to be counted twice
    # (initialised to 1 and then incremented).
    sensed_mode_dict[primary_mode] = sensed_mode_dict.get(primary_mode, 0) + 1

# Calculate precision for car.
car_precision = car_user_sensing_match/cars  #P(userlabel = car| predict car) = P(predict and ground truth car)/P(predict car)
# NOTE(review): the figures below were recorded with the earlier primary-mode selection; re-check them.
print(car_precision)   # 83% for vail, 73.9% for pueblo county

In [None]:
### Calculate car precision for this dataset.
# Precision = trips sensed as primary car AND labeled with a car mode / trips sensed as primary car.
car_user_sensing_match = 0
primary_cars = 0

for _,ct in expanded_labeled_trips.iterrows():
    longest_section = max(ct["section_distances"])

    # Modes of every section whose distance equals the longest one.
    # The previous expression, section_modes[section_distances == longest_section], compared a list
    # to a number (always False) and therefore always picked the FIRST section's mode.
    tied_modes = [mode for mode, dist in zip(ct["section_modes"], ct["section_distances"]) if dist == longest_section]

    # in case there are ever tied longest sections.
    # pick the most energy intensive mode.
    if len(tied_modes) > 1:
        primary_mode = max(tied_modes, key=lambda mode: energy_dict[MODE_MAPPING_DICT[mode]])
    else:
        primary_mode = tied_modes[0]

    if primary_mode == 'car': primary_cars += 1

    if primary_mode == "car" and ct["mode_confirm"] in ["drove_alone","shared_ride","car"]:
        car_user_sensing_match +=1

# Calculate precision for car.
car_precision = car_user_sensing_match/primary_cars  #P(userlabel = car| predict car) = P(predict and ground truth car)/P(predict car)
# NOTE(review): the figures below were recorded with the earlier primary-mode selection; re-check them.
print(car_precision)   # 83% for vail, 73.9% for pueblo county

In [None]:
# Display the car precision computed in the cells above.
car_precision

In [None]:
# Sensed mode name that each user-confirmed label corresponds to.
user_label_to_sensing_map = dict(
    drove_alone='car',
    shared_ride='car',
    pilot_ebike='bicycling',
    walk='walking',
    bus='bus',
    not_a_trip='no_sensed',
    car='car',
    taxi='car',
    bike='bicycling',
    train='train',
    subway='subway',
    air='air_or_hsr',
)

# Sensed mode name for each integer mode code 0-9; the list position is the code.
gis_sensed_modes = dict(enumerate([
    'no_sensed',    # 0: UNKNOWN  #NOTE: this is important info to mention.
    'walking',      # 1: WALKING
    'bicycling',    # 2: BICYCLING
    'bus',          # 3: BUS
    'train',        # 4: TRAIN
    'car',          # 5: CAR
    'air_or_hsr',   # 6: AIR_OR_HSR
    'subway',       # 7: SUBWAY
    'train',        # 8: TRAM
    'train',        # 9: LIGHT_RAIL
]))

### Try to find the confusion matrix for the current dataset

In [None]:
# Confusion matrix of user-confirmed mode (rows) against sensed primary mode (columns).
import sklearn.metrics as sklearn_metrics

# NOTE(review): dropna() without a subset drops a trip when ANY column is missing, not just the two
# label columns used below -- confirm that is intended.
confusion_ready = expanded_labeled_trips.dropna()
user_labels = confusion_ready['mode_confirm']
sensed_labels = confusion_ready['primary_mode']
# No `labels` argument is passed, so the row/column order is chosen by scikit-learn.
cm_user_v_sensed = sklearn_metrics.confusion_matrix(user_labels,sensed_labels)

In [None]:
# Dimensions of the confusion matrix computed above.
cm_user_v_sensed.shape

In [None]:
# Check whether mode_confirm holds non-string (float) entries.
mc_types = [type(x) for x in expanded_labeled_trips['mode_confirm']]
# The position comes from expanded_labeled_trips, so it must index that same frame. It used to
# index confusion_ready, which has rows removed by dropna() and is therefore not aligned by position.
# .index(float) raises ValueError when there is no float entry at all.
type(expanded_labeled_trips['mode_confirm'].iloc[mc_types.index(float)])

# so mode confirm has an nan.

In [None]:
# Compare the sensed primary mode with the user label for the trips confirmed as electric_vehicle.
sensed_user = expanded_labeled_trips[['primary_mode','mode_confirm']]
sensed_user[sensed_user['mode_confirm']== 'electric_vehicle']  # sensed as walking!

In [None]:
# Number of trips per user-confirmed mode in the full labeled dataset.
expanded_labeled_trips['mode_confirm'].value_counts()

In [None]:
# Verify that section sums are equal to trip lengths:
# histogram of (trip distance - sum of its section distances) over all trips.
length_diffs = [
    ct["distance"] - sum(ct["section_distances"])
    for _,ct in expanded_labeled_trips.iterrows()
]
    #np.testing.assert_approx_equal(section_sum,ct['distance'],5)
plt.hist(length_diffs)

In [None]:
# Scratch check of the tie-break rule: of two tied modes, keep the one with the larger energy_dict value.
# NOTE(review): this rebinds the notebook-level names primary_mode and mini_energy_dict.
primary_mode = ["walking","car"]
mini_energy_dict = {x:energy_dict[MODE_MAPPING_DICT[x]] for x in primary_mode}
primary_mode = max(mini_energy_dict, key=mini_energy_dict.get)

In [None]:
# Total walking distance over the ['car', 'walking'] trips collected in car_walking_cases.
walk_sum = 0
for trip in car_walking_cases:
    print(trip)
    # Index 1 is the second section, i.e. the walking one for a ['car', 'walking'] trip.
    walk_sum += trip[1]
# This is the increase in EC if we count walk sections for primary car trips as car.
# NOTE(review): 1.13 and 0.01 look like the car and walking energy intensities and 1.04 a distance
# correction factor -- confirm; none of them is defined in this cell.
walk_sum*1.13*1.04*METERS_TO_MILES - walk_sum*0.01*1.04*METERS_TO_MILES

In [None]:
# In this cell: 
# The primary section is the only section used.
# I ignore shared rides.
# Car load factor to create the EI moments dataframe: 1
# Car precision is increased stepwise by 0.1, with bus taking the rest of the labels. (see store_errors.ipynb).
# In this analysis, the adjusted car precision is the same for both ios and android.
# NOTE(review): android_EI_moments_no_shared_ride_df / ios_EI_moments_no_shared_ride_df are read here, but the
# only visible assignments are in later cells (pd.read_csv of "..._EI_moments_no_shared_ride.csv"); run those first.
# NOTE(review): get_primary_mode_aggregate_EC is defined outside this part of the notebook; element [0] of its
# result is used below as the aggregate energy consumption -- confirm.

# drop shared rides.
elt = expanded_labeled_trips.copy()
no_shared_rides = elt.drop(elt[elt.mode_confirm == "shared_ride"].index)

# Prediction based value; it does not depend on the precision adjustment, so it is computed once.
prediction_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, # these do not get used when we only use the prediction.
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=True,
        only_primary_section=True)

# One pass per precision-adjusted EI moments file (suffixes 0..3).
for j in range(0,4):
    print(f"car precision: {0.45+ 0.1*(j+1)}")
    precision_adj_android_EI_moments_df = pd.read_csv("android_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")
    precision_adj_ios_EI_moments_df = pd.read_csv("ios_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")

    # Looking only at the primary section EC consumption and applying the user or sensed label to it.
    confusion_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
            precision_adj_android_EI_moments_df, 
            precision_adj_ios_EI_moments_df, 
            gis_sensed_modes,energy_dict,
            using_predictions=False,
            only_primary_section=True)

    print(f"prediction,confusion based: {prediction_EC_primary_section[0],confusion_EC_primary_section[0]}")

# Reference value from the user labels (second positional argument False instead of True).
user_no_shared_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,False, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, # these do not get used if looking at user labels.
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=False,
        only_primary_section=True)
print(f"User EC: {user_no_shared_EC_primary_section[0]}")

In [None]:
# In this cell: 
# The primary mode is applied to the entire trip.
# I ignore shared rides.
# Car load factor to create the EI moments dataframe: 1
# Car precision is increased stepwise by 0.1, with bus taking the rest of the labels. (see store_errors.ipynb).
# In this analysis, the adjusted car precision is the same for both ios and android.
# NOTE(review): android_EI_moments_no_shared_ride_df / ios_EI_moments_no_shared_ride_df are read here, but the
# only visible assignments are in later cells (pd.read_csv of "..._EI_moments_no_shared_ride.csv"); run those first.
# NOTE(review): the result names still end in "_primary_section" although only_primary_section=False here,
# and they overwrite the values produced by the primary-section-only cell.

# Ensure that a predicted car trip is treated as drove alone
drove_alone_EI = energy_dict["Gas Car, drove alone"]
load_factor = 1
# Mutates the shared energy_dict: with a load factor of 1 the sensed-car intensity equals the drove alone one.
energy_dict.update({"Gas Car, sensed": drove_alone_EI/load_factor})

# drop shared rides.
elt = expanded_labeled_trips.copy()
no_shared_rides = elt.drop(elt[elt.mode_confirm == "shared_ride"].index)

# Prediction based value; it does not depend on the precision adjustment, so it is computed once.
prediction_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, # these do not get used when we only use the prediction.
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=True,
        only_primary_section=False)

# One pass per precision-adjusted EI moments file (suffixes 0..3).
for j in range(0,4):
    print(f"car precision: {0.45+ 0.1*(j+1)}")
    precision_adj_android_EI_moments_df = pd.read_csv("android_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")
    precision_adj_ios_EI_moments_df = pd.read_csv("ios_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")

    # Confusion based value with only_primary_section=False (per the cell header: the primary mode is applied to the entire trip).
    confusion_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
            precision_adj_android_EI_moments_df, 
            precision_adj_ios_EI_moments_df, 
            gis_sensed_modes,energy_dict,
            using_predictions=False,
            only_primary_section=False)

    print(f"prediction,confusion based: {prediction_EC_primary_section[0],confusion_EC_primary_section[0]}")

# Reference value from the user labels (second positional argument False instead of True).
user_no_shared_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,False, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, # these do not get used if looking at user labels.
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=False,
        only_primary_section=False)
print(f"User EC: {user_no_shared_EC_primary_section[0]}")

In [None]:
# In this cell: 
# All sections are used.
# I ignore shared rides.
# Car load factor to create the EI moments dataframe: 1
# Car precision is increased stepwise by 0.1, with bus taking the rest of the labels. (see store_errors.ipynb).
# In this analysis, the adjusted car precision is the same for both ios and android.
# NOTE(review): get_aggregate_EC and get_aggregate_EC_with_extras are defined outside this part of the notebook.

# Ensure that a predicted car trip is treated as drove alone
drove_alone_EI = energy_dict["Gas Car, drove alone"]
load_factor = 1
# Mutates the shared energy_dict: with a load factor of 1 the sensed-car intensity equals the drove alone one.
energy_dict.update({"Gas Car, sensed": drove_alone_EI/load_factor})

# drop shared rides.
elt = expanded_labeled_trips.copy()
no_shared_rides = elt.drop(elt[elt.mode_confirm == "shared_ride"].index)

# Prediction based value; computed once because it does not depend on the precision adjustment.
mean_EC_naive_no_shared, var_EC_naive_no_shared, avg_EI_no_shared= get_aggregate_EC_with_extras(no_shared_rides,True, unit_dist_MCS_df, 
        android_EI_moments_df, # not used in prediction based calculation.
        ios_EI_moments_df, 
        gis_sensed_modes,energy_dict,
        use_naive_sensing_prediction=True,
        car_load_factor = 1)

# One pass per precision-adjusted EI moments file (suffixes 0..3).
for j in range(0,4):
    print(f"car precision: {0.45+ 0.1*(j+1)}")
    precision_adj_android_EI_moments_df = pd.read_csv("android_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")
    precision_adj_ios_EI_moments_df = pd.read_csv("ios_EI_moments_no_shared_ride_car_precision_adjustment_"+ str(j) + ".csv").set_index("mode")


    mean_EC_no_shared_expected, var_EC_no_shared_expected = get_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df,
            precision_adj_android_EI_moments_df, 
            precision_adj_ios_EI_moments_df, 
            gis_sensed_modes,energy_dict)

    print(f"prediction,confusion based: {mean_EC_naive_no_shared,mean_EC_no_shared_expected}")


# Reference value from the user labels. The EI moments passed here are whatever the LAST loop iteration loaded;
# NOTE(review): presumably they are ignored when the second argument is False -- confirm in get_aggregate_EC.
mean_EC_no_shared_users,_ = get_aggregate_EC(no_shared_rides,False, unit_dist_MCS_df,
        precision_adj_android_EI_moments_df, 
        precision_adj_ios_EI_moments_df, 
        gis_sensed_modes,energy_dict)
print(f"User EC: {mean_EC_no_shared_users}")

In [None]:
# A look at the values for primary sections when we have no shared rides and the confusion matrices in Gabe's paper.
# ie load factor 1 for sensed car.
android_EI_moments_no_shared_ride_df = pd.read_csv("android_EI_moments_no_shared_ride.csv").set_index("mode")
ios_EI_moments_no_shared_ride_df = pd.read_csv("ios_EI_moments_no_shared_ride.csv").set_index("mode")

# drop shared rides.
elt = expanded_labeled_trips.copy()
no_shared_rides = elt.drop(elt[elt.mode_confirm == "shared_ride"].index)

# The three calls differ only in the second positional argument and in using_predictions:
# sensing + predictions, sensing + confusion based expectation, and user labels.
prediction_no_shared_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, 
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=True,
        only_primary_section=True)

confusion_no_shared_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,True, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, 
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=False,
        only_primary_section=True)

user_no_shared_EC_primary_section = get_primary_mode_aggregate_EC(no_shared_rides,False, unit_dist_MCS_df, 
        android_EI_moments_no_shared_ride_df, 
        ios_EI_moments_no_shared_ride_df, 
        gis_sensed_modes,energy_dict,
        using_predictions=False,
        only_primary_section=True)

print("prediction, confusion EC for no shared rides and only primary sections")
# Displayed in the order: prediction based, confusion based, user labeled.
prediction_no_shared_EC_primary_section[0],confusion_no_shared_EC_primary_section[0], user_no_shared_EC_primary_section[0]

In [None]:
# Reload the no-shared-ride EI moments tables (indexed by mode) for each OS and display them as a pair.
android_EI_moments_no_shared_ride_df = pd.read_csv("android_EI_moments_no_shared_ride.csv").set_index("mode")
ios_EI_moments_no_shared_ride_df = pd.read_csv("ios_EI_moments_no_shared_ride.csv").set_index("mode")
android_EI_moments_no_shared_ride_df,ios_EI_moments_no_shared_ride_df

In [None]:
# Side by side: prediction based, user labeled and confusion based primary-section values.
# NOTE(review): user_EC is not assigned in the visible part of the notebook -- confirm where it comes from.
prediction_EC_primary_section[0],user_EC[0],confusion_EC_primary_section[0]

In [None]:
# Successive differences: user - prediction, then confusion - user.
# NOTE(review): user_EC is not assigned in the visible part of the notebook -- confirm where it comes from.
np.diff([prediction_EC_primary_section[0],user_EC[0],confusion_EC_primary_section[0]])

In [None]:
# NOTE(review): the three assignments below are commented out, so within this cell the names displayed on the
# last line are never assigned. They must already exist in the kernel (e.g. from an earlier cell or a previous
# run of these lines) - confirm this cell survives Restart & Run All.
#prediction_EC_primary_trip = get_primary_mode_aggregate_EC(expanded_labeled_trips,True, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict,using_predictions=True,only_primary_section=False)
#confusion_EC_primary_trip = get_primary_mode_aggregate_EC(expanded_labeled_trips,True, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict,False,False)
#user_EC_trip = get_primary_mode_aggregate_EC(expanded_labeled_trips,False, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict,False,False)
# Show element 0 of each result: prediction, user labels, confusion.
prediction_EC_primary_trip[0],user_EC_trip[0],confusion_EC_primary_trip[0]

In [None]:
# Consecutive differences of the values above: [user - prediction, confusion - user].
np.diff([prediction_EC_primary_trip[0],user_EC_trip[0],confusion_EC_primary_trip[0]])

In [None]:
# Aggregate EC over all expanded labeled trips, computed once from sensing and once from user labels.
# Author notes carried over from the original cell:
#   - drop air trips
#   - constructing the propagation function
#   - probably should get segments ahead of time and store in the database.

use_sensing_only = True

# Signature reminder:
# get_aggregate_EC(trips_df, only_sensing, unit_dist_MCS, android_EI_moments, ios_EI_moments,gis_sensed_modes, energy_dict)
aggregate_EC_common_args = (
    unit_dist_MCS_df,
    android_EI_moments_df,
    ios_EI_moments_df,
    gis_sensed_modes,
    energy_dict,
)

mean_EC_all_sensing_vail, var_EC_all_sensing_vail = get_aggregate_EC(
    expanded_labeled_trips, use_sensing_only, *aggregate_EC_common_args)

mean_EC_all_user_labeled_vail, var_EC_all_user_labeled_vail = get_aggregate_EC(
    expanded_labeled_trips, False, *aggregate_EC_common_args)

In [None]:
# with no shared rides.
# Same aggregate EC comparison, restricted to trips that are not labeled as shared rides.
elt = expanded_labeled_trips.copy()
shared_ride_index = elt[elt.mode_confirm == "shared_ride"].index
no_shared_rides = elt.drop(shared_ride_index)

# Use the EI moment tables saved for the no-shared-ride case.
android_EI_moments_no_shared_ride = pd.read_csv("android_EI_moments_no_shared_ride.csv").set_index("mode")
ios_EI_moments_no_shared_ride = pd.read_csv("ios_EI_moments_no_shared_ride.csv").set_index("mode")

# Positional arguments shared by the three calls below.
no_shared_aggregate_args = (
    unit_dist_MCS_df,
    android_EI_moments_no_shared_ride,
    ios_EI_moments_no_shared_ride,
    gis_sensed_modes,
    energy_dict,
)

# Sensing with the confusion-based EI moments.
mean_EC_no_shared, var_EC_no_shared = get_aggregate_EC(no_shared_rides, True, *no_shared_aggregate_args)
# User labels.
mean_EC_no_shared_users, var_EC_no_shared_users = get_aggregate_EC(no_shared_rides, False, *no_shared_aggregate_args)
# Sensing, taking the predicted mode at face value.
mean_EC_naive_no_shared, var_EC_naive_no_shared, avg_EI_no_shared = get_aggregate_EC_with_extras(
    no_shared_rides, True, *no_shared_aggregate_args,
    use_naive_sensing_prediction=True)

print(f"EC from: sensing with confusion, sensing prediction, user labels:{mean_EC_no_shared,mean_EC_naive_no_shared,mean_EC_no_shared_users}")

In [None]:
# Show the aggregate EC mean obtained with use_naive_sensing_prediction=True on the no-shared-ride trips.
mean_EC_naive_no_shared

In [None]:
# Recompute the naive-prediction aggregate with car_load_factor = 1 instead of the function's default of 1.5.
# NOTE(review): get_aggregate_EC_with_extras is defined in a later cell of this notebook, so on a fresh kernel
# that cell has to be run before this one.
mean_EC_naive_no_shared, var_EC_naive_no_shared, avg_EI_no_shared= get_aggregate_EC_with_extras(no_shared_rides,True, unit_dist_MCS_df, android_EI_moments_no_shared_ride, ios_EI_moments_no_shared_ride, gis_sensed_modes,energy_dict,use_naive_sensing_prediction=True,car_load_factor = 1)

In [None]:
def get_aggregate_EC_with_extras(trips_df, only_sensing, unit_dist_MCS, android_EI_moments, ios_EI_moments,gis_sensed_modes, energy_dict, use_naive_sensing_prediction=False, car_load_factor=1.5):
    """Aggregate energy consumption (EC) mean and variance over trips, plus the average energy intensity (EI) used.

    Requires the trips dataframe to have expanded labeled trips.

    Parameters
    ----------
    trips_df : pandas.DataFrame
        Iterated row by row. Every row must provide ``user_id``. The sensing path reads
        ``section_modes`` and ``section_distances``; the user-label path reads ``mode_confirm``
        and ``distance``. Distances are multiplied by METERS_TO_MILES.
    only_sensing : bool
        Truthy: use the per-section sensed modes. Falsy: use the trip-level user label.
    unit_dist_MCS : mapping or DataFrame
        Indexed as ``unit_dist_MCS[platform]["mean"]`` and ``unit_dist_MCS[platform]["var"]``, where
        ``platform`` is the ``"curr_platform"`` entry of the user's profile.
    android_EI_moments, ios_EI_moments
        Forwarded unchanged to ``get_EI_moments_for_trip`` (only when not using the naive prediction).
    gis_sensed_modes
        Not used by this function; kept so existing calls keep working.
    energy_dict : dict
        Mode name -> energy intensity. Must contain ``"Gas Car, drove alone"``.
        Side effect: ``energy_dict["Gas Car, sensed"]`` is overwritten in place with
        ``energy_dict["Gas Car, drove alone"] / car_load_factor``.
    use_naive_sensing_prediction : bool
        When truthy, each sensed section takes its EI straight from ``energy_dict`` with zero EI variance;
        otherwise the EI mean and variance come from ``get_EI_moments_for_trip``.
    car_load_factor : float
        Divisor applied to the drove-alone EI to obtain the EI of a sensed car section.
        This only updates the predicted mode EI, not the confusion EI. See store_errors.ipynb to save
        EI_moments dataframes with different load factors.

    Returns
    -------
    tuple
        ``(mean_EC_agg, var_EC_agg, avg_EI)``. ``avg_EI`` is the unweighted mean EI over the counted
        sections (sensing) or counted trips (user labels); it is NaN when nothing was counted.
    """
    # Kept as an in-place update of the caller's dict, exactly as before.
    drove_alone_EI = energy_dict["Gas Car, drove alone"]
    energy_dict.update({"Gas Car, sensed": drove_alone_EI/car_load_factor})

    mean_EC_agg = 0    # aggregate energy consumption
    var_EC_agg = 0

    sum_sensed_mean_EI = 0
    N_sections = 0

    sum_labeled_mean_EI = 0
    n_trips = 0

    # One profile lookup per user instead of one per trip row.
    platform_by_user = {}

    for _, ct in trips_df.iterrows():
        # Get operating system.
        user_id = ct.user_id
        if user_id not in platform_by_user:
            platform_by_user[user_id] = ecwu.User(user_id).getProfile()["curr_platform"]
        platform = platform_by_user[user_id]

        # Get OS specific trip length info from the argument (previously read from a notebook global).
        mean_for_unit_L = unit_dist_MCS[platform]["mean"]
        var_for_unit_L = unit_dist_MCS[platform]["var"]

        # Later the condition will be whether the model chosen is sensing.
        if only_sensing:
            section_modes = ct["section_modes"]
            sections_lengths = np.array(ct["section_distances"])*METERS_TO_MILES   # 1 meter = 0.000621371 miles

            mean_L = sections_lengths*mean_for_unit_L
            var_L = sections_lengths**2 * var_for_unit_L

            for s, section_mode in enumerate(section_modes):
                if section_mode == "air_or_hsr": continue

                if use_naive_sensing_prediction:
                    # Sensed car sections use the load-factor-adjusted EI set above.
                    if section_mode == 'car':
                        mean_EI = energy_dict['Gas Car, sensed']
                    else:
                        mean_EI = energy_dict[MODE_MAPPING_DICT[section_mode]]
                    var_EI = 0
                else:
                    # Later: switch to a map style function.
                    mean_EI, var_EI = get_EI_moments_for_trip(section_mode,platform,android_EI_moments,ios_EI_moments)

                sum_sensed_mean_EI += mean_EI
                N_sections += 1

                # Propagate variance for the section.
                mean_EC = mean_L[s]*mean_EI
                var_EC = var_EI*mean_L[s]**2 + var_L[s]*mean_EI**2

                # Add to total - follows from assumed independence of section errors.
                mean_EC_agg += mean_EC
                var_EC_agg += var_EC

        # use user labels.
        else:
            mode = ct["mode_confirm"]

            # `mode == np.nan` is always False, so missing labels are detected with pd.isna.
            if pd.isna(mode) or mode not in MODE_MAPPING_DICT: continue
            if MODE_MAPPING_DICT[mode] == "Air": continue
            EI = energy_dict[MODE_MAPPING_DICT[mode]]

            sum_labeled_mean_EI += EI
            n_trips += 1

            length = ct["distance"]*METERS_TO_MILES
            mean_L = length* mean_for_unit_L
            var_L = length**2 * var_for_unit_L

            mean_EC_agg += EI*mean_L
            # EI is treated as exact here, so Var(EI*L) = EI**2 * Var(L); this matches the
            # sensing branch formula with var_EI = 0.
            var_EC_agg += EI**2 * var_L

    # Computed once after the loop so that empty input, or a first trip with nothing to count,
    # no longer raises.
    if only_sensing:
        sum_EI, n_counted = sum_sensed_mean_EI, N_sections
    else:
        sum_EI, n_counted = sum_labeled_mean_EI, n_trips
    avg_EI = sum_EI/n_counted if n_counted > 0 else np.nan

    return mean_EC_agg, var_EC_agg, avg_EI

#mean_EC_naive_sensing_vail, var_EC_naive_sensing_vail, avg_EI_sensed_naive = get_aggregate_EC_with_extras(expanded_labeled_trips,use_sensing_only, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict,use_naive_sensing_prediction=True)

In [None]:
########## OLD SENSITIVITY ANALYSIS
##########
# Split the data.
# 10^3 NMC takes 10 seconds on vail to create all 4 splits.
proportion_sensed = [0.2,0.4,0.6,0.8]
NMC = 10#**2#**3

# For each sensed proportion: one row per random split holding the aggregate EC mean and sd.
summary_df_map = {}
for ps in proportion_sensed:

    mean_EC_agg = []
    var_EC_agg = []
    for j in range(NMC):
        rand_state = np.random.RandomState(1+j)

        # Split the labeled trips into a user labeled dataframe and a sensed dataframe.
        # Only test_size is given so that the user-labeled part is exactly the complement of the sensed part.
        # Passing train_size = 1-ps as well lets float error (e.g. 1-0.8 = 0.19999999999999996) leave a
        # trip out of both parts, because one size is rounded down and the other rounded up.
        user_labeled,sensed  = skm.train_test_split(expanded_labeled_trips,
                                                    test_size = ps, # sensed
                                                    random_state= rand_state)
        mean_EC_sensed, var_EC_sensed = \
            get_aggregate_EC(sensed,True, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict)

        mean_EC_user_labeled, var_EC_user_labeled = \
            get_aggregate_EC(user_labeled,False, unit_dist_MCS_df, android_EI_moments_df, ios_EI_moments_df, gis_sensed_modes,energy_dict)

        # The two parts are disjoint, so their means and variances are added.
        mean_EC_agg.append(mean_EC_sensed + mean_EC_user_labeled)
        var_EC_agg.append(var_EC_sensed + var_EC_user_labeled)

    # Built once per proportion, after all splits are done (previously rebuilt on every split).
    sd_EC_agg = np.sqrt(np.array(var_EC_agg))
    summary_df_map[ps] = pd.DataFrame({"mean": mean_EC_agg, "sd": sd_EC_agg})

# Average the per-split mean and sd for each proportion.
average_summaries = {
    ps: {
        "mean": np.mean(summary_df_map[ps]["mean"]),
        "sd": np.mean(summary_df_map[ps]["sd"]),
    }
    for ps in proportion_sensed
}

# Standard deviations for the two extremes: all trips sensed and all trips user labeled.
sd_sensed = np.sqrt(var_EC_all_sensing_vail)
sd_users = np.sqrt(var_EC_all_user_labeled_vail)

def get_interval(mean,sd):
    """Return the three points [mean - sd, mean, mean + sd] as a list."""
    lower = mean - sd
    upper = mean + sd
    return [lower, mean, upper]

# Mean +- 1 sd intervals for the two extremes: every trip sensed vs. every trip user labeled.
interval_sensed_vail = get_interval(mean_EC_all_sensing_vail,sd_sensed)
interval_users_vail = get_interval(mean_EC_all_user_labeled_vail,sd_users)

import matplotlib.pyplot as plt
fig,ax = plt.subplots()
fig.set_figheight(6)

# (proportion sensed, [mean - sd, mean, mean + sd]) pairs, ordered from 0 (all user labels) to 1 (all sensing).
intervals_by_proportion = [(0, interval_users_vail)]
intervals_by_proportion += [
    (ps, get_interval(average_summaries[ps]["mean"], average_summaries[ps]["sd"]))
    for ps in proportion_sensed
]
intervals_by_proportion.append((1, interval_sensed_vail))

# One column of three points per proportion; the unused counter from the original loop is gone.
for proportion, interval in intervals_by_proportion:
    ax.plot([proportion]*3, interval, 'bo')

ax.set_ylim([4000,12000])
ax.set_xlabel("Proportion of trips using sensing as opposed to user labels")

fig.suptitle("Energy consumption mean +- 1 sd as percent of sensed trips increases")

In [None]:
# horizontal version of the plots
import matplotlib.pyplot as plt

# One interval per panel, top to bottom: all user labels, each sensed proportion, all sensing.
horizontal_intervals = (
    [interval_users_vail]
    + [get_interval(average_summaries[ps]["mean"], average_summaries[ps]["sd"]) for ps in proportion_sensed]
    + [interval_sensed_vail]
)

fig,axs = plt.subplots(len(horizontal_intervals),sharex=True)
fig.set_figheight(10)

# Every panel puts the EC interval on the shared x axis at height 0. Previously the middle panels put
# the proportion on x and the interval on y, and the last panel reused a `y` the loop had overwritten.
baseline = [0,0,0]
for ax, interval in zip(axs, horizontal_intervals):
    ax.plot(interval, baseline, 'bo')

fig.suptitle("0 -> 1 proportion sensed in increments of 0.2 (mean +- 1 sd)")



In [None]:
# Per-split sd values for whatever var_EC_agg currently holds (kept from the original cell).
sd_EC_agg = np.sqrt(np.array(var_EC_agg))

# One row per sensed proportion with the split-averaged mean and sd.
# The earlier set_index(proportion_sensed) call received a plain list, which pandas treats as column
# labels, so it raised KeyError; the per-split lists also have NMC rows rather than one per proportion.
summary_df = pd.DataFrame.from_dict(average_summaries, orient="index").rename_axis("proportion_sensed")