## Save confusion matrices and distance errors, and construct MODE_MAPPING_DICT
The confusion matrix values were manually copied from Gabe's paper.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import statistics

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT

import sklearn.model_selection as skm

# Energy intensity table. It is passed to cm_handling.get_energy_dict() and its
# 'mode' column supplies labels for the mode mapping built later in this notebook.
df_EI = pd.read_csv(
    filepath_or_buffer=r'Public_Dashboard/auxiliary_files/energy_intensity.csv'
)

In [2]:
# Reference map from the integer codes of the GIS sensing algorithm to mode names.
# Nothing in this notebook reads it; it is kept to show which modes the sensing algorithm can label.
gis_sensed_modes = {
    0: 'no_sensed',   # UNKNOWN  #NOTE: this is important info to mention.
    1: 'walking',     # WALKING
    2: 'bicycling',   # BICYCLING
    3: 'bus',         # BUS
    4: 'train',       # TRAIN
    5: 'car',         # CAR
    6: 'air_or_hsr',  # AIR_OR_HSR
    7: 'subway',      # SUBWAY
    8: 'train',       # TRAM
    9: 'train',       # LIGHT_RAIL
}

# Energy intensities keyed by mode, built from the energy intensity table (kWH/PMT).
energy_dict = cm_handling.get_energy_dict(df_EI)

# Add an intensity for cars that were only *sensed* (no user label).
# The ground truth label "sensed_car" is mapped to "Car, sensed" (see the dic_mode cell
# further down). Per the original author's note, that entry is used as the ground truth
# car intensity in get_conditional_EI_expectation_and_variance(), so the sensed mode
# still shows "car" while the intensity reflects an average occupancy above one.
# r is presumably the ratio of drove-alone to shared car trips -- TODO confirm.
# NOTE(review): with r = 1 the factor below evaluates to (1 + 1) / (1 + 1/2) = 4/3,
# whereas the original note spoke of a 1.5 person load factor -- confirm which is intended.
drove_alone_EI = energy_dict["Gas Car, drove alone"]
r = 1
load_factor = (r + 1) / (r + 1/2)
energy_dict["Car, sensed"] = drove_alone_EI / load_factor

### Saving distance error info

In [8]:
## Sampling distance errors to get a distribution of possible distances.

# Fixed seed and sample count so that the moments written to unit_distance_MCS.csv
# are reproducible after a kernel restart (the draws used to be unseeded).
MC_SEED = 0
N_MC_SAMPLES = 10**5
mc_rng = np.random.default_rng(MC_SEED)

# Relative distance error distributions: column 'x' holds the error values and
# column 'p(x)' the probability of each value.
android_rel_dist_errors = pd.read_csv("android_rel_distance_errors.csv")
ios_rel_dist_errors = pd.read_csv("ios_rel_distance_errors.csv")

# Monte Carlo (MC) simulate relative distance errors (R) for each platform.
# Android is drawn first, then iOS, from the same seeded generator.
android_R_MC = mc_rng.choice(
    android_rel_dist_errors['x'].to_numpy(),
    size=N_MC_SAMPLES,
    p=android_rel_dist_errors['p(x)'].to_numpy(),
)
ios_R_MC = mc_rng.choice(
    ios_rel_dist_errors['x'].to_numpy(),
    size=N_MC_SAMPLES,
    p=ios_rel_dist_errors['p(x)'].to_numpy(),
)

# Compute a Monte Carlo simulated (MCS) distance estimate for a unit length trip, given the relative error.
# Inputs: inferred trip length of 1, randomly assigned relative error.
# Output: an estimate of the "true" distance based on the relative error.
# Since relative error is typically negative, the "true" distance will usually be bigger than the inferred distance.
android_MCS_dist = 1/(1 + android_R_MC)
ios_MCS_dist = 1/(1 + ios_R_MC)

# Compute the sample means and (n-1 normalised) sample variances.
android_unit_dist_MCS_summary = {"mean": statistics.mean(android_MCS_dist), "var": statistics.variance(android_MCS_dist)}
ios_unit_dist_MCS_summary = {"mean": statistics.mean(ios_MCS_dist), "var": statistics.variance(ios_MCS_dist)}

# One column per platform, one row per moment ("mean", "var").
unit_dist_MCS_df = pd.DataFrame({"android": android_unit_dist_MCS_summary, "ios": ios_unit_dist_MCS_summary})
# NOTE(review): absolute, machine-specific output path kept unchanged so that existing
# readers of this file are unaffected; consider making the output directory configurable.
unit_dist_MCS_df.to_csv("/Users/mallen2/OpenPATH_Data/e-mission-eval-private-data/Error_bars/unit_distance_MCS.csv", index_label= "moment")

### Saving confusion matrices

In [3]:
# Confusion matrices based on Gabe's paper.
# Rows are ground truth modes (sensing_gt), columns are the modes predicted by the
# sensing algorithm (sensing_predictions); both matrices share the same layout.
sensing_predictions = [
    "walking", "bicycling", "car", "bus", "subway", "train",
    "no_start", "no_middle", "no_end",
]
sensing_gt = [
    "walking", "bicycling", "escooter", "ebike", "sensed_car", "bus",
    "subway", "light_rail", "train",
    "no_gt_start", "no_gt_middle", "no_gt_end",
]

android_GIS_HAMFDC = np.array([
    [24743, 1595,  564,   682,   235,   359, 2835,  3365, 436],  # walking
    [  230, 10288,   0,     0,     0,     0,  121,   197,   0],  # bicycling
    [    0, 4957,    0,     0,     0,     0,  107,     0,   0],  # escooter
    [    0, 4957,    0,     0,     0,     0,  107,     0,   0],  # ebike: mock values, copied from escooter (duration of true ebike given each mode prediction)
    [    0,    0, 7582,     0,     0,     0,  877,    24,   0],  # sensed_car
    [    0,    0, 8414, 15607,     0,     0,    0,   589,   0],  # bus
    [    0,    0,    0,     0, 19651,     0,    0,    24,   0],  # subway
    [  795,  825,    0,  2920,     0,     0,    0,   308,   0],  # light_rail
    [   13,    0,    0,     0,     0, 35726,    0,   415,   0],  # train
    [  941,    0,  115,     0,     0,     0,    0,     0,   0],  # no_gt_start
    [  823,  223,   18,  1016,   563,   815,    0, 14735,   0],  # no_gt_middle
    [ 1024,   14,    3,     0,     0,     0,    0,     0,   0],  # no_gt_end
])

ios_GIS_HAHFDC = np.array([
    [26606,   992,   768,   593,   243,   551, 6033,  145, 1329],  # walking
    [  425, 11938,     0,     0,     0,     0, 1958,   28,    0],  # bicycling
    [ 1314,  4529,   516,     0,     0,     0, 1119,    2,    0],  # escooter
    [ 1314,  4529,   516,     0,     0,     0, 1119,    2,    0],  # ebike: mock values (same as escooter)
    [  246,     0, 10922,     0,     0,     0, 2799,    1,    0],  # sensed_car
    [ 1041,   190, 11177, 16877,     0,     0,    0,   98,    0],  # bus
    [    0,     0,     0,     0, 19573,     0,    0,    7,    0],  # subway
    [ 1829,     0,     0,  3111,     0,     0,    0,   12,    0],  # light_rail
    [    5,     5,  5836,     0,     0, 31087,    0,   93,    0],  # train
    [  124,    18,     0,     0,     0,     0,    0,    0,    0],  # no_gt_start
    [10430,  1019,  1032,  1274,  1682,  1475,    0, 2457,    0],  # no_gt_middle
    [ 2973,     0,   264,     0,     0,     0,    0,    0,    0],  # no_gt_end
])

# Labelled versions of the raw arrays.
android_confusion_GIS_HAMFDC = pd.DataFrame(
    data=android_GIS_HAMFDC, index=sensing_gt, columns=sensing_predictions
)
ios_confusion_GIS_HAHFDC = pd.DataFrame(
    data=ios_GIS_HAHFDC, index=sensing_gt, columns=sensing_predictions
)

In [5]:
# Save copies of the confusion matrices whose ground truth (row) labels have been
# renamed through MODE_MAPPING_DICT, so they are consistent with the labels seen in
# energy_intensity.csv.

# Android: collapse, then relabel the index.
collapsed_android_cm = (
    cm_handling.collapse_confusion_matrix(android_confusion_GIS_HAMFDC)
    .rename(index=MODE_MAPPING_DICT)
)

# IOS: same two steps.
collapsed_ios_cm = (
    cm_handling.collapse_confusion_matrix(ios_confusion_GIS_HAHFDC)
    .rename(index=MODE_MAPPING_DICT)
)

# Store them; the index column is written under the header "gt_mode".
collapsed_android_cm.to_csv(
    "/Users/mallen2/OpenPATH_Data/e-mission-eval-private-data/Error_bars/android_confusion.csv",
    index_label="gt_mode",
)
collapsed_ios_cm.to_csv(
    "/Users/mallen2/OpenPATH_Data/e-mission-eval-private-data/Error_bars/ios_confusion.csv",
    index_label="gt_mode",
)

### The code below was used to generate MODE_MAPPING_DICT.
I copied the printed version of dic_mode to the end of confusion_matrix_handling.py and named it MODE_MAPPING_DICT.
Note that modes similar to "car" or "carpool" labels get mapped to "Gas Car, drove alone" or "Gas Car, with others".

In [None]:
# dic_re is generated in mapping_dictionaries.ipynb
%store -r dic_re
# Note: dic_mode is most relevant for energy and carbon intensities. We may want to use a different mapping for mode share.
dic_mode = dic_re.copy()

# Add old mode labels.
old_mode_labels = {"Bike": "Regular Bike", "pilot_ebike": "Pilot ebike","e-bike": "Pilot ebike",
                    "Drove Alone": "Gas Car, drove alone", 
                    "Shared Ride": "Gas Car, with others", 
                    "taxi": "Taxi/Uber/Lyft",
                    "Air": "Air"}

dic_mode.update(old_mode_labels)

# Add the mode labels from the energy intensity csv just in case.
dic_mode.update({m:m for m in df_EI['mode']}) 

# Add the ground truth labels from mobilitynet.
mobilitynet_gis_gt = {
    "walking": "Walk",
    "bicycling": "Regular Bike",
    "escooter": "Scooter share",
    "ebike": "Pilot ebike",
    "car": "Gas Car, drove alone",  # car BTU: 5170,     #1 BTU = 0.000293071 kWH
    "bus": "Bus",  # bus BTU: 4560
    "subway": "Train",     # using train value
    "light_rail": "Train",  # using train value
    "train": "Train",
    "no_gt": "no_gt"
}

# There is a difference between the ground truth possible values and the possible inferred values.


# Add other label assist labels
la_mode = {'air': "Air",  
    'car': "Gas Car, drove alone",
    'electric_vehicle': "E-car, drove alone",
    'skiing': "Walk",
    'snowboarding': "Walk",
    'subway': "Train" } 

dic_mode.update(la_mode)
dic_mode.update(mobilitynet_gis_gt)
dic_mode.update({"air_or_hsr": "Train", "no_sensed":"Not a Trip", "sensed_car": "Car, sensed"})

#dic_mode = defaultdict(lambda: 'Other',dic_mode) # not in use at the moment since we have no  "other" EI

In [None]:
# Checking to make sure I didn't make a change directly to MODE_MAPPING_DICT in confusion_matrix_handling.py without changing it in the cell above.
# (confusion_matrix_handling.py has a fully written out version of dic_mode)
NEW_MODE_MAPPING_DICT = dic_mode

# Keys that exist in only one of the two dictionaries. These are reported explicitly:
# indexing the other dictionary with them would raise a KeyError and hide the drift
# this cell is meant to surface.
keys_only_in_old = [key for key in MODE_MAPPING_DICT if key not in NEW_MODE_MAPPING_DICT]
keys_only_in_new = [key for key in NEW_MODE_MAPPING_DICT if key not in MODE_MAPPING_DICT]

for key in keys_only_in_old:
    print(f"missing key: {key} is in MODE_MAPPING_DICT (value {MODE_MAPPING_DICT[key]}) but not in NEW_MODE_MAPPING_DICT")

for key in keys_only_in_new:
    print(f"missing key: {key} is in NEW_MODE_MAPPING_DICT (value {NEW_MODE_MAPPING_DICT[key]}) but not in MODE_MAPPING_DICT")

# Compare values once, over the keys both dictionaries share.
for key in MODE_MAPPING_DICT:
    if key in NEW_MODE_MAPPING_DICT and MODE_MAPPING_DICT[key] != NEW_MODE_MAPPING_DICT[key]:
        print(f"key value difference: the value of {key} in MODE_MAPPING_DICT is {MODE_MAPPING_DICT[key]}, but in NEW_MODE_MAPPING_DICT it is {NEW_MODE_MAPPING_DICT[key]}")

# Displayed as the cell output: the two sizes should match when the dictionaries are in sync.
len(NEW_MODE_MAPPING_DICT.keys()), len(MODE_MAPPING_DICT.keys())