# Try spatial covariance and compare it with other variance calculation methods
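This notebook looks at how many standard deviations the sensed (expected) energy consumption estimate is from the user-labeled value.
The aggregate variance is calculated by summing trip-level variances and adding a spatial covariance term based on trip clusters; the result is then compared with other variance calculation methods.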

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import sys
sys.path.append('/Users/mallen2/alternate_branches/gis-with-build-label-model/e-mission-server')

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.core.wrapper.user as ecwu

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import get_EC
import helper_functions as hf

import sklearn.model_selection as skm

from sklearn.model_selection import KFold
from sklearn import linear_model

# import folium

# For fetching the trip clusters
import emission.analysis.modelling.trip_model.model_storage as eamtm
import emission.analysis.modelling.trip_model.model_type as eamum

# Conversion factor used to turn the trips' `distance` column into `distance_miles`.
METERS_TO_MILES = 0.000621371 # 1 meter = 0.000621371 miles
# Both constants below are passed to hf.find_sensed_car_energy_intensity later in the notebook.
# NOTE(review): presumably the assumed share of electric cars and the drove-alone : shared-ride trip ratio — confirm in helper_functions.
ECAR_PROPORTION = 0 #0.01 #~1% of cars on the road are electric.
DROVE_ALONE_TO_SHARED_RIDE_RATIO = 1

# Energy intensity table; it is turned into energy_dict with cm_handling.get_energy_dict further down.
df_EI = pd.read_csv(r'Public_Dashboard/auxiliary_files/energy_intensity.csv') # r stands for raw string, only matters if the path is on Windows

In [None]:
# If you already ran store_expanded_labeled_trips.ipynb already and want to save time vs running the cell below
%store -r expanded_labeled_trips

In [None]:
'''import database_related_functions as drf  # all the emission server functions for this notebook are in here.
user_list, os_map, uuid_program_map = drf.get_participants_programs_and_operating_systems()
#print(len(user_list), len(os_map), len(uuid_program_map))

# Takes ~ 1 min 45 s to 2 min 45 s on Macbook Pro for all ceo data up to May 2022.
expanded_labeled_trips = drf.get_expanded_labeled_trips(user_list)
expanded_labeled_trips['os'] = expanded_labeled_trips.user_id.map(os_map)
expanded_labeled_trips['program'] = expanded_labeled_trips['user_id'].map(uuid_program_map)

expanded_labeled_trips = expanded_labeled_trips.drop(labels = ['source', 'end_fmt_time', 'end_loc', 'raw_trip',
    'start_fmt_time', 'start_loc','start_local_dt_year', 'start_local_dt_month', 'start_local_dt_day',
    'start_local_dt_hour', 'start_local_dt_minute', 'start_local_dt_second',
    'start_local_dt_weekday', 'start_local_dt_timezone',
    'end_local_dt_year', 'end_local_dt_month', 'end_local_dt_day',
    'end_local_dt_hour', 'end_local_dt_minute', 'end_local_dt_second',
    'end_local_dt_weekday', 'end_local_dt_timezone'], axis = 1)

expanded_labeled_trips['distance_miles'] = expanded_labeled_trips.distance*METERS_TO_MILES

# Group together the prepilot participants
prepilot_list = ['84Q9SsrH','cwZazZLJ','CudLAeg8','sxxcLqbK','Q8T7QTXK','5KEGHHuf','e9MaNVU7','7c797MRD','rhBZukxY','k36cxmfA','FmxVf8u6','F3jxHLSW']
expanded_labeled_trips['program'] = expanded_labeled_trips.program.replace(prepilot_list, "prepilot")'''

In [None]:
# Filter the labeled trips with the helper, keeping trips that were labeled "not a trip".
expanded_labeled_trips = hf.drop_unwanted_trips(expanded_labeled_trips, drop_not_a_trip=False)
# If you want to double check whether you included not a trip: 'not_a_trip' in expanded_labeled_trips.mode_confirm.unique()

# energy_dict is otherwise only created further down the notebook, so on a fresh kernel
# (Restart & Run All) the call below failed with a NameError. Build it here from df_EI,
# which is loaded in the first code cell.
# NOTE(review): a later cell also adds a "Car, sensed" entry to energy_dict; confirm that
# get_primary_modes does not need that entry.
energy_dict = cm_handling.get_energy_dict(df_EI)
expanded_labeled_trips = hf.get_primary_modes(expanded_labeled_trips,energy_dict,MODE_MAPPING_DICT)

In [None]:
# Assign each trip to a cluster. Takes 13 minutes on all ceo up to May.
# First you need to find the clusters with build_label_model.py.
# (the script in e-mission-server that does the greedy similarity binning)
# source .../e-mission-py.bash  bin/build_label_model.py -a   (-a is for all users)

# start with the dataframe of the trips that we are interested in.
############## Make sure that this is a precomputation if you decide to use spatial covariance.###############

#expanded_labeled_trips = expanded_labeled_trips.set_index('_id')

# Defaults for trips that are never matched to a cluster.
expanded_labeled_trips['cluster_id'] = ['0']*len(expanded_labeled_trips)
# One list per row, so that rows do not all share a single list object.
expanded_labeled_trips['trip_neighbors'] = [['0'] for _ in range(len(expanded_labeled_trips))]
# Integer default: a string default left the column with mixed str/int values, which breaks
# numeric comparisons such as cluster_size > 1 and the histogram at the end of this cell.
expanded_labeled_trips['cluster_size'] = 0

for user_id in expanded_labeled_trips.user_id.unique():
    # get the cluster for this user
    user_trip_clusters = eamtm.load_model(user_id, eamum.ModelType.GREEDY_SIMILARITY_BINNING, eamtm.ModelStorage.DOCUMENT_DATABASE)
    if user_trip_clusters is not None:
        for cluster_number in user_trip_clusters.keys():
            # want trips such that they are in the cluster and in the dataframe.
            trips_in_cluster = user_trip_clusters[cluster_number]['trip_ids']
            relevant_trip_indices = expanded_labeled_trips[expanded_labeled_trips['_id'].isin(trips_in_cluster)].index

            trips_in_cluster_and_df = expanded_labeled_trips.loc[relevant_trip_indices]['_id']
            cluster_members = set(trips_in_cluster_and_df)
            cluster_label = str(user_id) + '_' + cluster_number  # cluster number is a string

            # Pair every row index with that row's OWN trip id. Zipping the indices with the
            # model's full trip_ids list (different order, possibly different length) removed
            # the wrong id below, which left some trips listed among their own neighbors.
            for idx, trip_id in zip(relevant_trip_indices, trips_in_cluster_and_df):
                # Set the cluster label and size for the current trip.
                expanded_labeled_trips.loc[idx,'cluster_id'] = cluster_label
                expanded_labeled_trips.loc[idx,'cluster_size'] = len(trips_in_cluster_and_df)

                # set a trip's neighbors as all cluster members excluding itself.
                expanded_labeled_trips.at[idx,'trip_neighbors'] = cluster_members.difference({trip_id})
    else:
        # No stored model for this user.
        # NOTE(review): every such trip gets the same cluster_id '1' (shared across users)
        # while cluster_size is 1; cells that count rows per cluster_id will see one big
        # cluster here. Confirm that this is intended.
        user_index = expanded_labeled_trips[expanded_labeled_trips.user_id == user_id].index
        expanded_labeled_trips.loc[user_index, 'cluster_id'] = '1'
        expanded_labeled_trips.loc[user_index, 'cluster_size'] = 1

print("Histogram of cluster size:")
expanded_labeled_trips.cluster_size.hist(bins=30); plt.show()

In [None]:
# extra packages I installed but ultimately didn't use. # shapely; geopandas; pysal

In [None]:
# To save time if you want to run this notebook again:
#all_ceo_with_clusters_expanded_labeled_trips = expanded_labeled_trips.copy()
#%store all_ceo_with_clusters_expanded_labeled_trips

In [None]:
%store -r all_ceo_with_clusters_expanded_labeled_trips
expanded_labeled_trips = all_ceo_with_clusters_expanded_labeled_trips.copy()
del all_ceo_with_clusters_expanded_labeled_trips

In [None]:
# A look at how many trips are in clusters rather than just one time trips.
# A trip counts as clustered when its cluster_size is greater than 1; every other trip is counted as a single trip.
number_of_clustered_trips = len(expanded_labeled_trips[expanded_labeled_trips.cluster_size > 1])
number_of_single_trips = len(expanded_labeled_trips)-number_of_clustered_trips

# Prints the clustered count first, then the single trip count.
print(number_of_clustered_trips, number_of_single_trips)

In [None]:
# Load unit_distance_MCS.csv, indexed by its "moment" column.
# NOTE(review): presumably the moments of a per-unit-distance error distribution — confirm against the file.
unit_dist_MCS_df = pd.read_csv("unit_distance_MCS.csv").set_index("moment")
# Build the energy dictionary from the energy intensity table (df_EI) loaded in the first code cell.
# Later cells look it up by mode name (e.g. "Gas Car, drove alone").
energy_dict = cm_handling.get_energy_dict(df_EI)

In [None]:
# Get the confusion matrices and then the EI moments from those.
# Both matrices are indexed by their gt_mode column.
android_confusion = pd.read_csv("android_confusion.csv").set_index('gt_mode')
ios_confusion = pd.read_csv("ios_confusion.csv").set_index('gt_mode')

# NOTE(review): collapsing "Train" into ["Train"] with no columns looks like a no-op grouping —
# confirm what collapse_confusion_matrix does with these arguments.
android_confusion = cm_handling.collapse_confusion_matrix(android_confusion, rows_to_collapse={"Train": ["Train"]}, columns_to_collapse={})
ios_confusion = cm_handling.collapse_confusion_matrix(ios_confusion, rows_to_collapse={"Train": ["Train"]}, columns_to_collapse={})

# Add an energy intensity entry for sensed car trips, computed from the constants in the first code cell.
sensed_car_EI = hf.find_sensed_car_energy_intensity(energy_dict, ECAR_PROPORTION, DROVE_ALONE_TO_SHARED_RIDE_RATIO)
energy_dict.update({"Car, sensed": sensed_car_EI})
# Trip distance converted with METERS_TO_MILES.
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips.distance*METERS_TO_MILES
# Passed to get_EC.compute_all_EC_values in the following cells; fixed at zero here.
EI_length_cov = 0

In [None]:
# if you forget this step, the error for expected may be different,
# since you might be relying on a different saved version of the EI_moments_dataframe
# Recompute the EI moments dataframes from the confusion matrices for each operating system.
android_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(android_confusion,energy_dict)
ios_EI_moments_df = cm_handling.get_conditional_EI_expectation_and_variance(ios_confusion,energy_dict)
# Lookup from operating system name to its moments dataframe; passed to the plotting helper later on.
os_EI_moments_map = {'ios': ios_EI_moments_df, 'android': android_EI_moments_df}
# Later cells read the 'expected', 'user_labeled', 'predicted' and 'confusion_var' columns of this dataframe.
energy_consumption_df = get_EC.compute_all_EC_values(expanded_labeled_trips,unit_dist_MCS_df,energy_dict,android_EI_moments_df,ios_EI_moments_df, \
    EI_length_cov, print_info=False)

In [None]:
# Demonstration of finding a spatial covariance for vail.
# Only the trips whose program is 'vail' are used, and the statistic is computed on the 'user_labeled' column.
get_EC.spatial_autocov_based_on_clusters(energy_consumption_df[energy_consumption_df.program == 'vail'],'user_labeled', print_statistics=True)

### Find the spatial autocovariance specific to each user.

In [None]:
# Takes ~1 minute on all ceo up to May 2022.
# Builds two dictionaries keyed by user id: the spatial autocovariance and the Moran's I
# returned by get_EC.spatial_autocov_based_on_clusters for the 'expected' column.
user_spatial_cov_map = {}
user_morans_I_map = {}
for user in energy_consumption_df.user_id.unique():
    user_df = energy_consumption_df[energy_consumption_df.user_id == user].copy()
    # Users with fewer than two trips are given zero for both statistics instead of calling the helper.
    if len(user_df) < 2: 
        user_spatial_cov_map[user] = 0
        morans_I = 0
    else:
        user_spatial_cov_map[user], morans_I = get_EC.spatial_autocov_based_on_clusters(user_df, 'expected')
    user_morans_I_map[user] = morans_I

In [None]:
plt.hist(user_morans_I_map.values(), bins = 50)
plt.show()

In [None]:
# Double checking that I didn't include a trip in the list of its neighbors
# Collects the _id of every trip that appears in its own trip_neighbors entry.
mistaken_extra_ids = []
for i,ct in energy_consumption_df.iterrows():
    if ct['_id'] in ct['trip_neighbors']:
        print('trip should not be counted as one of its neighbors.')
        mistaken_extra_ids.append(ct['_id'])

# looks like a few slipped through somehow
# NOTE(review): this can happen if trip_neighbors was built by removing an id other than the
# row's own _id (e.g. a misaligned index / trip id pairing when the clusters were assigned).
# Worth re-checking after regenerating the stored dataframe.

In [None]:
# What happens if we use 1 spatial covariance for all users?
# The cluster sizes are counted once with value_counts instead of re-filtering the whole
# dataframe for every cluster id (that version took about 1 min 15 s).
DATASET_WIDE_SPATIAL_COV = 15.504  # I calculated a CEO dataset wide spatial cov of 15.504

cov_sum = 0
for cluster_id, cluster_size in energy_consumption_df.cluster_id.value_counts().items():
    if cluster_size > 1:
        # n**2 - n ordered pairs of distinct trips in a cluster of n trips.
        cov_sum += DATASET_WIDE_SPATIAL_COV*(cluster_size**2 - cluster_size)

# Aggregate variance = sum of the trip level variances + the spatial covariance term.
larger_sd = np.sqrt(energy_consumption_df.confusion_var.sum() + cov_sum)
print(f"aggregate standard deviation: {larger_sd}")
error = energy_consumption_df.expected.sum() - energy_consumption_df.user_labeled.sum()
print(f"Error for expected: {error:.2f}")

In [None]:
# What happens if Moran's I is maxed out for each participant, leading to just adding the variance for each user in each cluster?
# Concretely, instead of using the autocovariance that I calculated for each user, I would just use the variance of expected EI estimates calculated for each user.
from math import factorial
def n_choose_r(n,r):
    '''
    Number of ways to choose r items out of n (the binomial coefficient).

    n: non-negative integer, the number of items to choose from.
    r: integer with 0 <= r <= n, the number of items chosen.

    Returns: the exact count as an int. Integer division is used because the product of
        factorials always divides factorial(n); true division returned a float, which
        loses precision once the result exceeds 2**53.
    Raises: ValueError (from factorial) if r > n or either argument is negative.
    '''
    return factorial(n)//(factorial(r)*factorial(n-r))

# Variance (np.var) of each user's 'expected' values, keyed by user id.
user_variance_map = {}
for user in energy_consumption_df.user_id.unique():
    user_df = energy_consumption_df[energy_consumption_df.user_id == user].copy()
    user_variance_map[user] = np.var(user_df.expected)

# Covariance term: for every cluster with more than one trip, add the user's variance once
# per ordered pair of distinct trips in that cluster.
max_cov_sum = 0
for user in energy_consumption_df.user_id.unique():
    # Get the trips associated with this user.
    user_df = energy_consumption_df[energy_consumption_df.user_id == user].copy()
    for cluster_id in user_df.cluster_id.unique():
        cluster_size = len(user_df[user_df.cluster_id == cluster_id])
        if cluster_size > 1:
            max_cov_sum += 2*user_variance_map[user]*n_choose_r(cluster_size,2)  # I later switched to n**2 - n.  2* n_choose_2 == n**2 - n. 

# Aggregate variance = sum of the trip level variances + the covariance term above.
max_spatial_cov_sd = np.sqrt(energy_consumption_df.confusion_var.sum() + max_cov_sum)
print(f"aggregate standard deviation: {max_spatial_cov_sd}")
error = energy_consumption_df.expected.sum() - energy_consumption_df.user_labeled.sum()
print(f"Error for expected: {error:.2f}")

In [None]:
# Same comparison as above, but using the per-user spatial covariances in user_spatial_cov_map.
# The helper's return value is treated as a variance (its square root is taken here).
larger_sd = np.sqrt(get_EC.compute_variance_including_spatial_cov_for_trips_dataframe(energy_consumption_df,user_spatial_cov_map))
print(f"aggregate standard deviation: {larger_sd}")
error = energy_consumption_df.expected.sum() - energy_consumption_df.user_labeled.sum()
print(f"Error for expected: {error:.2f}")

In [None]:
# Energy consumption values computed from each trip's primary mode; used below with the
# 'aggregate_primary_mode_distances' variance method.
energy_consumption_from_primary_mode_df = get_EC.compute_all_EC_values_from_primary_mode(expanded_labeled_trips, unit_dist_MCS_df, energy_dict, android_EI_moments_df,ios_EI_moments_df)

In [None]:
# Run the plotting helper once per variance method and keep each returned map for the comparison table below.
# NOTE(review): judging by the variable names, each map holds the number of standard deviations per program — confirm in helper_functions.
program_n_sd_map_agg_distance = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df, variance_method='aggregate_section_distances', user_spatial_cov_map = user_spatial_cov_map)
# This call is the only one that uses the primary mode based dataframe.
program_n_sd_map_agg_primary_mode_distance = hf.plot_estimates_with_sd_by_program(energy_consumption_from_primary_mode_df,os_EI_moments_map,unit_dist_MCS_df, variance_method='aggregate_primary_mode_distances', user_spatial_cov_map = user_spatial_cov_map)
program_n_sd_map_spatial_cov = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
program_n_sd_map_individual = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df, variance_method='independent', user_spatial_cov_map = user_spatial_cov_map)

In [None]:
# One row per variance method, in the order the maps are listed; the columns come from the keys of the maps.
n_sd_df = pd.DataFrame([program_n_sd_map_agg_distance, program_n_sd_map_agg_primary_mode_distance, program_n_sd_map_spatial_cov,program_n_sd_map_individual])
# Give the rows readable names and print the table as LaTeX source.
print(n_sd_df.rename(index = {0:'Aggregate Section Distance', 1: 'Aggregate Primary Mode Distance', 2:'Spatial Covariance', 3:'Independent Trips'}).to_latex())

### Plot estimates plus or minus one standard deviation for each program
The left dots in each plot are the user-labeled values.
The right three dots in each plot are the expected (i.e., confusion-matrix-based) values.

In [None]:
# This cell plots the user labeled and expected aggregate energy consumptions on the left and right, respectively.
# It finds aggregate variance by summing individual variances and adding a spatial covariance term.

program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_df,os_EI_moments_map,unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

## Try with Bayes update. 
### How does spatial covariance perform with different assumed mode distributions than that of MobilityNet?

In [None]:
# Prior probabilities fixed by hand for two modes; the remaining probability mass is split
# evenly over every other mode in the android confusion matrix index.
prior_probs_prespecified = {"Car, sensed": 0.7, "Pilot ebike": 0.13}
prior_probs = prior_probs_prespecified.copy()
# NOTE(review): this count assumes every prespecified mode is present in android_confusion.index,
# and the division below fails if there are no other modes — confirm for the data in use.
n_other_modes = len(android_confusion.index) - len(prior_probs_prespecified)
probability_remaining = 1 - sum(prior_probs_prespecified.values())
prior_probs.update({x: probability_remaining/n_other_modes for x in android_confusion.index if x not in prior_probs_prespecified.keys()})
#prior_probs = {x: 1/len(android_confusion.index) for x in android_confusion.index} # if you want a uniform prior.

# The same prior (built from the android index) is used for both operating systems.
android_EI_moments_with_Bayes_update_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(android_confusion,energy_dict, prior_probs)
ios_EI_moments_with_Bayes_update_df = cm_handling.get_Bayesian_conditional_EI_expectation_and_variance(ios_confusion,energy_dict, prior_probs)
os_EI_moments_with_Bayes_update_map = {'ios': ios_EI_moments_with_Bayes_update_df, 'android': android_EI_moments_with_Bayes_update_df}
energy_consumption_with_Bayes_update_df = get_EC.compute_all_EC_values(expanded_labeled_trips,unit_dist_MCS_df,energy_dict,\
    android_EI_moments_with_Bayes_update_df,\
    ios_EI_moments_with_Bayes_update_df, \
    EI_length_cov, print_info=False)

In [None]:
# Find a spatial autocovariance specific to each user. Takes ~1 minute on all ceo up to May 2022.
# NOTE: this overwrites the user_spatial_cov_map and user_morans_I_map computed earlier from
# energy_consumption_df; from here on they refer to the Bayes-updated estimates.
user_spatial_cov_map = {}
user_morans_I_map = {}
for user in energy_consumption_with_Bayes_update_df.user_id.unique():
    user_df = energy_consumption_with_Bayes_update_df[energy_consumption_with_Bayes_update_df.user_id == user].copy()
    # Users with fewer than two trips are given zero for both statistics instead of calling the helper.
    if len(user_df) < 2: 
        user_spatial_cov_map[user] = 0
        morans_I = 0
    else:
        user_spatial_cov_map[user], morans_I = get_EC.spatial_autocov_based_on_clusters(user_df, 'expected')
    user_morans_I_map[user] = morans_I

In [None]:
program_percent_errors = pd.DataFrame(hf.get_program_percent_error_map(energy_consumption_with_Bayes_update_df), index=[0]).round(2)
print(f"percent error for each program:")
program_percent_errors

In [None]:
# with prior of 0.7 car, 0.13 ebike.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
# with prior of 0.7 car, 0.13 ebike. didn't update spatial cov map from uniform version
# NOTE(review): the code is identical to the previous cell. The described result is only
# reproduced if user_spatial_cov_map still holds values computed under a uniform prior,
# i.e. the per-user spatial covariance cell was not re-run after changing prior_probs.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
# with uniform prior:
# NOTE(review): nothing in this cell selects the prior. A uniform prior is only in effect if
# the commented-out uniform prior_probs line in the Bayes update cell is enabled and the
# cells between there and here are re-run first.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
# with uniform prior and aggregate distance.
# NOTE(review): nothing in this cell selects the prior; it depends on which prior_probs was
# used when the Bayes update cell last ran. Also, earlier calls use the variance_method values
# 'aggregate_section_distances' and 'aggregate_primary_mode_distances' — confirm that
# 'aggregate_distance' is still accepted by hf.plot_estimates_with_sd_by_program.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='aggregate_distance', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
# with car 0.7, ebike 0.13 and aggregate distance.
# NOTE(review): earlier calls use the variance_method values 'aggregate_section_distances' and
# 'aggregate_primary_mode_distances' — confirm that 'aggregate_distance' is still accepted by
# hf.plot_estimates_with_sd_by_program.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='aggregate_distance', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
# with pcar 0.5, p ebike 0.13:
# NOTE(review): the prior set in the Bayes update cell uses 0.7 for "Car, sensed". To reproduce
# this result, change that value to 0.5 and re-run the cells between there and here first.
program_n_sd_map = hf.plot_estimates_with_sd_by_program(energy_consumption_with_Bayes_update_df,os_EI_moments_with_Bayes_update_map, unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

### How does spatial covariance perform for a mostly ebike dataset, using the mobilitynet implicit prior?

In [None]:
def construct_mostly_ebike_df(df, random_state=None):
    '''
    Builds a subset of df in which pilot ebike trips are over-represented.

    len(df) trip ids are drawn (with replacement) from the trips whose mode_confirm is
    'pilot_ebike' and len(df)//2 trip ids are drawn (with replacement) from all other trips.
    The result contains every row of df whose _id was drawn at least once. Because rows are
    selected by membership, an id that was drawn several times still appears only once, so
    the result has at most len(df) rows and the ebike share is not fixed exactly.

    df: a trips dataframe with '_id' and 'mode_confirm' columns. It is not modified.
    random_state: optional seed for a reproducible sample. With the default (None) the
        global numpy random state is used, exactly as before this parameter existed.

    Returns: a new dataframe with the selected rows, in the row order of df.
    Raises: ValueError (from numpy) if ids have to be drawn from an empty group of trips.
    '''
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    is_ebike = df.mode_confirm == 'pilot_ebike'
    n_trips_over_2 = len(df)//2

    # The ebike ids are drawn first and the other ids second, to keep the draw order stable.
    ebike_trip_list = sampler.choice(df.loc[is_ebike, '_id'], len(df))
    other_trip_list = sampler.choice(df.loc[~is_ebike, '_id'], n_trips_over_2)
    sampled_trip_ids = np.concatenate((ebike_trip_list, other_trip_list))

    return df.loc[df._id.isin(sampled_trip_ids)].copy()


# NOTE(review): this calls the construct_mostly_ebike_df from helper_functions (hf), not the
# function defined directly above in this cell — confirm which version is intended.
mostly_ebike_df = hf.construct_mostly_ebike_df(energy_consumption_df)

In [None]:
# NOTE: this overwrites user_spatial_cov_map again, now computed from mostly_ebike_df.
# The second value returned by the helper is discarded.
user_spatial_cov_map, _ = get_EC.get_user_spatial_cov_map(mostly_ebike_df, estimation_method='expected')

In [None]:
program_n_sd_map = hf.plot_estimates_with_sd_by_program(mostly_ebike_df, os_EI_moments_map, unit_dist_MCS_df, variance_method='spatial_cov', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

In [None]:
program_n_sd_map = hf.plot_estimates_with_sd_by_program(mostly_ebike_df,os_EI_moments_map, unit_dist_MCS_df, variance_method='aggregate_distance', user_spatial_cov_map = user_spatial_cov_map)
print(program_n_sd_map)

## Looking for cases where mode inference errors are repeated.

#### Let's start by looking for ebike mispredicted as car.

In [None]:
ebike_but_predicted_as_car = energy_consumption_df[(energy_consumption_df.mode_confirm == 'pilot_ebike') & (energy_consumption_df.primary_mode == 'car')]

In [None]:
cluster_id = ebike_but_predicted_as_car.cluster_id.iloc[0]
ebike_but_predicted_as_car[ebike_but_predicted_as_car.cluster_id == cluster_id][['mode_confirm','primary_mode','distance_miles','expected','user_labeled']]
# even though these were the same mistake in the same cluster, the trips were different distances and so the energy consumption estimates were not very similar.

In [None]:
# Lets try looking at a larger cluster. Turns out that large cluster does not guarantee a lot of repeated ebike car mistakes.
# large_clusters holds one cluster_id per trip (not one per cluster), so iloc[1] is the
# cluster of the second matching trip and may be the same cluster as iloc[0].
large_clusters = ebike_but_predicted_as_car[ebike_but_predicted_as_car.cluster_size > 50].cluster_id
ebike_but_predicted_as_car[ebike_but_predicted_as_car.cluster_id == large_clusters.iloc[1]][['mode_confirm','primary_mode','distance_miles','expected','user_labeled']]

#### Let's pick a cluster and look for common mistakes.

In [None]:
large_clusters = energy_consumption_df[energy_consumption_df.cluster_size > 10].cluster_id
energy_consumption_df[energy_consumption_df.cluster_id == large_clusters.iloc[0]][['mode_confirm','primary_mode','distance_miles','expected','predicted','user_labeled']]

In [None]:
large_clusters = energy_consumption_df[energy_consumption_df.cluster_size > 10].cluster_id
energy_consumption_df[(energy_consumption_df.cluster_id == large_clusters.iloc[0]) & (energy_consumption_df.primary_mode == 'car')][['mode_confirm','primary_mode','distance_miles','expected','predicted','user_labeled']]

In [None]:
# lots of not a trips.
# large_clusters holds one cluster_id per trip whose cluster_size is above 50.
large_clusters = energy_consumption_df[energy_consumption_df.cluster_size > 50].cluster_id
# cols_of_interest is reused by a later cell.
cols_of_interest = ['mode_confirm','primary_mode','distance_miles','expected','predicted','user_labeled']
# NOTE(review): position 13000 is hard-coded; it raises an IndexError when fewer trips match
# and points at a different cluster whenever the data changes.
energy_consumption_df[(energy_consumption_df.cluster_id == large_clusters.iloc[13000]) & (energy_consumption_df.primary_mode == 'no_sensed')][cols_of_interest]

In [None]:
def calculate_LISAs(df, col_of_interest):
    '''
    Computes a Local Indicator of Spatial Association (LISA) for the col_of_interest. Specifically, it finds local Moran's I for each trip.
    See https://geographicdata.science/book/notebooks/07_local_autocorrelation.html for details.
    The weights are based on cluster membership. A trip's neighbors are the other rows of df (different _id)
    that have the same cluster_id. Every neighbor gets the weight 1/(number of neighbors); all other trips get weight 0.

    df: a trips dataframe that already has assigned clusters for each trip id.
        It needs the columns '_id', 'cluster_id' and col_of_interest.
    col_of_interest: a string label for the column to compute the local statistic from.
        eg, use 'expected' for sensing expected energy consumption estimates.

    Returns: a list with one local Moran's I value per row of df, in row order.
        Trips without any neighbor in df get 0. The mean and the variance used for the
        deviations and the scaling are taken over all rows of df.
        If col_of_interest is constant, the variance is zero and the values for trips with
        neighbors come out as nan.
    '''
    xbar = np.mean(df[col_of_interest])
    var_x = np.var(df[col_of_interest])

    local_I = []
    for _, trip in df.iterrows():
        # The stored trip_neighbors can contain trips outside of the timeframe of interest,
        # so the neighbors are looked up in df itself.
        neighbors_df = df[(df.cluster_id == trip['cluster_id']) & (df._id != trip['_id'])]

        if len(neighbors_df) == 0:
            local_I.append(0)
            continue

        # for each neighbor, multiply trip i's deviation from the mean by the neighbors deviation from the mean.
        # then sum:  ( sum_{j=1:n} w_{ij} (x_i - xbar)(x_j - xbar) )
        trip_i_deviation = trip[col_of_interest] - xbar   # scalar
        neighbor_deviations = neighbors_df[col_of_interest] - xbar  # array

        w = 1/len(neighbors_df) # normalize the weights
        local_I.append(w*sum(trip_i_deviation*neighbor_deviations)/var_x)

    return local_I

In [None]:
# One cluster_id per trip whose cluster_size is above 50.
large_clusters = energy_consumption_df[energy_consumption_df.cluster_size > 50].cluster_id

# Pick the user who owns the cluster at the hard-coded position 13000 and compute the local I
# values over all of that user's trips.
# NOTE(review): position 13000 raises an IndexError when fewer trips match.
no_sensed_user = energy_consumption_df[(energy_consumption_df.cluster_id == large_clusters.iloc[13000])].user_id.iloc[0]
trips = energy_consumption_df[energy_consumption_df.user_id == no_sensed_user].copy()
LISAs_for_no_sensed_user = calculate_LISAs(trips,'expected')

# calculate_LISAs returns one value per row in row order, so it can be attached as a column.
trips['local_I'] = LISAs_for_no_sensed_user
# cols_of_interest comes from an earlier cell.
trips[trips.cluster_id == large_clusters.iloc[13000]][cols_of_interest + ['cluster_id','local_I']] 
#trips[trips.local_I > 10].cluster_id

#spatial_autocov_based_on_clusters(trips,'expected')

In [None]:
trips.cluster_size.hist(); plt.show()

In [None]:
trips.local_I.hist(bins=40); plt.show()

In [None]:
# Local Moran's I values for the vail trips, computed separately for three energy consumption columns.
# Each call applies the same program filter to energy_consumption_df.
local_I_expected = calculate_LISAs(energy_consumption_df[energy_consumption_df.program == 'vail'],'expected')
local_I_predicted = calculate_LISAs(energy_consumption_df[energy_consumption_df.program == 'vail'],'predicted')
local_I_user_labeled = calculate_LISAs(energy_consumption_df[energy_consumption_df.program == 'vail'],'user_labeled')

### A look at the distribution of local I values that we get.

In [None]:
import seaborn
ax = seaborn.kdeplot(local_I_expected)
seaborn.rugplot(local_I_expected, ax=ax) # adds bars for each observation

In [None]:
ax = seaborn.kdeplot(local_I_user_labeled)
seaborn.rugplot(local_I_user_labeled, ax=ax) # adds bars for each observation

### Sanity check for the spatial autocorrelation with synthetic data.


In [None]:
# columns: trip id, cluster id, primary mode, distance, energy consumption
# make a dataframe.
# Synthetic trips: 70% work, 20% grocery and 10% friend trips, one cluster per trip purpose.
n_trips = 10**3
n_work_trips = int(0.70*n_trips)
n_grocery_trips = int(0.20*n_trips)
n_friend_trips = int(0.10*n_trips)

work_trip_length = 10 # miles
grocery_trip_length = 5
friend_trip_length = 20

fake_trips_df = pd.DataFrame({
    "_id": list(range(n_trips)),
    "cluster_id": ["0"]*n_work_trips + ["1"]*n_grocery_trips + ["2"]*n_friend_trips,
    "primary_mode": ["car"]*n_trips,
    "distance": [work_trip_length]*n_work_trips + [grocery_trip_length]*n_grocery_trips + [friend_trip_length]*n_friend_trips
})

# NOTE: this replaces the "Car, sensed" entry of the shared energy_dict for the rest of the notebook.
r = 1 
car_load_factor = (r+1)/(r+0.5)
drove_alone_EI = energy_dict["Gas Car, drove alone"]
energy_dict.update({"Car, sensed": drove_alone_EI/car_load_factor})

fake_trips_df['predicted_EC'] = energy_dict['Car, sensed']*fake_trips_df['distance']
fake_trips_df['expected_EC'] = 1.1*fake_trips_df['distance']

# The function is only available through the get_EC module in this notebook; the bare name
# raised a NameError on a fresh kernel.
# NOTE(review): confirm that it only needs the columns that fake_trips_df has.
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'expected_EC', print_statistics=True)
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'predicted_EC', print_statistics=True)
print(f"variance for predicted EC: {np.var(fake_trips_df.predicted_EC)}")

In [None]:
fake_trips_df['local_I'] = calculate_LISAs(fake_trips_df,'predicted_EC')
fake_trips_df[fake_trips_df.cluster_id == "2"]

In [None]:
# columns: trip id, cluster id, primary mode, distance, energy consumption
# make a dataframe.
# Synthetic trips: 50% work, 30% grocery and 20% friend trips, one cluster per trip purpose.
n_trips = 10**3
n_work_trips = int(0.50*n_trips)
n_grocery_trips = int(0.30*n_trips)
n_friend_trips = int(0.20*n_trips)

work_trip_length = 10 # miles
grocery_trip_length = 5
friend_trip_length = 20

fake_trips_df = pd.DataFrame({
    "_id": list(range(n_trips)),
    "cluster_id": ["0"]*n_work_trips + ["1"]*n_grocery_trips + ["2"]*n_friend_trips,
    "primary_mode": ["car"]*n_trips,
    "distance": [work_trip_length]*n_work_trips + [grocery_trip_length]*n_grocery_trips + [friend_trip_length]*n_friend_trips
})

fake_trips_df['predicted_EC'] = energy_dict['Car, sensed']*fake_trips_df['distance']
# The function is only available through the get_EC module in this notebook; the bare name
# raised a NameError on a fresh kernel.
# NOTE(review): confirm that it only needs the columns that fake_trips_df has.
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'predicted_EC', print_statistics=True)
print(f"variance for predicted EC: {np.var(fake_trips_df.predicted_EC)}")

In [None]:
# columns: trip id, cluster id, primary mode, distance, energy consumption
# make a dataframe.
# Synthetic trips: 50% work, 30% grocery (ebike), 10% friend and 10% recreation trips.
n_trips = 10**3

n_work_trips = int(0.50*n_trips)
n_grocery_trips = int(0.30*n_trips)
n_friend_trips = int(0.10*n_trips)
n_recreation_trips = int(0.10*n_trips)

work_trip_length = 10 # miles
grocery_trip_length = 5
friend_trip_length = 20
recreation_trip_length = 40

fake_trips_df = pd.DataFrame({
    "_id": list(range(n_trips)),
    "cluster_id": ["0"]*n_work_trips + ["1"]*n_grocery_trips + ["2"]*n_friend_trips + ["3"]*n_recreation_trips,
    "primary_mode": ["Car, sensed"]*n_work_trips + ["Pilot ebike"]*n_grocery_trips + ["Car, sensed"]*n_friend_trips + ["Car, sensed"]*n_recreation_trips,
    "distance": [work_trip_length]*n_work_trips + [grocery_trip_length]*n_grocery_trips + [friend_trip_length]*n_friend_trips + [recreation_trip_length]*n_recreation_trips
})

# Energy consumption = energy intensity of the trip's mode (looked up in energy_dict) times the distance.
fake_trips_df['predicted_EC'] = fake_trips_df.primary_mode.map(energy_dict)*fake_trips_df['distance']
# The function is only available through the get_EC module in this notebook; the bare name
# raised a NameError on a fresh kernel.
# NOTE(review): confirm that it only needs the columns that fake_trips_df has.
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'predicted_EC', print_statistics=True)
print(f"variance for predicted EC: {np.var(fake_trips_df.predicted_EC)}")

In [None]:
# If highly homogeneous:
# columns: trip id, cluster id, primary mode, distance, energy consumption
# make a dataframe.
# Synthetic trips: 90% work, 5% grocery (ebike), 5% friend and no recreation trips.
n_trips = 10**3

n_work_trips = int(0.90*n_trips)
n_grocery_trips = int(0.05*n_trips)
n_friend_trips = int(0.05*n_trips)
n_recreation_trips = int(0*n_trips)

work_trip_length = 10 # miles
grocery_trip_length = 5
friend_trip_length = 20
recreation_trip_length = 40

fake_trips_df = pd.DataFrame({
    "_id": list(range(n_trips)),
    "cluster_id": ["0"]*n_work_trips + ["1"]*n_grocery_trips + ["2"]*n_friend_trips + ["3"]*n_recreation_trips,
    "primary_mode": ["Car, sensed"]*n_work_trips + ["Pilot ebike"]*n_grocery_trips + ["Car, sensed"]*n_friend_trips + ["Car, sensed"]*n_recreation_trips,
    "distance": [work_trip_length]*n_work_trips + [grocery_trip_length]*n_grocery_trips + [friend_trip_length]*n_friend_trips + [recreation_trip_length]*n_recreation_trips
})

# Energy consumption = energy intensity of the trip's mode (looked up in energy_dict) times the distance.
fake_trips_df['predicted_EC'] = fake_trips_df.primary_mode.map(energy_dict)*fake_trips_df['distance']
# The function is only available through the get_EC module in this notebook; the bare name
# raised a NameError on a fresh kernel.
# NOTE(review): confirm that it only needs the columns that fake_trips_df has.
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'predicted_EC', print_statistics=True)
print(f"variance for predicted EC: {np.var(fake_trips_df.predicted_EC)}")

In [None]:
# columns: trip id, cluster id, primary mode, distance, energy consumption
# make a dataframe.
# Synthetic trips: 30% work, 50% grocery (ebike), 10% friend and 10% recreation trips.
n_trips = 10**3

n_work_trips = int(0.30*n_trips)
n_grocery_trips = int(0.50*n_trips)
n_friend_trips = int(0.10*n_trips)
n_recreation_trips = int(0.10*n_trips)

work_trip_length = 10 # miles
grocery_trip_length = 5
friend_trip_length = 20
recreation_trip_length = 40

fake_trips_df = pd.DataFrame({
    "_id": list(range(n_trips)),
    "cluster_id": ["0"]*n_work_trips + ["1"]*n_grocery_trips + ["2"]*n_friend_trips + ["3"]*n_recreation_trips,
    "primary_mode": ["Car, sensed"]*n_work_trips + ["Pilot ebike"]*n_grocery_trips + ["Car, sensed"]*n_friend_trips + ["Car, sensed"]*n_recreation_trips,
    "distance": [work_trip_length]*n_work_trips + [grocery_trip_length]*n_grocery_trips + [friend_trip_length]*n_friend_trips + [recreation_trip_length]*n_recreation_trips
})

# Energy consumption = energy intensity of the trip's mode (looked up in energy_dict) times the distance.
fake_trips_df['predicted_EC'] = fake_trips_df.primary_mode.map(energy_dict)*fake_trips_df['distance']
# The function is only available through the get_EC module in this notebook; the bare name
# raised a NameError on a fresh kernel.
# NOTE(review): confirm that it only needs the columns that fake_trips_df has.
get_EC.spatial_autocov_based_on_clusters(fake_trips_df,'predicted_EC', print_statistics=True)
print(f"variance for predicted EC: {np.var(fake_trips_df.predicted_EC)}")

In [None]:
# Keep only the trips whose user label (mode_confirm) is one of the main modes listed here.
main_mode_confirms = ['drove_alone','shared_ride','walk','pilot_ebike','bus','bike','train','taxi','free_shuttle', 'not_a_trip']
main_modes_df = expanded_labeled_trips[expanded_labeled_trips.mode_confirm.isin(main_mode_confirms)].copy()
main_modes_df = main_modes_df[main_modes_df.mode_confirm.notna()]

# Count the trips whose sensed primary mode agrees with the user label. Three pairings are
# accepted explicitly; every other trip is compared through MODE_MAPPING_DICT.
# NOTE(review): the dictionary lookups raise a KeyError for a primary_mode or mode_confirm
# that is not a key of MODE_MAPPING_DICT — confirm that all values are covered.
match_count = 0
for _,ct in main_modes_df.iterrows():
    if (ct['primary_mode'] == 'car') and (ct['mode_confirm'] in ['shared_ride', 'taxi']):
        match_count += 1
    elif (ct['primary_mode'] == 'bicycling') and (ct['mode_confirm'] == 'pilot_ebike'):
        match_count += 1
    elif (ct['primary_mode'] == 'bus') and (ct['mode_confirm'] == 'free_shuttle'):
        match_count += 1
    elif MODE_MAPPING_DICT[ct['primary_mode']] == MODE_MAPPING_DICT[ct['mode_confirm']]:
        match_count += 1