# Post Calibration
This notebook evaluates the quality of the calibrated impedance functions.
---
1. Recalculate shortest path results using the calibrated coefficients
1. Calculate objective functions and other performance metrics
1. Get route attribute summary for the chosen/shortest/modeled routes (right now)


Still Working On
1. Look at where calibrated function did the best/worst job for both the training/testing set
1. Cluster/segment results based on loss function value?
4. Export for application in BikewaySim

In [None]:
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import networkx as nx
from stochopy.optimize import minimize
from tqdm import tqdm
import similaritymeasures
import random
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
from shapely.ops import LineString, MultiLineString

from bikewaysim.paths import config
from bikewaysim.network import modeling_turns
from bikewaysim.impedance_calibration import stochastic_optimization, speedfactor


# 

# Import Relevant Files

In [None]:
# Load the calibration network objects returned by import_calibration_network.
links, turns, length_dict, geo_dict, turn_G = stochastic_optimization.import_calibration_network(config)

# Load the trips that were prepared for calibration (dict keyed by trip id).
ready_for_calibration_fp = config['calibration_fp'] / 'ready_for_calibration.pkl'
with ready_for_calibration_fp.open('rb') as pkl_file:
    full_set = pickle.load(pkl_file)

# Iterable of (start_node, end_node) pairs that routes are later solved for.
full_ods = stochastic_optimization.match_results_to_ods(full_set)

Get shortest and chosen stats first

In [None]:
# Score every trip's chosen (map-matched) route against its shortest route and
# store the metrics on the trip's own entry in full_set.
for key, item in full_set.items():
    chosen = item['matched_edges'].values
    shortest = item['shortest_edges'].values

    # The first element of each edge row is used as the link id; links that are
    # missing from length_dict contribute a length of 0.
    chosen_total = np.sum([length_dict.get(edge[0], 0) for edge in chosen])
    shortest_total = np.sum([length_dict.get(edge[0], 0) for edge in shortest])

    item.update({
        # dividing by 5280 presumably converts feet to miles -- TODO confirm units
        'chosen_length': round(chosen_total / 5280, 2),
        'shortest_length': round(shortest_total / 5280, 2),
        'chosen_detour': round(stochastic_optimization.detour_factor(chosen, shortest, length_dict), 2),
        'shortest_jaccard': round(stochastic_optimization.jaccard_index_func(chosen, shortest, length_dict), 2),
        'shortest_buffer': round(stochastic_optimization.buffer_overlap(chosen, shortest, geo_dict), 2),
    })

# export new version
stats_fp = config['calibration_fp'] / 'ready_for_calibration_stats.pkl'
with stats_fp.open('wb') as pkl_file:
    pickle.dump(full_set, pkl_file)

In [None]:
# Folder holding one pickle per calibration run
calibration_results_dir = config['calibration_fp'] / "calibration_results"

# Keep only the runs of one model: the file stem up to the first '(' must
# match the model name exactly.
specific_model = 'calibration2_new'
calibration_result_fps = [
    fp for fp in calibration_results_dir.glob('*.pkl')
    if fp.stem.split('(')[0].strip() == specific_model
]

In [None]:
# Build a table of calibrated coefficients: one row per calibration run,
# one column per calibrated attribute.
beta_vals = {}
for idx, calibration_result_fp in enumerate(calibration_result_fps):
    with calibration_result_fp.open('rb') as pkl_file:
        calibration_result = pickle.load(pkl_file)
    run_betas = {}
    for beta_entry in calibration_result['betas_tup']:
        run_betas[beta_entry['col']] = beta_entry['beta']
    beta_vals[calibration_result_fp.stem] = run_betas
beta_vals = pd.DataFrame.from_dict(beta_vals, orient='index')
# beta_vals.std()

In [None]:
# For each calibration result: apply its coefficients to the network, re-solve
# the route for every OD pair, score the modeled route against the chosen
# route, and pickle the per-trip results under post_calibration/.
beta_vals = {}

from importlib import reload
reload(stochastic_optimization)

#NOTE TEMP: only process the first calibration result. Slicing (instead of
# indexing with [0]) avoids an IndexError when no result file matched.
calibration_result_fps = calibration_result_fps[:1]

# Make sure the output folder exists before results are written into it.
post_calibration_dir = config['calibration_fp']/'post_calibration'
post_calibration_dir.mkdir(parents=True, exist_ok=True)

for idx, calibration_result_fp in enumerate(calibration_result_fps):
    print('Calibration result',idx+1,'out of',len(calibration_result_fps))
    with calibration_result_fp.open('rb') as fh:
        calibration_result = pickle.load(fh)

    beta_vals[calibration_result_fp.stem] = {x['col']:x['beta'] for x in calibration_result['betas_tup']}

    base_impedance_col = "travel_time_min"
    betas = [x['beta'] for x in calibration_result['betas_tup']]
    print(betas)
    # Reset the network (presumably to the base impedance) before applying
    # this run's coefficients, so runs do not build on one another.
    stochastic_optimization.back_to_base_impedance(base_impedance_col,links,turns,turn_G)
    stochastic_optimization.impedance_update(betas,calibration_result['betas_tup'],
                            stochastic_optimization.link_impedance_function,
                            base_impedance_col,
                            stochastic_optimization.turn_impedance_function,
                            links,turns,turn_G)

    # Solve one route per OD pair; trips sharing an OD reuse the same result.
    modeled_results_sp = {
        (start_node,end_node): stochastic_optimization.impedance_path(turns,turn_G,links,start_node,end_node)
        for start_node, end_node in tqdm(full_ods,total=len(full_ods))
    }

    modeled_results_dict = {}
    for tripid, item in full_set.items():
        chosen = item['matched_edges'].values
        shortest = item['shortest_edges'].values
        od = (item['origin_node'],item['destination_node'])
        modeled = modeled_results_sp[od]['edge_list']

        modeled_results_dict[tripid] = {
            'modeled_edges': pd.DataFrame(modeled,columns=['linkid','reverse_link']),
            # NOTE(review): rounded to 1 decimal here but to 2 in the other cells -- confirm intent
            'modeled_length': round(np.array([length_dict.get(edge[0],0) for edge in modeled]).sum()/5280,1),
            'modeled_detour': round(stochastic_optimization.detour_factor(modeled,shortest,length_dict),2),
            'modeled_jaccard': round(stochastic_optimization.jaccard_index_func(chosen,modeled,length_dict),2),
            'modeled_buffer': round(stochastic_optimization.buffer_overlap(chosen,modeled,geo_dict),2),
            'test': stochastic_optimization.jaccard_index_total(chosen,modeled,length_dict) # returns tuple
        }

    jaccard_mean = np.array([trip['modeled_jaccard'] for trip in modeled_results_dict.values()]).mean()
    buffer_mean = np.array([trip['modeled_buffer'] for trip in modeled_results_dict.values()]).mean()

    # One row per trip holding the tuple from jaccard_index_total; presumably
    # (intersection, union) -- TODO confirm against stochastic_optimization.
    # The raw array is kept under its own name: previously it was overwritten
    # by the scalar ratio and then indexed again, which raised IndexError.
    jaccard_parts = np.array([trip['test'] for trip in modeled_results_dict.values()])
    # Pooled ratio: column-0 total over column-1 total. The (misspelled) name
    # jaccrd_total is kept because a later cell displays it.
    jaccrd_total = jaccard_parts[:,0].sum() / jaccard_parts[:,1].sum()
    # Mean of the per-trip ratios, computed from the unrounded tuples.
    jaccard_mean2 = np.sum(jaccard_parts[:,0] / jaccard_parts[:,1]) / jaccard_parts.shape[0]
    print(jaccard_mean,jaccard_mean2)

    print('Mean Jaccard Index',round(jaccard_mean,2),'Mean Buffer',round(buffer_mean,2))
    with (post_calibration_dir/(calibration_result_fp.stem+'.pkl')).open('wb') as fh:
        pickle.dump(modeled_results_dict,fh)

# get fit stats
# TODO get the length weighted versions
shortest_jaccard_mean = np.array([trip['shortest_jaccard'] for trip in full_set.values()]).mean()
shortest_buffer_mean = np.array([trip['shortest_buffer'] for trip in full_set.values()]).mean()
print('Mean Jaccard Index',round(shortest_jaccard_mean,2),'Mean Buffer',round(shortest_buffer_mean,2))

# Per-trip shortest-path overlap as a Series, reused by the histogram cell.
shortest_jaccard = {tripid:item['shortest_jaccard'] for tripid, item in full_set.items()}
shortest_jaccard = pd.Series(shortest_jaccard)
shortest_jaccard.name = 'shortest_jaccard'

In [None]:
jaccard_mean

In [None]:
jaccrd_total

In [None]:
# Toy check of the pooled ratio: sum of column 0 divided by sum of column 1.
x = np.array([[3, 4], [3, 4]])
np.divide(*x.sum(axis=0))

In [None]:
x.shape

In [None]:
# sum of all of the intersects intersect of chosen and modeled / sum of all of the unions

In [None]:
# List the post-calibration pickles for one model. The model filter is applied
# to the post-calibration list itself; previously it was applied to
# calibration_result_fps by mistake, so this list was never filtered.
post_calibration_dir = config['calibration_fp']/"post_calibration"

# pick a specific model
specific_model = 'calibration2_new'
post_calibration_result_fps = [
    fp for fp in post_calibration_dir.glob('*.pkl')
    if specific_model == fp.stem.split('(')[0].strip()
]
post_calibration_result_fps

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# For every post-calibration pickle: record its mean overlap metrics and save a
# histogram comparing shortest-path overlap with calibrated overlap.
mean_values = []
for idx, post_calibration_result_fp in enumerate(post_calibration_result_fps):
    run_name = post_calibration_result_fp.stem
    print('Calibration result',run_name)
    with post_calibration_result_fp.open('rb') as fh:
        post_calibration_result = pickle.load(fh)

    trip_results = list(post_calibration_result.values())
    jaccard_mean = np.mean([trip['modeled_jaccard'] for trip in trip_results])
    buffer_mean = np.mean([trip['modeled_buffer'] for trip in trip_results])
    mean_values.append((run_name,round(jaccard_mean,2),round(buffer_mean,2)))

    print('Mean Jaccard Index',round(jaccard_mean,2),'Mean Buffer',round(buffer_mean,2))
    # with (config['calibration_fp']/'post_calibration'/(calibration_result_fp.stem+'.pkl')).open('wb') as fh:
    #     pickle.dump(modeled_results_dict,fh)

    # Per-trip modeled overlap, placed next to the shortest-path overlap by trip id
    modeled_jaccard = pd.Series(
        {tripid: trip['modeled_jaccard'] for tripid, trip in post_calibration_result.items()},
        name='modeled_jaccard',
    )
    df = pd.concat([shortest_jaccard,modeled_jaccard],axis=1)

    # Two overlaid histograms on one figure
    plt.figure(figsize=(12, 12))
    histogram_specs = (
        ('shortest_jaccard', 0.5, 'Shortest Path Overlap', 'grey'),
        ('modeled_jaccard', 0.3, 'Calibrated Overlap', 'blue'),
    )
    for column, opacity, legend_label, colour in histogram_specs:
        plt.hist(df[column], bins=20, alpha=opacity, label=legend_label, color=colour)

    # Axis labels, title and legend
    plt.xlabel('Overlap', fontsize=22)
    plt.ylabel(f'Frequency (N={df.shape[0]})', fontsize=22)
    plt.title(run_name, fontsize=16)
    plt.legend(title='Jaccard Index', fontsize=22, title_fontsize=22)
    plt.ylim([0,700])

    # Tick label sizes
    plt.xticks(fontsize=22)
    plt.yticks(fontsize=22)

    # Write the figure to disk, named after the calibration run
    figure_fp = config['calibration_fp']/'calibration_performance'/(run_name + '.png')
    plt.savefig(figure_fp)


In [None]:
calibration_result['betas_tup']

In [None]:
# One row per calibration run, indexed by run name; shown best Jaccard first.
metric_columns = ['calibration_name','jaccard_mean','buffer_mean']
metrics = pd.DataFrame(mean_values,columns=metric_columns).set_index('calibration_name')
metrics.sort_values('jaccard_mean',ascending=False)

In [None]:
metrics.sort_values('buffer_mean',ascending=False)

In [None]:
beta_vals

# Old stuff past here

In [None]:
# get the betas too

In [None]:
# import pandas as pd
# #loss_data = pd.DataFrame({'loss_shortest_full':loss_shortest_full,'loss_full':loss_full})

# import matplotlib.pyplot as plt
# # Create the histogram
# #ax = loss_full_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# #get just jaccard values
# shortest_jaccard = {tripid:item['shortest_jaccard'] for tripid, item in full_set.items()}
# modeled_jaccard = {tripid:item['modeled_jaccard'] for tripid, item in post_calibration_result.items()}
# shortest_jaccard = pd.Series(shortest_jaccard)
# shortest_jaccard.name = 'shortest_jaccard'
# modeled_jaccard = pd.Series(modeled_jaccard)
# modeled_jaccard.name = 'modeled_jaccard'

# df = pd.concat([shortest_jaccard,modeled_jaccard],axis=1,ignore_index=False)

# # Create the histogram
# plt.figure(figsize=(12, 12))
# plt.hist(df['shortest_jaccard'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
# plt.hist(df['modeled_jaccard'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

# # Adding labels, title, and legend with font size adjustments
# plt.xlabel('Overlap', fontsize=22)
# plt.ylabel(f'Frequency (N={df.shape[0]})', fontsize=22)
# #plt.title('Histogram of Training Losses', fontsize=16)
# plt.legend(title='Jaccard Index', fontsize=22, title_fontsize=22)

# # Adjusting the font size of the tick labels
# plt.xticks(fontsize=22)
# plt.yticks(fontsize=22)

# # Show the plot
# plt.show()

In [None]:
# #get all the coefficient values in one dataframe that have
# cols = ['multi use path','bike lane','lanes','above_4','unsig_major_road_crossing']

# # each row is a calibration result, each column corresponds to a beta
# constr = []
# for idx, results in enumerate(calibration_result):
#     results = {x['col']:x['beta'] for x in results['betas_tup']}
#     x = set(list(results.keys()))
#     y = set(cols)
#     cond1 = len(set.union(x,y)) == len(cols)
#     cond2 = len(set.intersection(x,y)) == len(cols)
#     if cond1 & cond2:
#         results['idx'] = idx
#         constr.append(results)
# beta_vals = pd.DataFrame.from_records(constr)
# beta_vals.set_index('idx',inplace=True)
# beta_vals

In [None]:
#BUG the calibration process is not retrieving the correct coefficients anymore
# betas = [-0.184,-0.398,0.126,0.325,0.324]

Recalculate the shortest-path results for every modeling attempt so that we get standardized performance metrics.

In [None]:
# Old workflow: for every calibration result, re-solve the routes, attach them
# to full_set, score them, and keep the mean of each metric per result.
# NOTE(review): `calibration_results` is not defined anywhere in the visible
# notebook, so this cell raises NameError unless it is defined elsewhere.
to_add = {}

for idx,calibration_result in enumerate(calibration_results):

    base_impedance_col = "travel_time_min"
    betas = [x['beta'] for x in calibration_result['betas_tup']]
    print(betas)
    # presumably resets the network to the base impedance before the update -- TODO confirm
    stochastic_optimization.back_to_base_impedance(base_impedance_col,links,turns,turn_G)
    stochastic_optimization.impedance_update(betas,calibration_result['betas_tup'],
                            stochastic_optimization.link_impedance_function,
                            base_impedance_col,
                            stochastic_optimization.turn_impedance_function,
                            links,turns,turn_G)
    # one routing result per (start_node, end_node) pair
    modeled_results = {(start_node,end_node):stochastic_optimization.impedance_path(turns,turn_G,links,start_node,end_node) for start_node, end_node in full_ods}

    #TODO this could be turned into a function
    # store this run's modeled edges on each trip under a run-specific key
    for tripid, item in full_set.items():
        od = (item['origin_node'],item['destination_node'])
        modeled_edges = modeled_results[od]['edge_list']
        # convert the edge list to a dataframe
        modeled_edges = pd.DataFrame(modeled_edges,columns=['linkid','reverse_link'])
        full_set[tripid].update({f"modeled_edges_{idx}":modeled_edges})

    #TODO can also probably be a function
    # NOTE(review): the reload and the two lookup dicts below do not depend on
    # idx and are rebuilt on every iteration
    from importlib import reload
    reload(stochastic_optimization)
    test = {}

    # NOTE(review): these overwrite the geo_dict/length_dict returned by
    # import_calibration_network; length_dict here uses links.length
    geo_dict = dict(zip(links['linkid'],links.geometry))
    length_dict = dict(zip(links['linkid'],links.length))

    for key, item in full_set.items():
        
        chosen = item['matched_edges'].values
        shortest = item['shortest_edges'].values
        modeled = item[f"modeled_edges_{idx}"].values

        # NOTE: `tripid` inside the comprehensions below is an edge row, not a trip id
        test0 = {
            #lengths
            'chosen_length': round(np.array([length_dict.get(tripid[0],0) for tripid in chosen]).sum()/5280,2),
            'shortest_length': round(np.array([length_dict.get(tripid[0],0) for tripid in shortest]).sum()/5280,2),
            'modeled_length': round(np.array([length_dict.get(tripid[0],0) for tripid in modeled]).sum()/5280,2),
            # detour
            'chosen_detour': round(stochastic_optimization.detour_factor(chosen,shortest,length_dict),2),
            'modeled_detour': round(stochastic_optimization.detour_factor(modeled,shortest,length_dict),2),
            # jaccard
            'shortest_jaccard': round(stochastic_optimization.jaccard_index_func(chosen,shortest,length_dict),2),
            'modeled_jaccard': round(stochastic_optimization.jaccard_index_func(chosen,modeled,length_dict),2),
            # buffer
            'shortest_buffer': round(stochastic_optimization.buffer_overlap(chosen,shortest,geo_dict),2),
            'modeled_buffer': round(stochastic_optimization.buffer_overlap(chosen,modeled,geo_dict),2),
            # frechet
            'shortest_frechet': round(stochastic_optimization.frechet_distance(chosen,shortest,geo_dict),2),
            'modeled_frechet': round(stochastic_optimization.frechet_distance(chosen,modeled,geo_dict),2)
        }
        test[key] = test0

    # one row per trip, one column per metric
    results_df = pd.DataFrame.from_dict(test,orient='index')
    
    # keep only the column means for this calibration result
    to_add[idx] = results_df.mean().to_dict()

In [None]:
pd.DataFrame.from_dict(to_add,orient='index')

In [None]:
# base_impedance_col = "travel_time_min"
# betas = [x['beta'] for x in calibration_result['betas_tup']]
# print(betas)
# stochastic_optimization.back_to_base_impedance(base_impedance_col,links,turns,turn_G)
# stochastic_optimization.impedance_update(betas,calibration_result['betas_tup'],
#                           stochastic_optimization.link_impedance_function,
#                           base_impedance_col,
#                           stochastic_optimization.turn_impedance_function,
#                           links,turns,turn_G)
# modeled_results = {(start_node,end_node):stochastic_optimization.impedance_path(turns,turn_G,links,start_node,end_node) for start_node, end_node in full_ods}

# #TODO so can this
# #add a new modeled edges field so that we can calculate the modeled edges entry
# for tripid, item in full_set.items():
#     od = (item['origin_node'],item['destination_node'])
#     modeled_edges = modeled_results[od]['edge_list']
#     #turn to dataframe
#     modeled_edges = pd.DataFrame(modeled_edges,columns=['linkid','reverse_link'])
#     full_set[tripid].update({'modeled_edges':modeled_edges})

In [None]:
# Persist full_set (now carrying the modeled edges) for later cells.
full_modeled_results_fp = config['calibration_fp'] / "full_modeled_results.pkl"
with full_modeled_results_fp.open('wb') as pkl_file:
    pickle.dump(full_set, pkl_file)

# Calculate all the possible overlap metrics

In [None]:
# Reload the saved modeled results into full_set.
full_modeled_results_fp = config['calibration_fp'] / "full_modeled_results.pkl"
with full_modeled_results_fp.open('rb') as pkl_file:
    full_set = pickle.load(pkl_file)

In [None]:
#TODO can also probably be a function
# Compute every overlap metric for each trip's chosen, shortest and modeled
# routes. `test` maps trip key -> dict of metrics.
from importlib import reload
reload(stochastic_optimization)
test = {}

# Lookups keyed by link id, rebuilt from the links table.
geo_dict = dict(zip(links['linkid'],links.geometry))
length_dict = dict(zip(links['linkid'],links.length))

for key, item in full_set.items():

    chosen = item['matched_edges'].values
    shortest = item['shortest_edges'].values
    modeled = item['modeled_edges'].values

    # The first element of each edge row is used as the link id; unknown links count as 0.
    test0 = {
        #lengths
        'chosen_length': round(np.array([length_dict.get(edge[0],0) for edge in chosen]).sum()/5280,2),
        'shortest_length': round(np.array([length_dict.get(edge[0],0) for edge in shortest]).sum()/5280,2),
        'modeled_length': round(np.array([length_dict.get(edge[0],0) for edge in modeled]).sum()/5280,2),
        # detour: both are measured against the shortest route. modeled_detour
        # previously passed (chosen, modeled), unlike every other cell in this
        # notebook, which passes (modeled, shortest).
        'chosen_detour': round(stochastic_optimization.detour_factor(chosen,shortest,length_dict),2),
        'modeled_detour': round(stochastic_optimization.detour_factor(modeled,shortest,length_dict),2),
        # jaccard
        'shortest_jaccard': round(stochastic_optimization.jaccard_index_func(chosen,shortest,length_dict),2),
        'modeled_jaccard': round(stochastic_optimization.jaccard_index_func(chosen,modeled,length_dict),2),
        # buffer
        'shortest_buffer': round(stochastic_optimization.buffer_overlap(chosen,shortest,geo_dict),2),
        'modeled_buffer': round(stochastic_optimization.buffer_overlap(chosen,modeled,geo_dict),2),
        # frechet
        'shortest_frechet': round(stochastic_optimization.frechet_distance(chosen,shortest,geo_dict),2),
        'modeled_frechet': round(stochastic_optimization.frechet_distance(chosen,modeled,geo_dict),2)
    }
    test[key] = test0

In [None]:
results_df = pd.DataFrame.from_dict(test,orient='index')

In [None]:
results_df[results_df['chosen_detour']<1].hist('chosen_detour',bins=10)

First 5 mins/first 30%
If you deviate from the shortest path do you eventually come back or not?


In [None]:
results_df[results_df['chosen_detour']>1]

In [None]:
# we want to add additional rows to find where they performed best

In [None]:
results_df.mean()

Add route attributes

In [None]:
import impedance_calibration.src.summarize_route as summarize_route
reload(summarize_route)

# Summarise route attributes per trip for each of the three route types.
# Each dict maps trip id -> whatever route_attributes1 returns for that route.
chosen_attr = {
    tripid: summarize_route.route_attributes1(tripid, trip['matched_edges'].values, links, turns)
    for tripid, trip in full_set.items()
}
shortest_attr = {
    tripid: summarize_route.route_attributes1(tripid, trip['shortest_edges'].values, links, turns)
    for tripid, trip in full_set.items()
}
modeled_attr = {
    tripid: summarize_route.route_attributes1(tripid, trip['modeled_edges'].values, links, turns)
    for tripid, trip in full_set.items()
}

In [None]:
# Convert each attribute dict into a DataFrame with one row per trip id.
chosen_attr, shortest_attr, modeled_attr = (
    pd.DataFrame.from_dict(attr_by_trip, orient='index')
    for attr_by_trip in (chosen_attr, shortest_attr, modeled_attr)
)

In [None]:
results_df.reset_index().to_csv(config['calibration_fp']/'objective_functions.csv',index=False)

In [None]:
results_df = pd.read_csv(config['calibration_fp']/'objective_functions.csv')

In [None]:
results_df.mean().drop('index').to_dict()

# Segmenting/labeling trips
We want to know which trip and route attributes are associated with the loss function value (i.e., where the calibrated model performs well or poorly).

## Tree #1
We want to see whether the percent detour or any of the chosen-route characteristics (those accounted for in the calibration process) contributed to the overlap value (modeled_jaccard).

In [None]:
# Join the objective-function metrics with the chosen-route attributes.
# When results_df has been reloaded from objective_functions.csv, the trip ids
# sit in an 'index' column and the frame has a plain 0..N index, while
# chosen_attr is indexed by trip id. Restore the trip-id index first so that
# concat(axis=1) lines the rows up by trip rather than by position.
objective_df = results_df.set_index('index') if 'index' in results_df.columns else results_df
tree_df = pd.concat([objective_df[['chosen_length','chosen_detour','modeled_jaccard']],chosen_attr],axis=1)

# Collapse the lane categories into a single "more than one lane" feature.
tree_df['lanes_above_1'] = tree_df['lane_2'] + tree_df['lane_3']
tree_df.drop(columns=['lane_0','lane_1','lane_2','lane_3'],inplace=True)

First split is the chosen detour rate. 1,377 trips (over half) have a detour rate above 10.5% and this is associated with a bigger reduction in overlap. Past this it appears that trips that took a lot 

Second split is on chosen length and chosen detour rate. 

In [None]:
from sklearn.tree import plot_tree
from sklearn import tree

# Fit a shallow regression tree that predicts modeled_jaccard from all other
# columns of tree_df, then draw it.
feature_df = tree_df.drop(columns=['modeled_jaccard'])
X = feature_df.values
y = tree_df['modeled_jaccard'].values
clf = tree.DecisionTreeRegressor(max_depth=3,min_samples_split=50).fit(X, y)
fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
plot_tree(clf, feature_names=feature_df.columns, filled=True, ax=ax)
plt.show()

In [None]:
# Ordinary least squares: modeled_jaccard regressed on the remaining columns.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

y = tree_df['modeled_jaccard']
# add_constant supplies the intercept column for the OLS design matrix
X = sm.add_constant(tree_df.drop(columns=['modeled_jaccard']))
model_sm = sm.OLS(y, X).fit()
print(model_sm.summary())

In [None]:
tree_df.sort_values('unsig_major_road_crossing',ascending=False).head(40)

In [None]:
# Fit and draw a regression tree for impedance, then compute its in-sample R^2.
# NOTE(review): `one_hot_data` and `merged` are not defined anywhere in the
# visible notebook; `tree` and `plot_tree` come from the earlier sklearn import cell.
X, y = one_hot_data.values, merged['impedance'].values
clf = tree.DecisionTreeRegressor(max_depth=5)
clf = clf.fit(X, y)
fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
plt.show()
# how to compare predicted values vs actual?
# (this note was a bare sentence in the code cell, which is a SyntaxError)
SS_res = ((y - clf.predict(X))**2).sum()  # residual sum of squares
SS_tot = ((y - y.mean())**2).sum()        # total sum of squares
R2 = 1 - (SS_res/SS_tot)
R2

In [None]:
# with (config['calibration_fp']/"full_modeled_results.pkl").open('rb') as fh:
#     modeled_results = pickle.load(fh)
## Add Route Attributes
import impedance_calibration.src.summarize_route as summarize_route
# Column name -> summary rule passed to summarize_route.route_attributes.
# How each rule ("category", "threshold", "boolean") is applied is defined in
# summarize_route, not here.
cols_to_summarize = {
    'facility_fwd': "category",
    'AADT': ("threshold",[10000]),
    'truck_pct': ("threshold",[5]),
    'speed': "category",
    'lanes': "category",
    'mixed_traffic_no_facil': "boolean",
    'mixed_traffic_w_facil': "boolean"
}
# NOTE(review): both set_index calls mutate links/turns in place, so
# re-running this cell or later selecting links['reverse_link'] raises KeyError.
links.set_index(['linkid','reverse_link'],inplace=True)
turns.set_index(['source_linkid','source_reverse_link','target_linkid','target_reverse_link'],inplace=True)
links.columns
# #unit conversions
#links['length_mi'] = (links['length_ft'] / 5280).round(2)
#links['ascent_ft'] = (links['ascent_m'] * 3.28084).round(0)
#links.drop(columns=['length_ft','ascent_m'],inplace=True)
test_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in full_set.items()]
test_summary = summarize_route.procees_summary_results(test_summary,config['projected_crs_epsg'])
test_summary.drop(columns=['tripid','geometry']).describe()
# NOTE(review): `train_set` is not defined in the visible notebook.
# The train summary is computed once; the original cell repeated these three
# lines verbatim, doing the same work twice.
train_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in train_set.items()]
train_summary = summarize_route.procees_summary_results(train_summary,config['projected_crs_epsg'])
train_summary.drop(columns=['tripid','geometry']).describe()
summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')

In [None]:
results_df

## Calculate the percent change in impedance at the link level for visualization

In [None]:
links.columns

In [None]:
# Relative change of the calibrated link cost versus the base travel time,
# rounded to 3 decimals, keeping only rows where reverse_link is False.
impedance_change = links.copy()
base_travel_time = impedance_change['travel_time_min']
relative_change = (impedance_change['link_cost'] - base_travel_time) / base_travel_time
impedance_change['imp_prop'] = relative_change.round(3)
impedance_change = impedance_change[impedance_change['reverse_link'].eq(False)]

In [None]:
#todo automate the plot generation for when qgis isn't available
impedance_change.plot('imp_prop')

In [None]:
impedance_change.to_file(config['calibration_fp']/"network_impedance_change.gpkg")

In [None]:
impedance_change['imp_prop'].describe()

In [None]:
sorted(impedance_change['imp_prop'].unique())

In [None]:
(impedance_change['imp_prop']==0).any()

In [None]:
impedance_change.imp_prop.round(3).value_counts()

## Train results

In [None]:
# #link_impedance_col = "adj_travel_time_min"
# base_impedance_col = "travel_time_min"
# stochastic_optimization.back_to_base_impedance(base_impedance_col,links,turns,turn_G)

# #update impedances
# #betas = #past_betas[np.array(past_vals).argmin()]#x.x
# print(betas)
# stochastic_optimization.impedance_update(betas,betas_links,betas_turns,
#                           stochastic_optimization.link_impedance_function,
#                           base_impedance_col,
#                           stochastic_optimization.turn_impedance_function,
#                           links,turns,turn_G)

# #find shortest path
# train_results_dict = {(start_node,end_node):stochastic_optimization.impedance_path(turns,turn_G,links,start_node,end_node) for start_node, end_node in train_ods}

# #calulate objective function
# loss_train = loss_function(train_set,train_results_dict,**loss_function_kwargs)
# loss_train[:,1].mean()

## Test Results

In [None]:
# #link_impedance_col = "adj_travel_time_min"
# base_impedance_col = "travel_time_min"
# stochastic_optimization.back_to_base_impedance(base_impedance_col,links,turns,turn_G)

# #update impedances
# #betas = #past_betas[np.array(past_vals).argmin()]#x.x
# print(betas)
# stochastic_optimization.impedance_update(betas,betas_links,betas_turns,
#                           stochastic_optimization.link_impedance_function,
#                           base_impedance_col,
#                           stochastic_optimization.turn_impedance_function,
#                           links,turns,turn_G)

# #find shortest path
# test_results_dict = {(start_node,end_node):stochastic_optimization.impedance_path(turns,turn_G,start_node,end_node) for start_node, end_node in test_ods}

# #calulate objective function
# loss_test = loss_function(test_set,test_results_dict,**loss_function_kwargs)
# loss_test[:,1].mean()


In [None]:
# test0 = pd.DataFrame(loss_shortest_test,columns=['tripid','shortest'])
# test1 = pd.DataFrame(loss_test,columns=['tripid','impedance'])
# testing = pd.concat([test0,test1])

# train0 = pd.DataFrame(loss_shortest_train,columns=['tripid','shortest'])
# train1 = pd.DataFrame(loss_train,columns=['tripid','impedance'])
# training = pd.concat([train0,train1])

In [None]:
# #make dataframe and export results
# testing = pd.DataFrame({'tripid':list(test_set.keys()),'shortest':loss_shortest_test[:,1],'impedance':loss_test})
# testing.to_csv(config['calibration_fp']/'testing_results.csv',index=False)
# training = pd.DataFrame({'tripid':list(train_set.keys()),'shortest':loss_shortest_train[:,1],'impedance':loss_train})
# training.to_csv(config['calibration_fp']/'training_results.csv',index=False)

# Assemble dataframe of overlap metrics
Build a dataframe that collects the overlap metrics for each trip (Jaccard index, Frechet distance, detour percent, etc.).

In [None]:
full_set[71].keys()

In [None]:
full_set[71]

In [None]:
# Reload the module so edits to stochastic_optimization are picked up.
from importlib import reload
reload(stochastic_optimization)
# calculate the objective function using frechet_distance as the metric
# NOTE(review): `results_dict` is not defined anywhere in the visible notebook.
test = stochastic_optimization.general_objective_function(
    stochastic_optimization.frechet_distance,
    full_set,
    results_dict,
    links)
test#print(loss_full[:,1].mean().round(5))

In [None]:
full = pd.DataFrame({'tripid':list(full_set.keys()),'shortest':loss_shortest_full[:,1],'impedance':loss_full[:,1]})


In [None]:
full.to_csv(config['calibration_fp']/'training_results.csv',index=False)

# Plot distribution of overlap values

In [None]:
import pandas as pd
#loss_data = pd.DataFrame({'loss_shortest_full':loss_shortest_full,'loss_full':loss_full})

import matplotlib.pyplot as plt
#ax = loss_full_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# Overlay the shortest-path and calibrated overlap distributions on one figure
plt.figure(figsize=(12, 12))
histogram_specs = (
    ('shortest', 0.5, 'Shortest Path Overlap', 'grey'),
    ('impedance', 0.3, 'Calibrated Overlap', 'blue'),
)
for column, opacity, legend_label, colour in histogram_specs:
    plt.hist(full[column], bins=20, alpha=opacity, label=legend_label, color=colour)

# Axis labels (title and legend are currently disabled)
plt.xlabel('Overlap', fontsize=22)
plt.ylabel(f'Frequency (N={full.shape[0]})', fontsize=22)
#plt.title('Histogram of Training Losses', fontsize=16)
#plt.legend(title='Tra Overlap', fontsize=22, title_fontsize=22)

# Tick label sizes
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)

# Render the figure
plt.show()

In [None]:
# import pandas as pd
# loss_data = pd.DataFrame({'loss_shortest_train':loss_shortest_train,'loss_train':loss_train})

# import matplotlib.pyplot as plt
# # Create the histogram
# #ax = loss_train_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# # Create the histogram
# plt.figure(figsize=(12, 12))
# plt.hist(loss_data['loss_shortest_train'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
# plt.hist(loss_data['loss_train'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

# # Adding labels, title, and legend with font size adjustments
# plt.xlabel('Overlap', fontsize=22)
# plt.ylabel('Frequency', fontsize=22)
# #plt.title('Histogram of Training Losses', fontsize=16)
# plt.legend(title='Training Overlap', fontsize=22, title_fontsize=22)

# # Adjusting the font size of the tick labels
# plt.xticks(fontsize=22)
# plt.yticks(fontsize=22)

# # Show the plot
# plt.show()

In [None]:
# loss_data = pd.DataFrame({'loss_shortest_test':loss_shortest_test,'loss_test':loss_test})

# import matplotlib.pyplot as plt
# # Create the histogram
# #ax = loss_train_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# # Create the histogram
# plt.figure(figsize=(12, 12))
# plt.hist(loss_data['loss_shortest_test'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
# plt.hist(loss_data['loss_test'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

# # Adding labels, title, and legend with font size adjustments
# plt.xlabel('Overlap', fontsize=22)
# plt.ylabel('Frequency', fontsize=22)
# #plt.title('Histogram of Training Losses', fontsize=16)
# plt.legend(title='Testing Overlap', fontsize=22, title_fontsize=22)

# # Adjusting the font size of the tick labels
# plt.xticks(fontsize=22)
# plt.yticks(fontsize=22)

# # Show the plot
# plt.show()

# Add modeled geometry to results dict for visualization


In [None]:
#add a new modeled edges field so that we can calculate the modeled edges entry
for tripid, item in full_set.items():
    od = (item['origin_node'],item['destination_node'])
    modeled_edges = full_results_dict.get(od,0).get('edge_list',0)
    if isinstance(modeled_edges,int):
        print(modeled_edges)
    #turn to dataframe
    modeled_edges = pd.DataFrame(modeled_edges,columns=['linkid','reverse_link'])
    full_set[tripid].update({'modeled_edges':modeled_edges})

In [None]:
# #add a new modeled edges field so that we can calculate the modeled edges entry
# for tripid, item in test_set.items():
#     od = (item['origin_node'],item['destination_node'])
#     modeled_edges = test_results_dict.get(od,0).get('edge_list',0)
#     if isinstance(modeled_edges,int):
#         print(modeled_edges)
#     #turn to dataframe
#     modeled_edges = pd.DataFrame(modeled_edges,columns=['linkid','reverse_link'])
#     test_set[tripid].update({'modeled_edges':modeled_edges})

In [None]:
# #add a new modeled edges field so that we can calculate the modeled edges entry
# for tripid, item in test_set.items():
#     od = (item['origin_node'],item['destination_node'])
#     modeled_edges = test_results_dict.get(od,0).get('edge_list',0)
#     if isinstance(modeled_edges,int):
#         print(modeled_edges)
#     #turn to dataframe
#     modeled_edges = pd.DataFrame(modeled_edges,columns=['linkid','reverse_link'])
#     test_set[tripid].update({'modeled_edges':modeled_edges})

In [None]:
# Collect the trip dictionaries to export into one dictionary.
# The train/test merges are currently disabled; only full_set is included.
modeled_results = {
    # **train_set,
    # **test_set,
    **full_set,
}

In [None]:
# Pickle modeled_results into the calibration folder
modeled_results_fp = config['calibration_fp'] / "full_modeled_results.pkl"
with modeled_results_fp.open('wb') as fh:
    pickle.dump(modeled_results, fh)

# QAQC (move into a separate notebook)

In [None]:
# with (config['calibration_fp']/"full_modeled_results.pkl").open('rb') as fh:
#     modeled_results = pickle.load(fh)
# ## Add Route Attributes
# import summarize_route
# cols_to_summarize = {
#     'facility_fwd': "category",
#     'AADT': ("threshold",[10000]),
#     'truck_pct': ("threshold",[5]),
#     'here_speed': "category",
#     'lanes': "category",
#     'mixed_traffic_no_facil': "boolean",
#     'mixed_traffic_w_facil': "boolean"
# }
# links.set_index(['linkid','reverse_link'],inplace=True)
# turns.set_index(['source_linkid','source_reverse_link','target_linkid','target_reverse_link'],inplace=True)
# links.columns
# # #unit conversions
# #links['length_mi'] = (links['length_ft'] / 5280).round(2)
# #links['ascent_ft'] = (links['ascent_m'] * 3.28084).round(0)
# #links.drop(columns=['length_ft','ascent_m'],inplace=True)
# test_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in full_set.items()]
# test_summary = summarize_route.procees_summary_results(test_summary,config['projected_crs_epsg'])
# test_summary.drop(columns=['tripid','geometry']).describe()
# train_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in train_set.items()]
# train_summary = summarize_route.procees_summary_results(train_summary,config['projected_crs_epsg'])
# train_summary.drop(columns=['tripid','geometry']).describe()
# train_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in train_set.items()]
# train_summary = summarize_route.procees_summary_results(train_summary,config['projected_crs_epsg'])
# train_summary.drop(columns=['tripid','geometry']).describe()
# summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')
# summary.columns
# summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')




# # Regression Trees
# from sklearn import tree

# testing = pd.read_csv(config['calibration_fp']/'testing_results.csv')
# training = pd.read_csv(config['calibration_fp']/'training_results.csv')

# #assume that keys are in the right order?
# loss_df = pd.concat([testing,training],ignore_index=True)
# #import trip and user characteristics
# trips_df = pd.read_pickle(cycleatl_fp/"trips_3.pkl")
# users_df = pd.read_pickle(cycleatl_fp/"users_1.pkl")
# trips_df.reset_index(drop=True,inplace=True)
# #import route attributes
# matched_summary = gpd.read_file(config['calibration_fp']/"route_attributes.gpkg",layer="matched")
# shortest_summary = gpd.read_file(config['calibration_fp']/"route_attributes.gpkg",layer="shortest")
# #consolidate trip types
# trips_df.loc[trips_df['trip_type']=='other','trip_type'] = 'Other'
# trips_df.loc[trips_df['trip_type']=='Work-related','trip_type'] = 'Work-Related'
# trips_df['trip_type'].value_counts()
# #replace userid with just the first one
# def take_first(x):
#     if isinstance(x,list):
#         return x[0]
#     return x
# users_df['userid'] = users_df['userid'].apply(take_first)
# #consolidate trip types
# trips_df.loc[trips_df['trip_type']=='other','trip_type'] = 'Other'
# trips_df.loc[trips_df['trip_type']=='Work-related','trip_type'] = 'Work-Related'
# trips_df['trip_type'].value_counts()




# ## Tree #1
# - First tree is on the non-null variables
# - The dist. between shortest and impedance were similar and so are the trees
# - Shorter trips better explained by impedance/shortest path which makes sense
#     - Use this to split longer trips? and retrain?
# - Shopping is the only significant trip type variable
# - Speed above 9 mph is usually better explained by impedance
# ## Tree #1
# - First tree is on the non-null variables
# - The dist. between shortest and impedance were similar and so are the trees
# - Shorter trips better explained by impedance/shortest path which makes sense
#     - Use this to split longer trips? and retrain?
# - Shopping is the only significant trip type variable
# - Speed above 9 mph is usually better explained by impedance
# nonulls = ['trip_type','length_mi','avg_speed_mph','(0,4]_prop', '(4,8]_prop',
#        '(8,inf]_prop', 'AADT_10000_prop', 'facility_fwd_bike lane_prop',
#        'facility_fwd_cycletrack_prop', 'facility_fwd_multi use path_prop',
#        'facility_fwd_sharrow_prop', 'here_speed_1_prop', 'here_speed_2_prop',
#        'here_speed_3_prop', 'here_speed_4_prop', 'lanes_1_prop',
#        'lanes_2_prop', 'lanes_3_prop', 'left', 'right', 'signalized',
#        'straight', 'truck_pct_5_prop', 'uturn']
# nonulls_tree_df = merged[nonulls]
# from sklearn.tree import plot_tree

# #convert nominal categorical to numeric
# one_hot_data = pd.get_dummies(nonulls_tree_df,drop_first=True)
# one_hot_data.columns
# X, y = one_hot_data.values, merged['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=3,min_samples_split=50)
# clf = clf.fit(X, y)
# fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
# plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
# plt.show()
# X, y = one_hot_data.values, merged['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=5)
# clf = clf.fit(X, y)
# fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
# plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
# plt.show()
# how to compare predicted values vs actual?
# SS_res = ((y - clf.predict(X))**2).sum()
# SS_tot = ((y - y.mean())**2).sum()
# R2 = 1 - (SS_res/SS_tot)
# R2
# ## Tree #2
# This one takes the previous variables and adds the user characteristics. Sample size is smaller due to null values

# - Looks like trip distance is still the dominant one here, really think i should start there
# - When removing trip distance and avg speed, trip type and age 55+ are the best

# **NOTE:** just noticed that age should not be dummies, need to fix that and try again
# import json
# user_data_definitions = json.load((Path.home()/'Documents/GitHub/cycleatlanta/user_data_definition.json').open('rb'))

# #add the 55+ column
# user_data_definitions['age']['6'] = '55+'
# #income has too many nulls
# tree_cols = ['age','gender','rider_history','rider_type'] + nonulls #,'total_distance_ft','avg_speed_mph','count']#,'count']#[,'cycling_freq'
# tree_df = merged[tree_cols]

# #use to detect null values
# isnull = ((tree_df == -1) | (tree_df == 'NULL'))

# #TODO do cross-sectionals to see which combination results in the most retained entries
# #remove rows with null values
# tree_df = tree_df[(isnull==False).all(axis=1)]

# loss_vals = merged.loc[tree_df.index]
# get_factor = ['age','rider_history','rider_type']
# # just fyi
# # select_max_cols = ['age','income','cycling_freq']
# # #select the min for these (i.e. strong and fearless over interested but...)
# # select_min_cols = ['rider_type','rider_history']

# for col in get_factor:
#     ivd = {v:k for k, v in user_data_definitions[col].items()}
#     tree_df[col] = tree_df[col].map(ivd)
# tree_df
# #this is where i left off
# #convert nominal categorical to numeric
# dummy_cols = ['gender','trip_type']
# one_hot_data = pd.get_dummies(tree_df[dummy_cols],drop_first=True)
# comb = pd.concat([tree_df.drop(columns=dummy_cols),one_hot_data],ignore_index=False,axis=1)
# comb
# X, y = comb.values, loss_vals['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=3,min_samples_split=50)
# clf = clf.fit(X, y)
# #tree.plot_tree(clf,feature_names=one_hot_data.columns)
# # Visualize the tree with higher resolution
# plt.figure(figsize=(20, 10), dpi=300)
# from sklearn.tree import plot_tree
# plot_tree(clf, feature_names=comb.columns, filled=True)
# plt.show()
# # how to compare predicted values vs actual?
# SS_res = ((y - clf.predict(X))**2).sum()
# SS_tot = ((y - y.mean())**2).sum()
# R2 = 1 - (SS_res/SS_tot)
# R2
# What we actually want to know is which variables help increase the overlap. Wouldn't that be more of an application for linear regression?

# clf
# The squared error is already pretty high and, as our histogram tells us, most of these trips are not well predicted with the current impedances. The idea behind using regression trees here is to figure out whether any attributes help explain the bad overlap.

# Or we could just try knocking off 
# #correlation matrix
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Calculate the correlation matrix
# correlation_matrix = one_hot_data.corr()

# # Display the correlation matrix
# print("Correlation Matrix:")
# print(correlation_matrix)

# # Plot the correlation matrix using seaborn heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

# one_hot_data.corr()
# # Distribution of Loss Function
# # Export to get route attributes
# # Using BRouter Results
# To compare across we'll use Frechet distance. Will need to use next time.
# with (config['calibration_fp']/'brouter_links.txt').open('r') as file:
#     my_list = file.readlines()
#     # Remove any extra whitespace or newline characters
#     my_list = [line.strip() for line in my_list]
# len(my_list)
# geojsons = list((config['calibration_fp']/'GeoJSON_Out').glob('*.geojson'))
# both_ods = list(set.union(set(train_ods),set(test_ods)))
# len(both_ods)
# test_ods
# len(geojsons)
# #use the results dict combined with the geo dict one
# results_dict[(68166811, 8789832117)]
# #
# geojson_geos = []
# for geojson in geojsons:
#     geojson_geo = gpd.read_file(geojson).to_crs(config['projected_crs_epsg'])
#     geojson_geo = np.array(geojson_geo.geometry.item().coords)
#     # geojson_geo = [(x, y) for x, y, z in geojson_geo.coords]
#     # geojson_geo = LineString(geojson_geo)
#     geojson_geos.append(geojson_geo)
# frechet_distance = similaritymeasures.frechet_dist(chosen_coords,modeled_coords)
# #import and compare frechet distance across them with geodict?
# list(geojson_geo.coords)