# Post Calibration
This notebook examines the quality of the calibrated impedance functions.
---
1. Recalculate shortest path results using the calibrated coefficients
1. Calculate objective functions and other performance metrics
1. Get route attribute summary for the chosen/shortest/modeled routes (right now)


Still Working On
1. Look at where calibrated function did the best/worst job for both the training/testing set
1. Cluster/segment results based on loss function value?
4. Export for application in BikewaySim

In [1]:
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import networkx as nx
from stochopy.optimize import minimize
from tqdm import tqdm
import similaritymeasures
import random
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
from shapely.ops import LineString, MultiLineString
from importlib import reload

from bikewaysim.paths import config
from bikewaysim.network import modeling_turns
from bikewaysim.impedance_calibration import stochastic_optimization, post_calibration, loss_functions
from bikewaysim.routing import rustworkx_routing_funcs

In [2]:
# Load every post-calibration table (the disaggregate loaders are slow)
beta_vals = post_calibration.betas_dataframe()
loss_vals = post_calibration.aggregated_loss_dataframe()
meta = post_calibration.metadata_dataframe()
dissaggregate = post_calibration.post_calibration_disaggregate()
shortest_dissaggregate = post_calibration.shortest_disaggregate()


def _without_user_calibrations(df):
    """Keep only rows whose 'subset' label cannot be parsed as a number."""
    subset_as_number = pd.to_numeric(df['subset'], errors='coerce')
    return df[subset_as_number.isna()]


# drop the user calibrations
meta = _without_user_calibrations(meta)
beta_vals = _without_user_calibrations(beta_vals)
loss_vals = _without_user_calibrations(loss_vals)

# concatenate columns of interest
join_cols = ['subset', 'calibration_name', 'run_num']
loss_cols = [
    'jaccard_buffer_mean', 'jaccard_exact_mean',
    'jaccard_buffer_total', 'jaccard_exact_total',
    'shortest_jaccard_exact_mean', 'shortest_jaccard_exact_total',
    'shortest_jaccard_buffer_mean', 'shortest_jaccard_buffer_total',
]
meta_cols = [
    'objective_function', 'time', 'set_to_zero', 'set_to_inf',
    'method', 'popsize', 'num_trips', 'status',
]

# align the three tables on the run identifier and place them side by side
indexed_frames = [
    beta_vals.set_index(join_cols),
    loss_vals.set_index(join_cols)[loss_cols],
    meta.set_index(join_cols)[meta_cols],
]
concat = pd.concat(indexed_frames, axis=1, ignore_index=False).reset_index()

100%|██████████| 2940/2940 [01:03<00:00, 46.03it/s]  


In [7]:
# print the shortest path metrics for the 'random' subset (one row per distinct combination)
shortest_metric_cols = [
    'shortest_jaccard_exact_mean', 'shortest_jaccard_exact_total',
    'shortest_jaccard_buffer_mean', 'shortest_jaccard_buffer_total',
]
is_random_subset = loss_vals['subset'] == 'random'
print(loss_vals.loc[is_random_subset, shortest_metric_cols].drop_duplicates())

      shortest_jaccard_exact_mean  shortest_jaccard_exact_total  \
3081                         0.33                          0.23   

      shortest_jaccard_buffer_mean  shortest_jaccard_buffer_total  
3081                          0.36                           0.24  


Segment

In [8]:
# rider type
rider_type_models = concat[concat['subset'].isin(['fearless','notfearless'])]
# rider_type_models.to_csv(config['scratch_fp']/'rider_type.csv',index=False)

# random/all models: drop columns that are entirely null for these subsets,
# rank by mean buffered Jaccard, keep runs with more than 100 trips, take the top 50.
# The trip-count mask is built from the frame being filtered (callable .loc)
# instead of from the unfiltered `concat`, which forced pandas to reindex the
# mask and emitted "Boolean Series key will be reindexed" on every run.
best_models = (
    concat[concat['subset'].isin(['random','all'])]
    .dropna(axis=1,how='all')
    .sort_values('jaccard_buffer_mean',ascending=False)
    .loc[lambda ranked: ranked['num_trips'] > 100]
    .head(50)
)
# best_models.to_csv(config['scratch_fp']/'best_models.csv',index=False)

# just the random ones, most recent calibration first
best_models[best_models['subset']=='random'].sort_values('time',ascending=False).to_csv(config['scratch_fp']/'random.csv',index=False)

In [18]:
# get results of testing the different objective function values
run_nums = ['7','6','1','0']
calibrated_loss_cols = [x for x in loss_cols if 'shortest_' not in x]
is_objective_test_run = (
    (best_models['subset'] == 'random')
    & (best_models['calibration_name'] == 'no traffic')
    & best_models['run_num'].isin(run_nums)
)
best_models.loc[is_objective_test_run, ['objective_function'] + calibrated_loss_cols]

Unnamed: 0,objective_function,jaccard_buffer_mean,jaccard_exact_mean,jaccard_buffer_total,jaccard_exact_total
635,jaccard_buffer_total,0.48,0.44,0.35,0.32
634,jaccard_buffer_mean,0.48,0.43,0.35,0.31
629,jaccard_exact_total,0.48,0.43,0.36,0.32
628,jaccard_exact_mean,0.48,0.44,0.35,0.32


In [17]:
# Display the ranked random/all calibration runs (at most 50 rows, see head(50) where best_models is built)
best_models

Unnamed: 0,subset,calibration_name,run_num,2lpd,3+lpd,"(30,40] mph","(40,inf) mph","[4k,10k) aadt","[10k,inf) aadt","[4,6) grade",...,shortest_jaccard_buffer_mean,shortest_jaccard_buffer_total,objective_function,time,set_to_zero,set_to_inf,method,popsize,num_trips,status
635,random,no traffic,7,0.115,0.412,,,,,0.815,...,0.36,0.24,jaccard_buffer_total,2024-10-22 09:00:35.062824,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
634,random,no traffic,6,-0.038,0.315,,,,,0.748,...,0.36,0.24,jaccard_buffer_mean,2024-10-21 20:07:42.432624,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
629,random,no traffic,1,0.111,0.408,,,,,0.876,...,0.36,0.24,jaccard_exact_total,2024-10-21 17:12:46.747220,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
628,random,no traffic,0,-0.033,0.303,,,,,0.474,...,0.36,0.24,jaccard_exact_mean,2024-10-21 16:11:32.035248,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
643,random,turns,3,0.13,0.616,,,-0.209,-0.215,0.732,...,0.36,0.24,jaccard_buffer_mean,2024-10-20 18:15:40.840634,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
633,random,no traffic,5,0.037,0.642,,,,,0.959,...,0.36,0.24,jaccard_buffer_mean,2024-10-21 18:56:54.813563,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
631,random,no traffic,3,0.139,0.37,,,,,0.375,...,0.36,0.24,jaccard_exact_mean,2024-10-21 17:42:56.410121,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
627,random,iterations,3,0.004,0.591,,,,,0.448,...,0.36,0.24,jaccard_exact_mean,2024-10-21 22:53:55.414336,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,-1.0
612,random,baseline,1,0.076,0.286,,,-0.176,-0.042,0.598,...,0.36,0.24,jaccard_buffer_mean,2024-10-18 17:12:27.273317,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,1.0
613,random,baseline,2,0.156,0.707,,,-0.162,-0.258,0.587,...,0.36,0.24,jaccard_buffer_mean,2024-10-18 19:07:46.718860,"[bike lane, cycletrack, multi use path]",[not_street],pso,25.0,664.0,1.0


## Look at the objective function values over time

In [9]:
# Load one saved calibration run; pickle files must come from a trusted source
iterations_run_fp = config['calibration_fp'] / 'results/random,iterations,3.pkl'
with iterations_run_fp.open('rb') as fh:
    best = pickle.load(fh)

# smallest objective value in each row of funall
objective_values = np.array(best['results'].funall)
objective_values.min(axis=1)

array([-0.3981, -0.3981, -0.3981, -0.4001, -0.4002, -0.4063, -0.4145,
       -0.417 , -0.4052, -0.4115, -0.4101, -0.4204, -0.4179, -0.4195,
       -0.4212])

In [10]:
# Inspect the first row of funall.
# NOTE(review): presumably the objective value of every population member in the first iteration — confirm against the stochopy docs
best['results'].funall[0]

array([-0.3774, -0.3271, -0.3306, -0.3237, -0.368 , -0.3352, -0.3316,
       -0.3599, -0.3981, -0.3539, -0.342 , -0.3471, -0.3472, -0.3202,
       -0.375 , -0.3675, -0.3101, -0.3489, -0.3476, -0.315 , -0.3852,
       -0.3387, -0.3226, -0.3749, -0.3538])

## Dissaggregate

In [None]:
# subset to the best model so far
# one boolean mask per level of the disaggregate column MultiIndex
column_index = dissaggregate.columns
level_masks = [
    column_index.get_level_values(0) == 'all',
    column_index.get_level_values(1) == 'jaccard_buffer_mean',
    column_index.get_level_values(2) == '2',
    column_index.get_level_values(3).isin(['jaccard_buffer','detour','length']),
]
cond = np.logical_and.reduce(level_masks)

# keep only the innermost level as the column names
best_model = dissaggregate.loc[:, cond].droplevel([0, 1, 2], axis=1)

shortest_keep_cols = [
    'trip_start_time', 'match_ratio', 'chosen_length',
    'shortest_length', 'chosen_detour', 'shortest_jaccard_buffer',
]
shortest_dissaggregate = shortest_dissaggregate[shortest_keep_cols]

In [None]:
# put the shortest-path columns to the left of the calibrated-model columns, aligned on the row index
side_by_side = [shortest_dissaggregate, best_model]
best_model = pd.concat(side_by_side, axis=1, ignore_index=False)
best_model

## Create the loss function plots

In [None]:
#TODO make this into a function

import pandas as pd
import matplotlib.pyplot as plt

# Folder the histograms are written to; create it up front so savefig does not
# fail when the folder does not exist yet.
performance_dir = config['calibration_fp']/'calibration_performance'
performance_dir.mkdir(parents=True, exist_ok=True)

#aggregated results and overlap figures
# mean_values collects (result name, mean jaccard, mean buffer) per result file
mean_values = []
for post_calibration_result_fp in post_calibration_result_fps:
    print('Calibration result',post_calibration_result_fp.stem)
    # NOTE: pickle.load can execute arbitrary code; only read result files you produced
    with post_calibration_result_fp.open('rb') as fh:
        post_calibration_result = pickle.load(fh)

    jaccard_mean = np.array([item['modeled_jaccard'] for item in post_calibration_result.values()]).mean()
    buffer_mean = np.array([item['modeled_buffer'] for item in post_calibration_result.values()]).mean()
    mean_values.append((post_calibration_result_fp.stem,round(jaccard_mean,2),round(buffer_mean,2)))

    print('Mean Jaccard Index',round(jaccard_mean,2),'Mean Buffer',round(buffer_mean,2))
    # with (config['calibration_fp']/'post_calibration'/(calibration_result_fp.stem+'.pkl')).open('wb') as fh:
    #     pickle.dump(modeled_results_dict,fh)

    #get just jaccard values, indexed by tripid, next to the shortest path values
    modeled_jaccard = {tripid:item['modeled_jaccard'] for tripid, item in post_calibration_result.items()}
    modeled_jaccard = pd.Series(modeled_jaccard)
    modeled_jaccard.name = 'modeled_jaccard'
    df = pd.concat([shortest_jaccard,modeled_jaccard],axis=1,ignore_index=False)

    # make figures
    # Create the histogram
    fig = plt.figure(figsize=(12, 12))
    plt.hist(df['shortest_jaccard'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
    plt.hist(df['modeled_jaccard'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

    # Adding labels, title, and legend with font size adjustments
    plt.xlabel('Overlap', fontsize=22)
    plt.ylabel(f'Frequency (N={df.shape[0]})', fontsize=22)
    plt.title(post_calibration_result_fp.stem, fontsize=16)
    plt.legend(title='Jaccard Index', fontsize=22, title_fontsize=22)
    # fixed y-range so the figures are comparable across calibration results
    plt.ylim([0,700])

    # Adjusting the font size of the tick labels
    plt.xticks(fontsize=22)
    plt.yticks(fontsize=22)

    # Save the plot, then release the figure: pyplot keeps every figure alive
    # until it is closed, so one unclosed 12x12 figure per result file piles up.
    plt.savefig(performance_dir/(post_calibration_result_fp.stem + '.png'))
    plt.close(fig)


# SORT below here

# Calculate all the possible overlap metrics

In [None]:
import impedance_calibration.src.summarize_route as summarize_route
reload(summarize_route)


def _route_attributes_by_trip(edge_key):
    """Map every tripid in full_set to route_attributes1 of the edges stored under edge_key."""
    return {
        tripid: summarize_route.route_attributes1(tripid, trip[edge_key].values, links, turns)
        for tripid, trip in full_set.items()
    }


# same summary for the chosen (map matched), shortest, and modeled routes
chosen_attr = _route_attributes_by_trip('matched_edges')
shortest_attr = _route_attributes_by_trip('shortest_edges')
modeled_attr = _route_attributes_by_trip('modeled_edges')

In [None]:
# turn each {tripid: attributes} dict into a DataFrame with one row per tripid
chosen_attr, shortest_attr, modeled_attr = (
    pd.DataFrame.from_dict(attr_by_trip, orient='index')
    for attr_by_trip in (chosen_attr, shortest_attr, modeled_attr)
)

In [None]:
# Save results_df to objective_functions.csv in the calibration folder; reset_index turns the index into a regular column first
# NOTE(review): results_df is not created in any visible cell — confirm where it comes from
results_df.reset_index().to_csv(config['calibration_fp']/'objective_functions.csv',index=False)

In [None]:
# Reload the saved objective function table from the calibration folder
results_df = pd.read_csv(config['calibration_fp']/'objective_functions.csv')

In [None]:
# Column means of results_df as a dict, with the 'index' entry removed
# NOTE(review): DataFrame.mean() needs every column to be numeric on pandas 2.x — confirm, or pass numeric_only=True
results_df.mean().drop('index').to_dict()

# Segmenting/labeling trips
We want to know which trip attributes are associated with higher or lower loss function values.

## Tree #1
We want to see whether the percent detour or any of the chosen route characteristics (those accounted for in the calibration process) contribute to the overlap value (modeled_jaccard).

In [None]:
# trip-level outcome columns joined with the chosen route attributes
trip_cols = ['chosen_length', 'chosen_detour', 'modeled_jaccard']
lane_cols = ['lane_0', 'lane_1', 'lane_2', 'lane_3']

tree_df = pd.concat([results_df[trip_cols], chosen_attr], axis=1)

# collapse the two multi-lane columns into one, then discard the per-lane columns
tree_df['lanes_above_1'] = tree_df['lane_2'].add(tree_df['lane_3'])
tree_df = tree_df.drop(columns=lane_cols)

First split is the chosen detour rate. 1,377 trips (over half) have a detour rate above 10.5% and this is associated with a bigger reduction in overlap. Past this it appears that trips that took a lot 

Second split is on chosen length and chosen detour rate. 

In [None]:
from sklearn import tree
from sklearn.tree import plot_tree

# every column except the target is a predictor
predictors = tree_df.drop(columns=['modeled_jaccard'])
X = predictors.values
y = tree_df['modeled_jaccard'].values

# shallow tree so the plot stays readable
clf = tree.DecisionTreeRegressor(max_depth=3, min_samples_split=50).fit(X, y)

fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
plot_tree(clf, feature_names=predictors.columns, filled=True, ax=ax)
plt.show()

In [None]:
# ordinary least squares on the same predictors used for the tree
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

y = tree_df['modeled_jaccard']
# statsmodels does not add an intercept on its own
X = sm.add_constant(tree_df.drop(columns=['modeled_jaccard']))

model_sm = sm.OLS(y, X).fit()
print(model_sm.summary())

In [None]:
# Show the 40 trips with the largest 'unsig_major_road_crossing' values
tree_df.sort_values('unsig_major_road_crossing',ascending=False).head(40)

In [None]:
# Fit a deeper regression tree on the one-hot encoded attributes.
# NOTE(review): one_hot_data and merged are only built in the commented-out
# cells further down in the visible source — confirm they exist before running.
X, y = one_hot_data.values, merged['impedance'].values
clf = tree.DecisionTreeRegressor(max_depth=5)
clf = clf.fit(X, y)
fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
plt.show()

# how to compare predicted values vs actual?
# (this question was an uncommented prose line, which made the whole cell a SyntaxError)
# In-sample R^2 = 1 - SS_res / SS_tot, computed on the same rows the tree was fit to
SS_res = ((y - clf.predict(X))**2).sum()
SS_tot = ((y - y.mean())**2).sum()
R2 = 1 - (SS_res/SS_tot)
R2

In [None]:
# with (config['calibration_fp']/"full_modeled_results.pkl").open('rb') as fh:
#     modeled_results = pickle.load(fh)
## Add Route Attributes
import impedance_calibration.src.summarize_route as summarize_route

# how each link attribute is summarized along a route
cols_to_summarize = {
    'facility_fwd': "category",
    'AADT': ("threshold",[10000]),
    'truck_pct': ("threshold",[5]),
    'speed': "category",
    'lanes': "category",
    'mixed_traffic_no_facil': "boolean",
    'mixed_traffic_w_facil': "boolean"
}

# set_index(inplace=True) consumes the key columns, so running this cell a
# second time raised KeyError; only index a frame that is not indexed yet.
link_keys = ['linkid','reverse_link']
turn_keys = ['source_linkid','source_reverse_link','target_linkid','target_reverse_link']
if list(links.index.names) != link_keys:
    links.set_index(link_keys,inplace=True)
if list(turns.index.names) != turn_keys:
    turns.set_index(turn_keys,inplace=True)
# #unit conversions
#links['length_mi'] = (links['length_ft'] / 5280).round(2)
#links['ascent_ft'] = (links['ascent_m'] * 3.28084).round(0)
#links.drop(columns=['length_ft','ascent_m'],inplace=True)


def _modeled_route_summary(trip_set):
    """Summarize the 'modeled_edges' route of every trip in trip_set."""
    per_trip = [
        summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns)
        for key, item in trip_set.items()
    ]
    return summarize_route.procees_summary_results(per_trip,config['projected_crs_epsg'])


# NOTE(review): test_summary is built from full_set, not a test set — confirm
# this is intended, since it is concatenated with the training summary below.
test_summary = _modeled_route_summary(full_set)
# mid-cell expressions are discarded by the notebook, so print the statistics
print(test_summary.drop(columns=['tripid','geometry']).describe())

# the training summary used to be computed twice back to back; once is enough
train_summary = _modeled_route_summary(train_set)
print(train_summary.drop(columns=['tripid','geometry']).describe())

summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')

In [None]:
# Display the objective function table
results_df

## Calculate the percent change in impedance at the link level for visualization

In [None]:
# List the columns available on links before computing the impedance change
links.columns

In [None]:
# work on a copy so links itself is left untouched
impedance_change = links.copy()

# proportional difference between the link cost and the travel time, to 3 decimals
travel_time = impedance_change['travel_time_min']
cost_minus_time = impedance_change['link_cost'] - travel_time
impedance_change['imp_prop'] = (cost_minus_time / travel_time).round(3)

# keep only the rows where reverse_link is False
is_forward_link = impedance_change['reverse_link'].eq(False)
impedance_change = impedance_change[is_forward_link]

In [None]:
# TODO: automate the plot generation for when QGIS isn't available
# Quick look at the links colored by imp_prop
impedance_change.plot('imp_prop')

In [None]:
# Write the link-level impedance change to network_impedance_change.gpkg in the calibration folder
impedance_change.to_file(config['calibration_fp']/"network_impedance_change.gpkg")

In [None]:
# Summary statistics of the proportional impedance change
impedance_change['imp_prop'].describe()

In [None]:
# Distinct imp_prop values in ascending order
sorted(impedance_change['imp_prop'].unique())

In [None]:
# True if at least one link has an imp_prop of exactly 0 (after the 3-decimal rounding)
(impedance_change['imp_prop']==0).any()

In [None]:
# Number of links per imp_prop value; imp_prop was already rounded to 3 decimals when it was created, so this round is a no-op
impedance_change.imp_prop.round(3).value_counts()

# Plot distribution of overlap values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#loss_data = pd.DataFrame({'loss_shortest_full':loss_shortest_full,'loss_full':loss_full})
#ax = loss_full_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# (column in `full`, opacity, legend label, colour) for each overlaid histogram
overlap_layers = (
    ('shortest', 0.5, 'Shortest Path Overlap', 'grey'),
    ('impedance', 0.3, 'Calibrated Overlap', 'blue'),
)

plt.figure(figsize=(12, 12))
for column, opacity, legend_label, colour in overlap_layers:
    plt.hist(full[column], bins=20, alpha=opacity, label=legend_label, color=colour)

# axis labels; the trip count goes in the y label
plt.xlabel('Overlap', fontsize=22)
plt.ylabel(f'Frequency (N={full.shape[0]})', fontsize=22)
#plt.title('Histogram of Training Losses', fontsize=16)
#plt.legend(title='Tra Overlap', fontsize=22, title_fontsize=22)

# tick labels use the same font size as the axis labels
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)

plt.show()

In [None]:
# import pandas as pd
# loss_data = pd.DataFrame({'loss_shortest_train':loss_shortest_train,'loss_train':loss_train})

# import matplotlib.pyplot as plt
# # Create the histogram
# #ax = loss_train_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# # Create the histogram
# plt.figure(figsize=(12, 12))
# plt.hist(loss_data['loss_shortest_train'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
# plt.hist(loss_data['loss_train'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

# # Adding labels, title, and legend with font size adjustments
# plt.xlabel('Overlap', fontsize=22)
# plt.ylabel('Frequency', fontsize=22)
# #plt.title('Histogram of Training Losses', fontsize=16)
# plt.legend(title='Training Overlap', fontsize=22, title_fontsize=22)

# # Adjusting the font size of the tick labels
# plt.xticks(fontsize=22)
# plt.yticks(fontsize=22)

# # Show the plot
# plt.show()

In [None]:
# loss_data = pd.DataFrame({'loss_shortest_test':loss_shortest_test,'loss_test':loss_test})

# import matplotlib.pyplot as plt
# # Create the histogram
# #ax = loss_train_df.plot.hist(stacked=True, bins=20, figsize=(12, 12), color=['grey', 'lightgrey'])

# # Create the histogram
# plt.figure(figsize=(12, 12))
# plt.hist(loss_data['loss_shortest_test'], bins=20, alpha=0.5, label='Shortest Path Overlap', color='grey')
# plt.hist(loss_data['loss_test'], bins=20, alpha=0.3, label='Calibrated Overlap', color='blue')

# # Adding labels, title, and legend with font size adjustments
# plt.xlabel('Overlap', fontsize=22)
# plt.ylabel('Frequency', fontsize=22)
# #plt.title('Histogram of Training Losses', fontsize=16)
# plt.legend(title='Testing Overlap', fontsize=22, title_fontsize=22)

# # Adjusting the font size of the tick labels
# plt.xticks(fontsize=22)
# plt.yticks(fontsize=22)

# # Show the plot
# plt.show()

# QAQC (separate into a separate notebook)

In [None]:
# with (config['calibration_fp']/"full_modeled_results.pkl").open('rb') as fh:
#     modeled_results = pickle.load(fh)
# ## Add Route Attributes
# import summarize_route
# cols_to_summarize = {
#     'facility_fwd': "category",
#     'AADT': ("threshold",[10000]),
#     'truck_pct': ("threshold",[5]),
#     'here_speed': "category",
#     'lanes': "category",
#     'mixed_traffic_no_facil': "boolean",
#     'mixed_traffic_w_facil': "boolean"
# }
# links.set_index(['linkid','reverse_link'],inplace=True)
# turns.set_index(['source_linkid','source_reverse_link','target_linkid','target_reverse_link'],inplace=True)
# links.columns
# # #unit conversions
# #links['length_mi'] = (links['length_ft'] / 5280).round(2)
# #links['ascent_ft'] = (links['ascent_m'] * 3.28084).round(0)
# #links.drop(columns=['length_ft','ascent_m'],inplace=True)
# test_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in full_set.items()]
# test_summary = summarize_route.procees_summary_results(test_summary,config['projected_crs_epsg'])
# test_summary.drop(columns=['tripid','geometry']).describe()
# train_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in train_set.items()]
# train_summary = summarize_route.procees_summary_results(train_summary,config['projected_crs_epsg'])
# train_summary.drop(columns=['tripid','geometry']).describe()
# train_summary = [summarize_route.route_attributes(key,item,'modeled_edges',cols_to_summarize,links,turns) for key, item in train_set.items()]
# train_summary = summarize_route.procees_summary_results(train_summary,config['projected_crs_epsg'])
# train_summary.drop(columns=['tripid','geometry']).describe()
# summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')
# summary.columns
# summary = pd.concat([test_summary,train_summary],ignore_index=True)
# summary.to_file(config['calibration_fp']/"route_attributes.gpkg",layer='modeled')




# # Regression Trees
# from sklearn import tree

# testing = pd.read_csv(config['calibration_fp']/'testing_results.csv')
# training = pd.read_csv(config['calibration_fp']/'training_results.csv')

# #assume that keys are in the right order?
# loss_df = pd.concat([testing,training],ignore_index=True)
# #import trip and user characteristics
# trips_df = pd.read_pickle(cycleatl_fp/"trips_3.pkl")
# users_df = pd.read_pickle(cycleatl_fp/"users_1.pkl")
# trips_df.reset_index(drop=True,inplace=True)
# #import route attributes
# matched_summary = gpd.read_file(config['calibration_fp']/"route_attributes.gpkg",layer="matched")
# shortest_summary = gpd.read_file(config['calibration_fp']/"route_attributes.gpkg",layer="shortest")
# #consolidate trip types
# trips_df.loc[trips_df['trip_type']=='other','trip_type'] = 'Other'
# trips_df.loc[trips_df['trip_type']=='Work-related','trip_type'] = 'Work-Related'
# trips_df['trip_type'].value_counts()
# #replace userid with just the first one
# def take_first(x):
#     if isinstance(x,list):
#         return x[0]
#     return x
# users_df['userid'] = users_df['userid'].apply(take_first)
# #consolidate trip types
# trips_df.loc[trips_df['trip_type']=='other','trip_type'] = 'Other'
# trips_df.loc[trips_df['trip_type']=='Work-related','trip_type'] = 'Work-Related'
# trips_df['trip_type'].value_counts()




# ## Tree #1
# - First tree is on the non-null variables
# - The dist. between shortest and impedance were similar and so are the trees
# - Shorter trips better explained by impedance/shortest path which makes sense
#     - Use this to split longer trips? and retrain?
# - Shopping is the only significant trip type variable
# - Speed above 9 mph is usually better explained by impedance
# ## Tree #1
# - First tree is on the non-null variables
# - The dist. between shortest and impedance were similar and so are the trees
# - Shorter trips better explained by impedance/shortest path which makes sense
#     - Use this to split longer trips? and retrain?
# - Shopping is the only significant trip type variable
# - Speed above 9 mph is usually better explained by impedance
# nonulls = ['trip_type','length_mi','avg_speed_mph','(0,4]_prop', '(4,8]_prop',
#        '(8,inf]_prop', 'AADT_10000_prop', 'facility_fwd_bike lane_prop',
#        'facility_fwd_cycletrack_prop', 'facility_fwd_multi use path_prop',
#        'facility_fwd_sharrow_prop', 'here_speed_1_prop', 'here_speed_2_prop',
#        'here_speed_3_prop', 'here_speed_4_prop', 'lanes_1_prop',
#        'lanes_2_prop', 'lanes_3_prop', 'left', 'right', 'signalized',
#        'straight', 'truck_pct_5_prop', 'uturn']
# nonulls_tree_df = merged[nonulls]
# from sklearn.tree import plot_tree

# #convert nominal categorical to numeric
# one_hot_data = pd.get_dummies(nonulls_tree_df,drop_first=True)
# one_hot_data.columns
# X, y = one_hot_data.values, merged['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=3,min_samples_split=50)
# clf = clf.fit(X, y)
# fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
# plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
# plt.show()
# X, y = one_hot_data.values, merged['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=5)
# clf = clf.fit(X, y)
# fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
# plot_tree(clf, feature_names=one_hot_data.columns, filled=True, ax=ax)
# plt.show()
# how to compare predicted values vs actual?
# SS_res = ((y - clf.predict(X))**2).sum()
# SS_tot = ((y - y.mean())**2).sum()
# R2 = 1 - (SS_res/SS_tot)
# R2
# ## Tree #2
# This one takes the previous variables and adds the user characteristics. The sample size is smaller due to null values.

# - Looks like trip distance is still the dominant one here, really think i should start there
# - When removing trip distance and avg speed, trip type and age 55+ are the best

# **NOTE:** just noticed that age should not be dummies, need to fix that and try again
# import json
# user_data_definitions = json.load((Path.home()/'Documents/GitHub/cycleatlanta/user_data_definition.json').open('rb'))

# #add the 55+ column
# user_data_definitions['age']['6'] = '55+'
# #income has too many nulls
# tree_cols = ['age','gender','rider_history','rider_type'] + nonulls #,'total_distance_ft','avg_speed_mph','count']#,'count']#[,'cycling_freq'
# tree_df = merged[tree_cols]

# #use to detect null values
# isnull = ((tree_df == -1) | (tree_df == 'NULL'))

# #TODO do cross-sectionals to see which combination results in the most retained entries
# #remove rows with null values
# tree_df = tree_df[(isnull==False).all(axis=1)]

# loss_vals = merged.loc[tree_df.index]
# get_factor = ['age','rider_history','rider_type']
# # just fyi
# # select_max_cols = ['age','income','cycling_freq']
# # #select the min for these (i.e. strong and fearless over interested but...)
# # select_min_cols = ['rider_type','rider_history']

# for col in get_factor:
#     ivd = {v:k for k, v in user_data_definitions[col].items()}
#     tree_df[col] = tree_df[col].map(ivd)
# tree_df
# #this is where i left off
# #convert nominal categorical to numeric
# dummy_cols = ['gender','trip_type']
# one_hot_data = pd.get_dummies(tree_df[dummy_cols],drop_first=True)
# comb = pd.concat([tree_df.drop(columns=dummy_cols),one_hot_data],ignore_index=False,axis=1)
# comb
# X, y = comb.values, loss_vals['impedance'].values
# clf = tree.DecisionTreeRegressor(max_depth=3,min_samples_split=50)
# clf = clf.fit(X, y)
# #tree.plot_tree(clf,feature_names=one_hot_data.columns)
# # Visualize the tree with higher resolution
# plt.figure(figsize=(20, 10), dpi=300)
# from sklearn.tree import plot_tree
# plot_tree(clf, feature_names=comb.columns, filled=True)
# plt.show()
# # how to compare predicted values vs actual?
# SS_res = ((y - clf.predict(X))**2).sum()
# SS_tot = ((y - y.mean())**2).sum()
# R2 = 1 - (SS_res/SS_tot)
# R2
# What we actually want to know, is what variables help increase the overlap? So wouldn't that be more of an application for linear regression?

# clf
# Since the squared error is already pretty high, and as our histogram shows, most of these trips are not well predicted with the current impedances. The idea behind using regression trees here is to figure out whether any attributes help explain the poor overlap.

# Or we could just try knocking off 
# #correlation matrix
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Calculate the correlation matrix
# correlation_matrix = one_hot_data.corr()

# # Display the correlation matrix
# print("Correlation Matrix:")
# print(correlation_matrix)

# # Plot the correlation matrix using seaborn heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

# one_hot_data.corr()
# # Distribution of Loss Function
# # Export to get route attributes
# # Using BRouter Results
# To compare across we'll use Frechet distance. Will need to use next time.
# with (config['calibration_fp']/'brouter_links.txt').open('r') as file:
#     my_list = file.readlines()
#     # Remove any extra whitespace or newline characters
#     my_list = [line.strip() for line in my_list]
# len(my_list)
# geojsons = list((config['calibration_fp']/'GeoJSON_Out').glob('*.geojson'))
# both_ods = list(set.union(set(train_ods),set(test_ods)))
# len(both_ods)
# test_ods
# len(geojsons)
# #use the results dict combined with the geo dict one
# results_dict[(68166811, 8789832117)]
# #
# geojson_geos = []
# for geojson in geojsons:
#     geojson_geo = gpd.read_file(geojson).to_crs(config['projected_crs_epsg'])
#     geojson_geo = np.array(geojson_geo.geometry.item().coords)
#     # geojson_geo = [(x, y) for x, y, z in geojson_geo.coords]
#     # geojson_geo = LineString(geojson_geo)
#     geojson_geos.append(geojson_geo)
# frechet_distance = similaritymeasures.frechet_dist(chosen_coords,modeled_coords)
# #import and compare frechet distance across them with geodict?
# list(geojson_geo.coords)