# Calibration QAQC
- Visualize calibration routes and compare to the chosen and shortest routes
- Trip-specific impedance routing to see if chosen route can be found
- Test different objective functions
- Try using coordinates in case the map-matched trace is incorrect

In [None]:
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import networkx as nx
from stochopy.optimize import minimize
from tqdm import tqdm
import similaritymeasures
import random
import matplotlib.pyplot as plt
from shapely.ops import Point, MultiLineString, LineString
from importlib import reload
import datetime
from scipy.spatial.distance import directed_hausdorff
np.set_printoptions(suppress=True)

from bikewaysim.paths import config, stadia_toner, maptiler_streets
from bikewaysim.impedance_calibration import stochastic_optimization, speedfactor
from bikewaysim.network import modeling_turns
from bikewaysim.routing import rustworkx_routing_funcs

# from step_1_calibration_experiments import all_calibrations, full_model

# Import relevant files

In [None]:
# Load the objects returned by import_calibration_network for this config.
links, turns, length_dict, geo_dict, turn_G = rustworkx_routing_funcs.import_calibration_network(config)

# Per-trip calibration inputs (has loss values for shortest path).
ready_for_calibration_fp = config['calibration_fp'] / 'ready_for_calibration_stats.pkl'
with ready_for_calibration_fp.open('rb') as fh:
    full_set = pickle.load(fh)
full_ods = stochastic_optimization.match_results_to_ods_w_year(full_set)

# for adding coordinates to the viz part
with (config['cycleatl_fp'] / "rdp.pkl").open('rb') as fh:
    coords = pickle.load(fh)

# Trip and user tables, re-indexed by their id columns.
with (config['cycleatl_fp'] / "trips_2.pkl").open('rb') as fh:
    trips = pickle.load(fh)
with (config['cycleatl_fp'] / "users_2.pkl").open('rb') as fh:
    users = pickle.load(fh)
trips = trips.set_index('tripid')
users = users.set_index('userid')

# Keep only the trips present in full_set, then only the users who made them.
trips = trips.loc[list(full_set)]
users = users.loc[users.index.isin(set(trips['userid']))]

# tripid -> userid lookup
user_map = trips['userid'].to_dict()

In [None]:
# Calibration result of every calibrated model, keyed by the pickle's file stem.
calibration_results_fps = (config['calibration_fp'] / 'user_calibration_results').glob('*.pkl')
calibration_results = {}
for calibration_results_fp in calibration_results_fps:
    result_key = calibration_results_fp.stem
    with calibration_results_fp.open('rb') as fh:
        loaded_result = pickle.load(fh)
    calibration_results[result_key] = loaded_result

# Post-calibration routing and loss values of every calibrated model,
# keyed the same way (file stem).
post_calibration_fps = (config['calibration_fp'] / 'user_post_calibration_loss').glob('*.pkl')
post_calibration = {}
for post_calibration_fp in post_calibration_fps:
    result_key = post_calibration_fp.stem
    with post_calibration_fp.open('rb') as fh:
        loaded_result = pickle.load(fh)
    post_calibration[result_key] = loaded_result


In [None]:
# what do the betas look like?
# Join the calibrated coefficients with the aggregated loss values of the same
# user / run / calibration so both can be inspected side by side.
merge_keys = ['userid', 'run_number', 'calibration']
betas_vals = stochastic_optimization.post_calibration_betas(user=True)
aggregated_loss = stochastic_optimization.post_calibration_aggregated(user=True)
merged = pd.merge(betas_vals, aggregated_loss, how='inner', on=merge_keys)

# Exploring the calibration results for a specific user

In [None]:
# TEMP
# restrict to one user and model name for exploration
userid = 21
# NOTE(review): the model name ends with a space and is matched below as a
# substring of each result key -- confirm the trailing space is intentional.
model_name = 'jaccard_buffer_mean '
# model_name = 'break stuff'


def _user_key(raw_userid):
    """Format a user id like the prefix of the result keys (e.g. 21.0 -> '21').

    Returns None when the id is missing or cannot be read as an integer, so it
    can never match a modeled user.
    """
    try:
        return str(int(raw_userid))
    except (TypeError, ValueError, OverflowError):
        return None


# Result keys start with "<userid>_"; keep the entries of the selected user whose
# key also contains the model name.
user_prefix = str(userid)
post_calibration_subset = {
    key: item
    for key, item in post_calibration.items()
    if key.split('_')[0] == user_prefix and model_name in key
}
calibration_results_subset = {
    key: item
    for key, item in calibration_results.items()
    if key.split('_')[0] == user_prefix and model_name in key
}

# retrieves the trips to look at so that we don't sample something out of range
# Users with at least one post-calibration result (string prefix of each key).
modeled_users = {key.split('_')[0] for key in post_calibration}
# The modeled user ids are strings, while the ids stored in user_map are
# presumably numeric (another cell converts them with str(int(...))); normalise
# before the membership test, otherwise no trip would ever be selected.
subset_trips = [tripid for tripid in trips.index if _user_key(user_map[tripid]) in modeled_users]

Objective value variation

In [None]:
# One row per calibration run of the selected user: objective value, number of
# function evaluations, number of iterations, and evaluations per iteration
# (stored as 'particles').
run_results = [item['results'] for item in calibration_results_subset.values()]
optimums = pd.DataFrame(
    [(res.fun, res.nfev, res.nit, res.nfev / res.nit) for res in run_results],
    columns=['obj', 'nfev', 'nit', 'particles'],
)

# Histogram of the objective magnitude in 0.05-wide bins over [0, 1].
objective_bins = np.arange(0, 1.05, 0.05)
optimums['obj'].abs().hist(color='grey', bins=objective_bins)
plt.xlabel('Objective Function Value')
plt.ylabel('Frequency')
plt.xlim(0, 1)
plt.title(f'Objective Function Values for User {userid} (N={len(optimums)})')
plt.show()

## Objective function against particle size
The larger the particle size, the better the chance that the optimizer finds a high objective value.

In [None]:
# Work with the magnitude of the objective value from here on, then plot it
# against the evaluations-per-iteration ('particles') of each run.
optimums = optimums.assign(obj=optimums['obj'].abs())
optimums.plot(kind='scatter', x='particles', y='obj')

# Scatter plot of coefficient values vs optimization results


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import math

# TODO make the x axis the same
# One scatter panel per coefficient: the calibrated coefficient value on the
# x-axis against the objective value ('jaccard_buffer_mean') on the y-axis, for
# the rows of `merged` belonging to the selected user.

# `userid` is compared as a string against merged['userid'].
user_rows = merged['userid'] == f"{userid}"
y = merged.loc[user_rows, 'jaccard_buffer_mean']
x_columns = ['2lpd', '3+lpd', '(30,40] mph',
       '(40,inf) mph', '[4k,10k) aadt', '[10k,inf) aadt', '[4,6) grade',
       '[6,inf) grade', 'bike lane', 'cycletrack', 'multi use path',
       'unsig_crossing']

# Grid layout: a fixed number of columns and as many rows as needed.
n_cols = 4
n_rows = math.ceil(len(x_columns) / n_cols)

# NOTE(review): the figure width scales with n_rows and the height with n_cols;
# confirm this is the intended aspect.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10 * n_rows , 4 * n_cols))
axes = axes.flatten()  # 1-D view so panels can be addressed by position

for i, x_col in enumerate(x_columns):
    panel = axes[i]
    panel.scatter(merged.loc[user_rows, x_col], y, label=x_col)
    panel.set_xlabel(x_col)
    panel.set_ylabel('Objective Function')
    # panel.set_title(f' vs {x_col}')
    # panel.legend()

# Remove the panels left over when the grid has more slots than coefficients.
for j in range(len(x_columns), len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# Examining across users

## Objective function value and coefficient variation across users for the best model

In [None]:
# subset to the full model
# Coefficient columns of the full model; a run is kept only when every one of
# these columns has a value.
cols = [
    '2lpd',
    '3+lpd',
    '(30,40] mph',
    '(40,inf) mph',
    '[4k,10k) aadt',
    '[10k,inf) aadt',
    '[4,6) grade',
    '[6,inf) grade',
    'bike lane',
    'cycletrack',
    'multi use path',
    'unsig_crossing',
]
merged = merged.dropna(subset=cols)

In [None]:
# For every user keep the run with the highest 'jaccard_buffer_mean', and record
# how many runs that user has in `merged`.
runs_by_user = merged.groupby('userid')
idxmax = runs_by_user['jaccard_buffer_mean'].idxmax()
best_model = merged.loc[idxmax].assign(
    n_runs=lambda best: best['userid'].map(runs_by_user.size())
)

Clustering based on calibrated coefficients for each user?

In [None]:
# Future work, these results seem pretty meaningless right now
# #DBSCAN
# from sklearn.cluster import DBSCAN, KMeans
# X = best_model[cols].values
# clustering = DBSCAN(eps=2, min_samples=2).fit(X)
# best_model['label'] = clustering.labels_
# print(len(set(clustering.labels_)),'clusters')
# print(best_model['label'].value_counts())

# from sklearn.metrics import silhouette_score

# # Calculate the silhouette score (higher is better)
# sil_score = silhouette_score(X, clustering.labels_)
# print(f'Silhouette Score: {sil_score}')


# #K-MEANS
# #Set number of clusters (this can be tuned or determined using methods like the elbow method)
# n_clusters = 5

# # Apply KMeans clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# clusters = kmeans.fit_predict(X)
# best_model['label'] = clusters
# print(len(set(clusters)),'clusters')
# print(best_model['label'].value_counts())


# #Hierarchical Clustering


# Histogram of User Coefficients (in progress)

In [None]:
# get the shortest path ones
with (config['calibration_fp'] / 'ready_for_calibration_stats.pkl').open('rb') as fh:
    full_set = pickle.load(fh)

# One (userid, shortest-path overlap) row per trip, averaged per user.
trip_users = trips['userid']
shortest = pd.DataFrame(
    [[trip_users.loc[tripid], item['shortest_jaccard_buffer']] for tripid, item in full_set.items()],
    columns=['userid', 'jaccard_buffer'],
)
shortest = (
    shortest.groupby('userid')['jaccard_buffer']
    .mean()
    .rename('shortest_jaccard_buffer_mean')
)

# Re-key the per-user means as integer-formatted strings before joining them
# onto best_model's 'userid' column.
shortest.index = [str(int(uid)) for uid in shortest.index]
best_model = best_model.merge(shortest, left_on='userid', right_index=True)

In [None]:
#TODO make this into a function

import pandas as pd
import matplotlib.pyplot as plt
#aggregated results and overlap figures
mean_values = []  # not used in this cell

# Overlaid histograms of the per-user overlap: shortest path vs. best calibrated run.
# NOTE(review): each histogram derives its own 20 bins from its own data, so the
# two overlays do not necessarily share bin edges.
label_size = 22
overlap_series = [
    # (column, legend label, colour, alpha)
    ('shortest_jaccard_buffer_mean', 'Shortest Path Overlap', 'grey', 0.5),
    ('jaccard_buffer_mean', 'Calibrated Overlap', 'blue', 0.3),
]

plt.figure(figsize=(12, 12))
for column, legend_label, colour, opacity in overlap_series:
    plt.hist(best_model[column], bins=20, alpha=opacity, label=legend_label, color=colour)

# Axis labels, title and legend
plt.xlabel('Overlap', fontsize=label_size)
plt.ylabel(f'Frequency (N={best_model.shape[0]})', fontsize=label_size)
plt.title('User by User (best result)', fontsize=16)
plt.legend(title='Jaccard Index', fontsize=label_size, title_fontsize=label_size)
plt.ylim(0, 100)

# Tick label sizes
plt.xticks(fontsize=label_size)
plt.yticks(fontsize=label_size)

# Show the plot
# plt.savefig(config['calibration_fp']/'calibration_performance'/(post_calibration_result_fp.stem + '.png'))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import math

#TODO make bin sizes equal

# Histogram of every calibrated coefficient in `cols` across the rows of
# best_model, one panel per coefficient, all panels sharing the same x-range,
# y-range and bin edges.

# Shared x-range of every panel (fixed rather than data-driven).
# min_x = best_model[cols].min().min()
# max_x = best_model[cols].max().max()
min_x = -1
max_x = 5

# Grid layout: a fixed number of columns and as many rows as needed.
n_cols = 4
n_rows = math.ceil(len(cols) / n_cols)

# NOTE(review): the figure width scales with n_rows and the height with n_cols;
# confirm this is the intended aspect.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10 * n_rows , 4 * n_cols))
axes = axes.flatten()  # 1-D view so panels can be addressed by position

# Bin edges every 0.1 from min_x to max_x inclusive. They are generated on an
# integer grid (value * multiplier) and scaled back so the edges do not pick up
# floating-point drift. `end` is one increment past max_x because range()
# excludes its stop value.
multiplier = 10
increment = int(0.1 * multiplier)
start = int(min_x * multiplier)
end = int(max_x * multiplier) + increment
bin_edges = [x / multiplier for x in range(start, end, increment)]

for i, col in enumerate(cols):
    axes[i].hist(best_model[col], label=col, bins=bin_edges, color='grey')
    axes[i].set_xlabel(col)
    axes[i].set_ylabel(f'Frequency (N={best_model.shape[0]})')
    axes[i].set_xlim(min_x, max_x)
    axes[i].set_ylim(0, 300)

# Remove the panels left over when the grid has more slots than coefficients.
for j in range(len(cols), len(axes)):
    fig.delaxes(axes[j])

# Add layout adjustments
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Display the list of coefficient column names held in `cols`.
cols