# Creating Clusters for Stochastic Optimization
---

Import packages

In [None]:
from pathlib import Path
import pickle
import pandas as pd
import geopandas as gpd
import json
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm
import contextily as cx

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-point speeds in one coordinate table.
# NOTE(review): in the visible notebook `coords` is only bound later, as a loop
# variable, so on a fresh kernel this cell depends on out-of-order execution.
speed_bin_width_mph = 2
plt.hist(coords['speed_mph'], bins=range(0, 105, speed_bin_width_mph), edgecolor='black')
plt.xlabel('Speed (mph)')
plt.ylabel('Frequency')
plt.title(f'Histogram of Speeds with Bin Size of {speed_bin_width_mph} mph')
plt.xlim(0, 30)  # show only 0-30 mph; bins above 30 mph are computed but not displayed
plt.show()

In [None]:
#%% remove trips with excessive speed

# '''
# These are ad hoc methods to remove obvious exercise trips

# if average gps speed reported is larger than 20 mph
# AND there are points above 30 mph then it's probably
# not on bike or is part of a very fast road ride

# Only about 90 trips affected by this
# '''

# avg_speed_max = 16
# max_speed_max = 30

# # TODO: this speed filter was never applied; also reconcile avg_speed_max = 16 with the 20 mph quoted in the note above
# test = coords.groupby('tripid')['speed_mph'].agg(['mean','max'])

# (test['mean'] > avg_speed_max).sum()
# (test['max'] > max_speed_max).sum()

Import cleaned traces

In [None]:
# Folder holding the exported trip files; later cells read from it as well.
export_fp = Path.home() / 'Downloads/cleaned_trips'

# Load the cleaned traces together with the trip summary table.
# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
with open(export_fp / 'cleaned_traces.pkl', 'rb') as traces_file:
    coords_dict, trips_df = pickle.load(traces_file)

In [None]:
#calculate min, avg, max for speed_mph and add to trips_df
# .loc is used for the boolean-mask assignment because .at only supports a single
# scalar row label; this also matches the .loc-based assignments later in the notebook.
for tripid, coords in tqdm(coords_dict.items()):
    is_this_trip = trips_df['tripid'] == tripid
    gps_speed = coords['speed_mph']
    trips_df.loc[is_this_trip, 'min_speed_mph_gps'] = gps_speed.min()
    trips_df.loc[is_this_trip, 'avg_speed_mph_gps'] = gps_speed.mean()
    trips_df.loc[is_this_trip, 'max_speed_mph_gps'] = gps_speed.max()

In [None]:
# Compare the GPS-derived speed columns (*_gps) with the existing speed columns,
# one panel per statistic. Legend labels are generated per panel so the min and max
# panels are no longer labelled as "Average".
fig, ax = plt.subplots(ncols=3, figsize=(20, 6))

speed_panels = [('min', 'Min'), ('avg', 'Average'), ('max', 'Max')]
for panel, (stat, stat_name) in zip(ax, speed_panels):
    panel.hist(trips_df[f'{stat}_speed_mph_gps'], bins=50, label=f'GPS {stat_name} Speed', color='skyblue', alpha=0.7)
    panel.hist(trips_df[f'{stat}_speed_mph'], bins=50, label=f'Calculated {stat_name} Speed', color='lightcoral', alpha=0.7)
    panel.set_title(f'Distribution of {stat_name} Speeds', fontsize=16)
    panel.set_xlabel('Speed (mph)', fontsize=14)
    panel.set_ylabel('Frequency', fontsize=14)
    panel.legend()

# Displaying the plot
plt.show()


In [None]:
# Load the raw (unfiltered) traces used for the Kalman filtering below.
# The pickle holds a pair; only its first element is kept.
with open(export_fp / 'raw_coords.pkl', 'rb') as raw_file:
    raw_coords_dict, _discarded = pickle.load(raw_file)
del _discarded

In [None]:
# Pick one trip at random to inspect. The draw is unseeded, so it changes on every run;
# uncomment the fixed id below for a repeatable example.
#tripid = 10803
tripid = trips_df['tripid'].sample(1).item()
# Take copies: the following cells overwrite speeds and add columns, which would
# otherwise modify the tables stored in the dictionaries and, for the .iloc slice,
# raise SettingWithCopyWarning.
kalman_trip = raw_coords_dict[tripid].copy()
cleaned_trip = coords_dict[tripid].iloc[1:,:].copy()

In [None]:
# Negative speeds are treated as missing in both the raw and the cleaned trace.
kalman_trip.loc[kalman_trip['speed_mph']<0,'speed_mph'] = np.nan
cleaned_trip.loc[cleaned_trip['speed_mph']<0,'speed_mph'] = np.nan
#make a time elapsed in seconds column (int() truncates to whole seconds)
#the first timestamp is looked up once instead of once per row
cleaned_start = cleaned_trip['datetime'].min()
cleaned_trip['time_elapsed'] = cleaned_trip['datetime'].apply(lambda stamp: int((stamp - cleaned_start).total_seconds()))

In [None]:
# Interactive map of the raw points with their hAccuracy_ft values.
accuracy_points = kalman_trip[['hAccuracy_ft','geometry']]
accuracy_points.explore()

In [None]:
# Overlay the raw points (red) and the filtered points (black) on a basemap.
fig, ax = plt.subplots(figsize=(20,15))
for points, colour, legend_label in (
    (kalman_trip, 'red', 'Raw Points'),
    (cleaned_trip, 'black', 'Filtered Points'),
):
    points.plot(ax=ax, color=colour, label=legend_label)
ax.set(xlabel='X', ylabel='Y', title=f'Raw vs Filtered for Trip {tripid}')
ax.legend()
# the basemap is drawn in the raw trace's coordinate reference system
cx.add_basemap(ax,crs=kalman_trip.crs,source=cx.providers.CartoDB.Positron)
plt.show()

Try a Kalman filter

In [None]:
from pykalman import KalmanFilter
import numpy.ma as ma

# Kalman smoothing of the raw GPS speed with a [velocity, acceleration, jerk] state
# advanced in steps of one second.

#convert mph to fps
kalman_trip['speed_fps'] = kalman_trip['speed_mph'] * 1.46667

#make a time elapsed in seconds column (int() truncates to whole seconds)
#the first timestamp is looked up once instead of once per row
trip_start = kalman_trip['datetime'].min()
kalman_trip['time_elapsed'] = kalman_trip['datetime'].apply(lambda stamp: int((stamp - trip_start).total_seconds()))

#create nan entries to fill in missing data
fill = pd.DataFrame(data={'time_elapsed':range(0,kalman_trip['time_elapsed'].max()+1)})
fill = pd.merge(fill,kalman_trip,on='time_elapsed',how='left')

#convert our observations to numpy array
observations = fill['speed_fps'].to_numpy()

#use np.ma to mask missing data
observations = ma.masked_array(observations , mask=np.isnan(observations))

# initial state: zero velocity, zero acceleration, zero jerk
initial_state_mean = [0,0,0]

#these are the kinematics of how we're moving (time step of 1)
transition_matrix = [[1,1,1/2], # velocity
                     [0,1,1], # acceleration
                     [0,0,1]] # jerk

observation_matrix = [1,0,0] # we only measure velocity

# NOTE(review): 2 * 1.46667 looks like 2 mph converted to ft/s, used directly as the
# observation variance (not squared) -- confirm this is the intended noise level
observation_covariance = np.eye(1) * (2 * 1.46667)

#using just this estimate a kalman filter (only the transition covariance is listed in em_vars)
kf1 = KalmanFilter(transition_matrices = transition_matrix,
                  observation_matrices = observation_matrix,
                  initial_state_mean = initial_state_mean,
                  observation_covariance = observation_covariance,
                  em_vars=['transition_covariance']
                  )

kf1 = kf1.em(observations, n_iter=20)
(smoothed_state_means, smoothed_state_covariances) = kf1.smooth(observations)

#convert back to dataframe, exposing the row position as the 'time' column
smoothed = pd.DataFrame(smoothed_state_means,columns=['v','a','j'])
smoothed['v_mph'] = smoothed['v'] / 1.46667
smoothed = smoothed.reset_index().rename(columns={'index':'time'})

In [None]:
# from pykalman import KalmanFilter
# import numpy.ma as ma

# #convert mph to fps
# kalman_trip['speed_fps'] = kalman_trip['speed_mph'] * 1.46667

# #make a time elapsed in seconds column (and round to nearest second)
# kalman_trip['time_elapsed'] = kalman_trip['datetime'].apply(lambda x: int((x - kalman_trip['datetime'].min()).total_seconds()))

# #create nan entries to fill in missing data
# fill = pd.DataFrame(data={'time_elapsed':range(0,kalman_trip['time_elapsed'].max()+1)})
# fill = pd.merge(fill,kalman_trip,on='time_elapsed',how='left')

# #convert our observations to numpy array
# observations = fill['speed_fps'].to_numpy()

# #use np.ma to mask missing data
# observations = ma.masked_array(observations , mask=np.isnan(observations))

# # the initial state of the cyclist (traveling at constant speed starting a displacement zero)
# initial_state_mean = [0,observations[0],0,0]

# #these are the kinematics of how we're moving
# transition_matrix = [[1,1,1/2,1/6], # position/diplacement
#                      [0,1,1,1/2], # velocity
#                      [0,0,1,1], # acceleration
#                      [0,0,0,1]] # jerk

# observation_matrix = [0,1,0,0] # we only measure velocity

# observation_covariance = np.eye(1) * (2 * 1.46667)

# #using just this estimate a kalman filter
# kf1 = KalmanFilter(transition_matrices = transition_matrix,
#                   observation_matrices = observation_matrix,
#                   initial_state_mean = initial_state_mean,
#                   observation_covariance = observation_covariance,
#                   em_vars=['transition_covariance']
#                   )

# kf1 = kf1.em(observations, n_iter=20)
# (smoothed_state_means, smoothed_state_covariances) = kf1.smooth(observations)

# #convert back to dataframe
# smoothed = pd.DataFrame(smoothed_state_means,columns=['x','v','a','j'])
# smoothed['v_mph'] = smoothed['v'] / 1.46667
# smoothed.reset_index(inplace=True)
# smoothed.rename(columns={'index':'time'},inplace=True)

In [None]:
# Speed, acceleration and jerk panels for the smoother whose state is
# [velocity, acceleration, jerk]; that state has no displacement to plot.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
speed_ax, accel_ax, jerk_ax = axs

# Top panel: measured, smoothed and calculated speed on a shared time axis
speed_series = (
    (kalman_trip['time_elapsed'], kalman_trip['speed_mph'], 'Measured Speed'),
    (smoothed['time'], smoothed['v_mph'], 'Smoothed Speed'),
    (cleaned_trip['time_elapsed'], cleaned_trip['speed_mph'], 'Calculated Speed'),
)
for elapsed, speed, series_label in speed_series:
    speed_ax.plot(elapsed, speed, label=series_label, alpha=0.5)
speed_ax.set(xlabel='Time (s)', ylabel='Speed (mph)', title=f'Speed vs. Time for Trip {tripid}')
speed_ax.legend()

# Middle panel: smoothed acceleration
accel_ax.plot(smoothed['time'], smoothed['a'], alpha=0.5)
accel_ax.set(
    xlabel='Time (s)',
    ylabel='Acceleration (ft/s**2)',
    title=f'Acceleration vs. Time using Kalman Smoothing for Trip {tripid}',
)

# Bottom panel: smoothed jerk
jerk_ax.plot(smoothed['time'], smoothed['j'], alpha=0.5)
jerk_ax.set(
    xlabel='Time (s)',
    ylabel='Jerk (ft/s**3)',
    title=f'Jerk vs. Time using Kalman Smoothing for Trip {tripid}',
)

plt.tight_layout()
plt.show()

In [None]:
from pykalman import KalmanFilter
import numpy.ma as ma

# Kalman smoothing of the raw GPS speed with a [position, velocity, acceleration]
# state advanced in steps of one second.

#convert mph to fps
kalman_trip['velocity'] = kalman_trip['speed_mph'] * 1.46667

#make a time elapsed in seconds column (int() truncates to whole seconds)
#the first timestamp is looked up once instead of once per row
trip_start = kalman_trip['datetime'].min()
kalman_trip['time_elapsed'] = kalman_trip['datetime'].apply(lambda stamp: int((stamp - trip_start).total_seconds()))

#create nan entries to fill in missing data
fill = pd.DataFrame(data={'time_elapsed':range(0,kalman_trip['time_elapsed'].max()+1)})
fill = pd.merge(fill,kalman_trip,on='time_elapsed',how='left')

#convert our observations to numpy array
observations = fill['velocity'].to_numpy()

#use np.ma to mask missing data
observations = ma.masked_array(observations , mask=np.isnan(observations))

# initial state: zero displacement, zero velocity, zero acceleration
initial_state_mean = [0,0,0]

#these are the kinematics of how we're moving (time step of 1)
transition_matrix = [[1,1,1/2], # position/displacement
                     [0,1,1], # velocity
                     [0,0,1]] # acceleration

observation_matrix = [0,1,0] # we only measure velocity

# NOTE(review): 2 * 1.46667 looks like 2 mph converted to ft/s, used directly as the
# observation variance (not squared) -- confirm this is the intended noise level
observation_covariance = np.eye(1) * (2 * 1.46667)

#using just this estimate a kalman filter (only the transition covariance is listed in em_vars)
kf1 = KalmanFilter(transition_matrices = transition_matrix,
                  observation_matrices = observation_matrix,
                  initial_state_mean = initial_state_mean,
                  observation_covariance = observation_covariance,
                  em_vars=['transition_covariance']
                  )

kf1 = kf1.em(observations, n_iter=5)
(smoothed_state_means, smoothed_state_covariances) = kf1.smooth(observations)

#convert back to dataframe, exposing the row position as the 'time' column
smoothed = pd.DataFrame(smoothed_state_means,columns=['x','v','a'])
smoothed['v_mph'] = smoothed['v'] / 1.46667
smoothed = smoothed.reset_index().rename(columns={'index':'time'})
smoothed.head()

In [None]:
# Displacement, speed and acceleration panels for the
# [position, velocity, acceleration] smoother.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
displacement_ax, speed_ax, accel_ax = axs

# Top panel: smoothed displacement, divided by 5280 to go from feet to miles
displacement_ax.plot(smoothed['time'], smoothed['x'] / 5280, label='Smoothed Displacement', alpha=0.5)
displacement_ax.set(
    xlabel='Time (s)',
    ylabel='Displacement (mi)',
    title=f'Displacement vs. Time for Trip {tripid}',
)

# Middle panel: measured, smoothed and calculated speed on a shared time axis
speed_series = (
    (kalman_trip['time_elapsed'], kalman_trip['speed_mph'], 'Measured Speed'),
    (smoothed['time'], smoothed['v_mph'], 'Smoothed Speed'),
    (cleaned_trip['time_elapsed'], cleaned_trip['speed_mph'], 'Calculated Speed'),
)
for elapsed, speed, series_label in speed_series:
    speed_ax.plot(elapsed, speed, label=series_label, alpha=0.5)
speed_ax.set(xlabel='Time (s)', ylabel='Speed (mph)', title=f'Speed vs. Time for Trip {tripid}')
speed_ax.legend()

# Bottom panel: smoothed acceleration
accel_ax.plot(smoothed['time'], smoothed['a'], alpha=0.5)
accel_ax.set(
    xlabel='Time (s)',
    ylabel='Acceleration (ft/s**2)',
    title=f'Acceleration vs. Time using Kalman Smoothing for Trip {tripid}',
)

plt.tight_layout()
plt.show()

# Just position and velocity

In [None]:
from pykalman import KalmanFilter
import numpy.ma as ma

# Kalman smoothing of the raw GPS speed with a [position, velocity] state
# advanced in steps of one second.

#convert mph to fps
kalman_trip['velocity'] = kalman_trip['speed_mph'] * 1.46667

#make a time elapsed in seconds column (int() truncates to whole seconds)
#the first timestamp is looked up once instead of once per row
trip_start = kalman_trip['datetime'].min()
kalman_trip['time_elapsed'] = kalman_trip['datetime'].apply(lambda stamp: int((stamp - trip_start).total_seconds()))

#create nan entries to fill in missing data
fill = pd.DataFrame(data={'time_elapsed':range(0,kalman_trip['time_elapsed'].max()+1)})
fill = pd.merge(fill,kalman_trip,on='time_elapsed',how='left')

#convert our observations to numpy array
observations = fill['velocity'].to_numpy()

#use np.ma to mask missing data
observations = ma.masked_array(observations , mask=np.isnan(observations))

# initial state: zero displacement, zero velocity
initial_state_mean = [0,0]

#these are the kinematics of how we're moving (time step of 1)
transition_matrix = [[1,1], # position/displacement
                     [0,1]] # velocity

observation_matrix = [0,1] # we only measure velocity

# NOTE(review): 2 * 1.46667 looks like 2 mph converted to ft/s, used directly as the
# observation variance (not squared) -- confirm this is the intended noise level
observation_covariance = np.eye(1) * (2 * 1.46667)

#using just this estimate a kalman filter (only the transition covariance is listed in em_vars)
kf1 = KalmanFilter(transition_matrices = transition_matrix,
                  observation_matrices = observation_matrix,
                  initial_state_mean = initial_state_mean,
                  observation_covariance = observation_covariance,
                  em_vars=['transition_covariance']
                  )

kf1 = kf1.em(observations, n_iter=5)
(smoothed_state_means, smoothed_state_covariances) = kf1.smooth(observations)

#convert back to dataframe, exposing the row position as the 'time' column
smoothed = pd.DataFrame(smoothed_state_means,columns=['x','v'])
smoothed['v_mph'] = smoothed['v'] / 1.46667
smoothed = smoothed.reset_index().rename(columns={'index':'time'})
smoothed.head()

In [None]:
# Displacement and speed panels for the [position, velocity] smoother.
fig, axs = plt.subplots(2, 1, figsize=(20, 15))
displacement_ax, speed_ax = axs

# Top panel: smoothed displacement, divided by 5280 to go from feet to miles
displacement_ax.plot(smoothed['time'], smoothed['x'] / 5280, label='Smoothed Displacement', alpha=0.5)
displacement_ax.set(
    xlabel='Time (s)',
    ylabel='Displacement (mi)',
    title=f'Displacement vs. Time for Trip {tripid}',
)

# Bottom panel: measured, smoothed and calculated speed on a shared time axis
speed_series = (
    (kalman_trip['time_elapsed'], kalman_trip['speed_mph'], 'Measured Speed'),
    (smoothed['time'], smoothed['v_mph'], 'Smoothed Speed'),
    (cleaned_trip['time_elapsed'], cleaned_trip['speed_mph'], 'Calculated Speed'),
)
for elapsed, speed, series_label in speed_series:
    speed_ax.plot(elapsed, speed, label=series_label, alpha=0.5)
speed_ax.set(xlabel='Time (s)', ylabel='Speed (mph)', title=f'Speed vs. Time for Trip {tripid}')
speed_ax.legend()

plt.tight_layout()
plt.show()

# Applying Kalman Filter for All Trips

In [None]:
def kalman(trip):

    #convert mph to fps
    kalman_trip['velocity'] = kalman_trip['speed_mph'] * 1.46667

    #make a time elapsed in seconds column
    kalman_trip['time_elapsed'] = kalman_trip['datetime'].apply(lambda x: int((x - kalman_trip['datetime'].min()).total_seconds()))
        
    #create nan entries to fill in missing data
    fill = pd.DataFrame(data={'time_elapsed':range(0,kalman_trip['time_elapsed'].max()+1)})
    fill = pd.merge(fill,kalman_trip,on='time_elapsed',how='left')

    #convert our observations to numpy array
    observations = fill['velocity'].to_numpy()

    #use np.ma to mask missing data
    observations = ma.masked_array(observations , mask=np.isnan(observations))

    # the initial state of the cyclist (traveling at constant speed)
    initial_state_mean = [0,observations[0],0,0]

    #these are the kinematics of how we're moving
    transition_matrix = [[1,1,1/2,1/6], # position/diplacement
                        [0,1,1,1/2], # velocity
                        [0,0,1,1], # acceleration
                        [0,0,0,1]] # jerk

    observation_matrix = [0,1,0,0]

    observation_covariance = np.eye(1) * np.eye(1) * (10 * 1.46667)**2

    #using just this estimate a kalman filter
    kf1 = KalmanFilter(transition_matrices = transition_matrix,
                    observation_matrices = observation_matrix,
                    initial_state_mean = initial_state_mean,
                    observation_covariance = observation_covariance,
                    em_vars=['transition_covariance']
                    )

    kf1 = kf1.em(observations, n_iter=5)
    (smoothed_state_means, smoothed_state_covariances) = kf1.smooth(observations)

    #convert back to dataframe
    smoothed = pd.DataFrame(smoothed_state_means,columns=['x','v','a','j'])
    smoothed['v_mph'] = smoothed['v'] / 1.46667
    smoothed.reset_index(inplace=True)
    smoothed.rename(columns={'index':'time'},inplace=True)

    return smoothed

In [None]:
# Smooth every raw trace and store min / mean / max of the smoothed speed,
# acceleration and jerk on the matching trips_df row.
summary_columns = (
    ('v_mph', ('speed_min', 'speed_mean', 'speed_max')),
    ('a', ('acceleration_min', 'acceleration_mean', 'acceleration_max')),
    ('j', ('jerk_min', 'jerk_mean', 'jerk_max')),
)

for tripid, coords in tqdm(raw_coords_dict.items()):
    smoothed = kalman(coords)
    trip_rows = trips_df['tripid']==tripid

    #get min, avg, max speeds/accelerations/jerks
    for signal, (min_col, mean_col, max_col) in summary_columns:
        values = smoothed[signal]
        trips_df.loc[trip_rows, min_col] = values.min()
        trips_df.loc[trip_rows, mean_col] = values.mean()
        trips_df.loc[trip_rows, max_col] = values.max()

Plot distribution of average speed

Trip data

In [None]:
# Read the headerless trip table, name its columns, then discard the timing/count fields.
col_names = ['tripid','userid','trip_type','description','starttime','endtime','num_points']
trip = pd.read_csv(export_fp/"trip.csv", header = None)
trip.columns = col_names
trip = trip.drop(columns=['starttime','endtime','num_points'])

User data

In [None]:
# Read the headerless user table and name its columns.
user = pd.read_csv(export_fp/"user.csv", header=None)
user_col = ['userid','created_date','device','email','age',
            'gender','income','ethnicity','homeZIP','schoolZip',
            'workZip','cycling_freq','rider_history','rider_type','app_version']
user.columns = user_col
# Drop the device/app fields and the email address ('app_version' was listed twice before).
user.drop(columns=['device','app_version','email'],inplace=True)

Add trip and user data to trips_df

In [None]:
# Attach the trip attributes by tripid, then the user attributes by userid.
# Both merges use the default join type.
trips_with_trip_info = trips_df.merge(trip, on='tripid')
trips_df = trips_with_trip_info.merge(user, on='userid').copy()

In [None]:
fp = Path.home() / "Documents/GitHub/Impedance-Calibration"
# Open the definitions file in a with-block so the handle is closed after parsing.
with open(fp/'user_data_definition.json') as definitions_file:
    user_data_definitions = json.load(definitions_file)

# Replace coded answers with their definitions for every column that has a mapping.
for col in trips_df.columns:
    if col in user_data_definitions:
        # values are cast to str before the lookup; values without an entry in the
        # mapping come back as NaN from .map
        trips_df[col] = trips_df[col].astype(str).map(user_data_definitions[col])

In [None]:
# Show the code-to-label mapping used for the cycling_freq column.
cycling_freq_definition = user_data_definitions['cycling_freq']
cycling_freq_definition

Find Euclidean distance between origin and destination

In [None]:
# Store each trip's earliest and latest recorded position (by timestamp) on trips_df.
for tripid, coords in coords_dict.items():
    first_idx = coords['datetime'].idxmin()
    last_idx = coords['datetime'].idxmax()

    #get starting location
    start_lon = coords.loc[first_idx,'lon'].item()
    start_lat = coords.loc[first_idx,'lat'].item()

    #get ending location
    end_lon = coords.loc[last_idx,'lon'].item()
    end_lat = coords.loc[last_idx,'lat'].item()

    #assign to trips_df (.loc, because .at only accepts a scalar row label, not a boolean mask)
    is_this_trip = trips_df['tripid']==tripid
    trips_df.loc[is_this_trip,'start_lon'] = start_lon
    trips_df.loc[is_this_trip,'start_lat'] = start_lat
    trips_df.loc[is_this_trip,'end_lon'] = end_lon
    trips_df.loc[is_this_trip,'end_lat'] = end_lat

# find euclidean distance between start and end coord (for finding loop trips)
# points are built as lon/lat (epsg:4326) and reprojected to epsg:2240 before measuring
start_geo = gpd.points_from_xy(trips_df['start_lon'],trips_df['start_lat'],crs='epsg:4326').to_crs('epsg:2240')
end_geo = gpd.points_from_xy(trips_df['end_lon'],trips_df['end_lat'],crs='epsg:4326').to_crs('epsg:2240')
trips_df['euclidean_distance'] = start_geo.distance(end_geo)

Find origin and destination zip code

Remove loops and exercise trips

In [None]:
# Flag trips whose start and end lie within tolerance_ft of each other, and trips
# whose type is 'Exercise'.
tolerance_ft = 1000
self_loops = trips_df['euclidean_distance'] < tolerance_ft
exercise = trips_df['trip_type'] == 'Exercise'

# keep a trip only when neither flag is set
trips_df = trips_df[~(exercise | self_loops)]

# both counts refer to the table as it was before filtering
print(self_loops.sum(),'trips were self loops')
print(exercise.sum(),'trips were for exercise')

What variables do we have for clustering?

In [None]:
# List the columns currently present on trips_df.
trips_df.columns

In [None]:
# Show the trips that have no rider_type value.
rider_type_missing = trips_df['rider_type'].isna()
trips_df[rider_type_missing]

In [None]:
# Columns kept for clustering, grouped by what they describe.
trip_fields = ['tripid','userid','duration','total_distance_ft','avg_speed_mph','trip_type']
demographic_fields = ['age', 'gender', 'income', 'ethnicity','homeZIP', 'schoolZip', 'workZip']
rider_fields = ['cycling_freq', 'rider_history','rider_type']

# (variable name kept as-is in case other cells refer to it)
relveant_columns = trip_fields + demographic_fields + rider_fields

# every other column on trips_df is discarded from here on
trips_subset = trips_df[relveant_columns]
trips_df = trips_subset.copy()

Replace no data with np.nan

In [None]:
# Treat the literal 'no data' as a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
trips_df = trips_df.replace('no data', np.nan)

In [None]:
# Number of missing values per column, largest first.
missing_per_column = trips_df.isna().sum()
missing_per_column.sort_values(ascending=False)

-  Continuous
    - Travel time (minutes)
    - Total distance traveled (feet)
    - Average speed (miles per hour)
    - Shortest distance (feet)




- Categorical
    - Interval
        - Age
        - Income
    - Ordinal
        - Cycling frequency
        - Rider history
        - Rider type
    - Nominal
        - Gender (M/F)
        - Ethnicity
        - Trip type (could simplify to utilitarian/leisure)

In [None]:
# Read the post-match table, keep the three route measures plus the key,
# and join them onto trips_df by tripid (default join type).
post_columns = ['tripid','shortest_length','overlap_length','detour_rate']
post = pd.read_csv(fp/'trips_df_postmatch.csv')[post_columns].copy()
trips_df = pd.merge(trips_df, post, on='tripid').copy()

In [None]:
# NOTE(review): this only builds a groupby object keyed on userid; no aggregation
# is applied to it in this cell.
trips_df.groupby('userid')