In this notebook, we use clustering techniques to categorize street segments into classes of temporal patterns of Strava activities. We used continuous signal processing to group a sample of street segments in National Parks across England into distinct classes of temporal activity that vary in overall volume and daily pattern. The findings from this work can be used to place counters strategically across these trails, so that they efficiently capture visitation counts that better represent visitation in protected landscapes.


**Based on the following work**

https://findingspress.org/article/10828-where-to-put-bike-counters-stratifying-bicycling-patterns-in-the-city-using-crowdsourced-data



In [None]:
%reload_ext autoreload
%autoreload 2

# All the packages are imported in the model_packages module
from model_packages import *

# All the variables are defined in the Config file
from model_config import *

# All the functions are defined in the model_utils module
from model_utils import *





**Gather Strava pedestrian activity data for a sample of trails across different National Parks**

In [None]:
# Preprocess Strava Metro data.
# The data is for specific locations, with a daily frequency count of
# activities. Each sub-folder of `strava_data_loc_np` is read as one
# shapefile (edge geometries) plus one CSV (activity counts).

# Sorted so that the load order -- and therefore which row survives
# drop_duplicates below -- does not depend on the filesystem listing order.
folder_list = sorted(
    name for name in os.listdir(strava_data_loc_np)
    if os.path.isdir(os.path.join(strava_data_loc_np, name))
)

str_csv = []
str_shp = []
for folder in folder_list:
    # os.path.join (as used for folder_list above) works whether or not
    # `strava_data_loc_np` ends with a path separator.
    folder_path = os.path.join(strava_data_loc_np, folder)
    file_list = sorted(os.listdir(folder_path))
    print(folder)

    # First CSV / shapefile in the folder (alphabetical order)
    csv_file = next((x for x in file_list if x.endswith(".csv")), None)
    shp_file = next((x for x in file_list if x.endswith(".shp")), None)
    if csv_file is None or shp_file is None:
        # Fail with the offending folder named, instead of a bare IndexError.
        raise FileNotFoundError(
            f"Expected a .csv and a .shp file in {folder_path!r}, found: {file_list}"
        )

    # Shape files and Strava edge ids (one row per edgeUID).
    park_shp = gpd.read_file(os.path.join(folder_path, shp_file))
    park_shp = park_shp.drop_duplicates(subset='edgeUID').reset_index(drop=True)

    # Strava activity count
    park_csv = pd.read_csv(os.path.join(folder_path, csv_file))

    # Reproject the geometries to crs_mtr before stacking the folders
    park_shp = park_shp.to_crs(crs_mtr)

    str_csv.append(park_csv)
    str_shp.append(park_shp)


# Store all trail shapefiles; an edge present in several folders is kept once
strava_df_shp = pd.concat(str_shp).drop_duplicates(subset=['edgeUID']).reset_index(drop=True)

# Store the corresponding activities; fully identical rows are kept once
strava_df_csv = pd.concat(str_csv).drop_duplicates().reset_index(drop=True)

# Parse the activity date, then derive calendar features from it
strava_df_csv['date'] = pd.to_datetime(strava_df_csv['date'])

activity_dt = strava_df_csv['date'].dt
strava_df_csv['day_of_week'] = activity_dt.day_name()
strava_df_csv['month'] = activity_dt.month
strava_df_csv['year'] = activity_dt.year

# Abbreviated month name (looked up in calendar.month_abbr by month number)
strava_df_csv['month_name'] = strava_df_csv['month'].map(lambda m: calendar.month_abbr[m])

# Season of each record, derived from the month number by get_season
strava_df_csv['season'] = strava_df_csv['month'].apply(get_season)

**Visualisations**

In [None]:
# Get % of total recorded Strava activities for each day of the week and
# across different seasons.

# Ordered categorical so weekdays sort Monday..Sunday rather than alphabetically
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cat_type = CategoricalDtype(categories=cats, ordered=True)
strava_df_csv['day_of_week'] = strava_df_csv['day_of_week'].astype(cat_type)

# Trips per (season, weekday), computed once. `observed=False` spells out the
# behaviour the original relied on implicitly (all weekday categories kept).
trips_by_season_day = strava_df_csv.groupby(
    ['season', 'day_of_week'], observed=False
)['total_trip_count'].sum()

# Each group's share of the grand total, in percent
df_agg_day_sesn = (trips_by_season_day / trips_by_season_day.sum() * 100).reset_index()

fig = px.bar(df_agg_day_sesn, x='day_of_week', y="total_trip_count", color="season", barmode='group')

# Outline the bars. The previous call filtered on selector=dict(mode='markers'),
# which matches no bar trace, so the styling was never applied; `marker.size`
# is dropped because it is a scatter-only property.
fig.update_traces(marker=dict(line=dict(width=2,
                                        color='DarkSlateGrey')))
fig.update_layout(yaxis_title="% Total recorded Strava activities")

fig.show()


**Get data for National Parks** 


In [None]:
# Interactive map of the loaded Strava trail segments, reprojected to crs_deg
(
    strava_df_shp
    .to_crs(crs_deg)
    .explore()
)

In [None]:
# Read the National Park layer and map it with one colour per `name`
area_of_interest = gpd.read_file(national_park_data)
area_of_interest.explore(
    column='name',
    categorical=True,
    legend=True,
)

In [None]:
# Intersect the Strava trails with the areas of interest.
# Both layers are put in crs_mtr before the overlay.
strava_intrsct_ar_intrst = gpd.overlay(
    strava_df_shp.to_crs(crs_mtr),
    area_of_interest.to_crs(crs_mtr),
    how='intersection',
)

# Map the intersected segments, coloured by the area `name`
(
    strava_intrsct_ar_intrst
    .to_crs(crs_deg)
    .explore(column='name', categorical=True, legend=True)
)

In [None]:
# Ids of the Strava edges that intersect an area of interest
edges_nature_rsrv = strava_intrsct_ar_intrst['edgeUID'].unique()

# Keep only the activity records that belong to those edges
csv_in_area = strava_df_csv['edge_uid'].isin(edges_nature_rsrv)
strava_df_csv = strava_df_csv.loc[csv_in_area]

# Keep only the geometries of those edges
shp_in_area = strava_df_shp['edgeUID'].isin(edges_nature_rsrv)
strava_df_shp = strava_df_shp.loc[shp_in_area]

In [None]:
# Monthly trip totals over all edges, drawn as one line per year
(
    strava_df_csv
    .groupby(['month', 'year'])['total_trip_count']
    .sum()
    .unstack()
    .plot(style='-o')
)
plt.title('Total number of trips recorded across all edges')



**Focusing on the activities for a specific season (to control for seasonal effects)** 

In [None]:
# Restrict the records to the summer months (June, July, August)
is_summer = strava_df_csv['month_name'].isin(['Jun', 'Jul', 'Aug'])
strava_df_csv_summer = strava_df_csv[is_summer]

In [None]:
# Summer-only monthly trip totals over all edges, one line per year
(
    strava_df_csv_summer
    .groupby(['month', 'year'])['total_trip_count']
    .sum()
    .unstack()
    .plot(style='-o')
)
plt.title('Total number of trips recorded across all edges during Summer')

In [None]:
# Mean recorded trip count of every edge within each summer month
mean_count = (
    strava_df_csv_summer
    .groupby(['month', 'edge_uid'])['total_trip_count']
    .mean()
    .reset_index()
)

# Reshape to one row per edge and one column per month;
# an edge with no records in a month gets 0 for that month.
mean_count_pvt_tbl = mean_count.pivot_table(
    values='total_trip_count',
    index=["edge_uid"],
    columns='month',
).fillna(0)

# Distribution, over edges, of the monthly means summed across months
sns.displot(
    pd.DataFrame(mean_count_pvt_tbl.sum(axis=1)).values,
    kde=True,
    legend=False,
)



**Perform clustering on trails to identify trails with similar levels of activity** 

In [None]:
# Pairwise distance between edges, computed by dtw.distance_matrix_fast on each
# edge's row of per-month mean activity counts. compact=False asks for the
# full (edges x edges) matrix rather than the condensed form.
ds = dtw.distance_matrix_fast(
    mean_count_pvt_tbl.values,
    window=1,
    compact=False,
)


In [None]:
# Based on the pairwise distance, assign each edge to a cluster.
# Every candidate number of clusters is fitted and scored with the
# Calinski-Harabasz index so the scores can be compared across candidates.

numbr_clustrs_range = range(2, 15)
str_chs = []
for numb_clusters in numbr_clustrs_range:

    # `affinity` is intentionally not passed: the keyword was removed from
    # AgglomerativeClustering in scikit-learn 1.4 (renamed `metric`), and
    # Euclidean -- the only option Ward linkage accepts -- is the default
    # under either name, so omitting it works on old and new versions alike.
    cluster = AgglomerativeClustering(n_clusters=numb_clusters, linkage='ward',
                                      compute_full_tree=True, distance_threshold=None)
    # Each row of the distance matrix is used as that edge's feature vector
    cluster.fit_predict(ds)

    print(f"Number of clusters = {1+np.amax(cluster.labels_)}")

    # Score computed once (the earlier duplicate call discarded its result)
    str_chs.append(metrics.calinski_harabasz_score(ds, cluster.labels_))


# Score against number of clusters
plt.plot(numbr_clustrs_range, str_chs, '-o')




In [None]:
from scipy.signal import argrelextrema


# Find the optimal number of clusters: the first local maximum of the
# Calinski-Harabasz scores collected in `str_chs`.
local_max = argrelextrema(np.array(str_chs), np.greater)

if local_max[0].size > 0:
    optml_clstr = numbr_clustrs_range[local_max[0][0]]
else:
    # With a strict comparison no endpoint is ever reported, so a curve that
    # only rises or only falls has no local maximum. Fall back to the
    # best-scoring candidate instead of failing with an IndexError.
    optml_clstr = numbr_clustrs_range[int(np.argmax(str_chs))]


In [None]:


# Final clustering with the selected number of clusters.
# `affinity` is intentionally not passed: the keyword was removed from
# AgglomerativeClustering in scikit-learn 1.4 (renamed `metric`), and
# Euclidean -- the only option Ward linkage accepts -- is the default under
# either name, so omitting it works on old and new versions alike.
cluster = AgglomerativeClustering(n_clusters=optml_clstr, linkage='ward',
                                  compute_full_tree=True, distance_threshold=None)

# Each row of the distance matrix is used as that edge's feature vector
cluster.fit_predict(ds)

print(f"Number of clusters = {1+np.amax(cluster.labels_)}")

In [None]:

# Assign a cluster label to each edge.
# df_ds is the distance matrix (distance between every pair of Strava edges),
# indexed by edge id; its rows are in the same order as mean_count_pvt_tbl,
# from which `ds` was computed.
df_ds = pd.DataFrame(ds, index=mean_count_pvt_tbl.index.values)

# Activity volume of each edge: its per-month mean counts summed over months
# (the same quantity whose distribution is plotted right after the pivot).
# NOTE(review): this column was previously filled with df_ds.mean(axis=1),
# i.e. the edge's mean *distance* to the other edges, which does not match
# the column name or the per-cluster plots of it -- confirm the intended metric.
df_ds['total_mean_edge_count'] = mean_count_pvt_tbl.sum(axis=1).values

df_ds['labels'] = cluster.labels_

# Move the edge ids out of the index into an `edge_uid` column
df_ds = df_ds.reset_index().rename(columns={'index': 'edge_uid'})

# Per-edge summary: id, cluster label and activity volume.
# (The former inner merge with mean_count_pvt_tbl matched every row and its
# extra columns were dropped straight away, so a plain selection is equivalent.)
count_edges_clusters = df_ds[['edge_uid', 'labels', 'total_mean_edge_count']].copy()





In [None]:

# Attach the cluster labels to the Strava edge geometries.
# Inner join: only edges present in both the shapefile table and df_ds remain.
df_ds_shp = pd.merge(
    strava_df_shp.drop_duplicates(subset=['edgeUID']),
    df_ds,
    how='inner',
    left_on=['edgeUID'],
    right_on=['edge_uid'],
)

# Keep just the geometry, the label and the edge id
df_ds_shp = gpd.GeoDataFrame(df_ds_shp)[['geometry', 'labels', 'edge_uid']]

# Number of edges sharing each edge's cluster label
df_ds_shp['labels_count'] = df_ds_shp['labels'].map(
    df_ds_shp['labels'].value_counts()
)

In [None]:
# Join the per-edge summary onto the labelled geometries.
# Only the summary columns are read from `count_edges_clusters`, so running
# this cell again (when the name already holds the merged result) rebuilds the
# same frame instead of producing suffixed duplicate columns.
count_edges_clusters = df_ds_shp.merge(
    count_edges_clusters[['edge_uid', 'labels', 'total_mean_edge_count']],
    left_on=['edge_uid', 'labels'],
    right_on=['edge_uid', 'labels'],
)

In [None]:

# Box plot of total_mean_edge_count for each cluster label
sns.boxplot(
    x="labels",
    y="total_mean_edge_count",
    data=count_edges_clusters,
)

In [None]:

# Violin plot of total_mean_edge_count for each cluster label
sns.violinplot(
    x="labels",
    y="total_mean_edge_count",
    data=count_edges_clusters,
)

In [None]:
# Interactive map of the edges, coloured by cluster label
(
    count_edges_clusters
    .to_crs(crs_deg)
    .explore(column="labels", categorical=True, cmap='viridis', legend=True)
)