In [None]:
%reload_ext autoreload
%autoreload 2

# All the packages are imported via the model_packages module
from model_packages import *

# All the variables are defined in the Config file
from model_config import *

# All the helper functions are imported from the model_utils module
from model_utils import *



In [None]:
# Load the Strava Metro export from the local copy in `strava_data_loc_monthly`.
#
# The directory is expected to hold at least one ".csv" file (activity counts)
# and one ".shp" file (edge geometries); the first match of each kind returned
# by os.listdir is used, and an IndexError is raised if none is present.
#
# NOTE(review): the original note described the data as hourly activity counts
# for a specific location, but the directory variable and the `month` column
# used in later cells suggest monthly aggregates -- confirm against the export.

file_list = os.listdir(strava_data_loc_monthly)

csv_candidates = [name for name in file_list if name.endswith(".csv")]
shp_candidates = [name for name in file_list if name.endswith(".shp")]
csv_file = csv_candidates[0]
shp_file = shp_candidates[0]

# Edge geometries: keep one row per Strava edge id (`edgeUID`), renumber the
# rows, and reproject to the CRS configured as `crs_mtr`.
strava_df_shp = (
    gpd.read_file(f"{strava_data_loc_monthly}/{shp_file}")
    .drop_duplicates(subset='edgeUID')
    .reset_index(drop=True)
    .to_crs(crs_mtr)
)

# Strava activity counts.
strava_df_csv = pd.read_csv(f"{strava_data_loc_monthly}/{csv_file}")

In [None]:


# Work on a copy of the activity table so the raw frame stays untouched.
# (No sub-sampling happens here: every row of `strava_df_csv` is kept.)
#
# The `month` column is parsed as a datetime and then split into two integer
# columns: `year` (appended) and `month` (overwritten with the month number).
month_timestamps = pd.to_datetime(strava_df_csv['month'])

strava_df_csv_sample = strava_df_csv.assign(
    year=month_timestamps.dt.year,
    month=month_timestamps.dt.month,
)



In [None]:

# Translate the month number into its abbreviated name via calendar.month_abbr.
strava_df_csv_sample['month_name'] = strava_df_csv_sample['month'].map(
    lambda month_number: calendar.month_abbr[month_number]
)


In [None]:
# Total trips per month, one line per year (rows = month, columns = year).
trips_by_month_year = strava_df_csv_sample.groupby(['month','year'])['total_trip_count'].sum()
trips_by_month_year.unstack().plot(style='-o')
plt.ylabel('Activities recorded')

In [None]:
# Restrict the analysis to the summer months (June, July, August).
summer_month_names = ['Jun','Jul','Aug']
is_summer_row = strava_df_csv_sample['month_name'].isin(summer_month_names)

strava_df_csv_sample_summer = strava_df_csv_sample[is_summer_row]

In [None]:
# Sanity check: list the month names that survived the summer filter.
strava_df_csv_sample_summer.month_name.unique()

In [None]:
# Total summer trips per month, one line per year.
summer_trips_by_month_year = strava_df_csv_sample_summer.groupby(['month','year'])['total_trip_count'].sum()
summer_trips_by_month_year.unstack().plot()
plt.ylabel('Activities recorded')

In [None]:
# Trip totals per edge and month.
#
# Despite its name, `mean_count` holds the SUM of `total_trip_count` for every
# (month, osm_reference_id) pair. The pivot then lays the months out as columns
# with one row per `osm_reference_id`; edge/month combinations that have no
# records are filled with 0.
mean_count = (
    strava_df_csv_sample_summer
    .groupby(['month','osm_reference_id'])['total_trip_count']
    .sum()
    .reset_index()
)

mean_count_pvt_tbl = mean_count.pivot_table(
    index=["osm_reference_id"],
    columns='month',
    values='total_trip_count',
).fillna(0)

# Display the resulting table.
mean_count_pvt_tbl

In [None]:
# Drop edges whose total activity is extreme.
#
# `x` holds one value per edge: the row sum of the edge-by-month table, stored
# in the column `total_mean_edge_count`.
x = mean_count_pvt_tbl.sum(axis=1).to_frame(name='total_mean_edge_count')

# Shape before filtering
print("Old Shape: ", x.shape)

# Outlier fences.
# The names Q1/Q3/IQR are kept, but note that the 10th and 90th percentiles are
# used here (not the quartiles), so `IQR` is really the inter-decile range.
edge_totals = x['total_mean_edge_count']
Q1 = edge_totals.quantile(0.1)
Q3 = edge_totals.quantile(0.9)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR

# Integer row positions of the edges lying on or beyond each fence
upper_array = np.flatnonzero((edge_totals >= upper).to_numpy())
lower_array = np.flatnonzero((edge_totals <= lower).to_numpy())

# Keep only the rows whose index label does not belong to an outlier row
outlier_positions = np.concatenate([upper_array, lower_array])
outlier_ids = x.index[outlier_positions]
x = x[~x.index.isin(outlier_ids)]

# Distribution of the remaining edge totals
sns.displot(x['total_mean_edge_count'],kde=True)

# Shape after filtering
print("New Shape: ", x.shape)

In [None]:
# From here on `mean_count_pvt_tbl` is the filtered single-column table of edge
# totals (`total_mean_edge_count`), not the edge-by-month pivot.
# NOTE(review): this discards the monthly columns, so the distance matrix built
# next compares one number per edge rather than a monthly profile -- confirm
# this is intended (filtering the pivot by `x.index` may have been the goal).
mean_count_pvt_tbl = x.copy(deep=True)


In [None]:
# Calculate the pairwise distance between edges and store the result in `ds`.
# Each row of the table is passed as one series; compact=False is requested so
# that `ds` can be indexed as a 2-D matrix in the following cells.
edge_series = mean_count_pvt_tbl.values

ds = dtw.distance_matrix_fast(edge_series, window=1, compact=False)


In [None]:
# Based on the pairwise distance, assign each edge to a cluster.
# This step is computationally costly.
#
# Background / alternatives that were considered:
#   - choosing the number of clusters from a distance threshold
#     (n_clusters=None, distance_threshold=10000) -- costly:
#     https://stackoverflow.com/questions/50695226/how-to-get-the-optimal-number-of-clusters-using-hierarchical-cluster-analysis-au
#   - racplusplus (RAC++):
#     https://towardsdatascience.com/scaling-agglomerative-clustering-for-big-data-an-introduction-to-rac-fb26a6b326ad
#   - HDBSCAN:
#     https://hdbscan.readthedocs.io/en/latest/performance_and_scalability.html
#
# The `affinity` keyword is deliberately NOT passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), where passing it
# raises a TypeError. Ward linkage only supports the Euclidean metric, which is
# the default under either keyword, so omitting it gives the same model on
# every scikit-learn version.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward',
                                  compute_full_tree=True, distance_threshold=None)

# Cluster the data.
# NOTE(review): `ds` is passed as a feature matrix, i.e. every edge is described
# by its row of distances to all other edges; it is not treated as a
# precomputed distance matrix.
cluster.fit(ds)

n_clusters_found = 1 + np.amax(cluster.labels_)
print(f"Number of clusters = {n_clusters_found}")

# Display the clustering, assigning cluster label to every datapoint
print("Classifying the points into clusters:")
print(cluster.labels_)

# Display the clustering graphically: the axes are the first two columns of
# `ds`, i.e. each edge's distance to the first and to the second edge.
plt.scatter(ds[:,0],ds[:,1], c=cluster.labels_, cmap='rainbow')
plt.title(f"SK Learn estimated number of clusters = {n_clusters_found}")
plt.show()

print(" ")

In [None]:

# Attach the cluster label to every edge.
#
# `df_ds` is the distance matrix as a DataFrame (one integer-named column per
# edge) plus a `labels` column, with the edge id exposed as the first column
# `osm_reference_id`.
df_ds = pd.DataFrame(ds, index=mean_count_pvt_tbl.index.values)
df_ds['labels'] = cluster.labels_
df_ds = df_ds.rename_axis('osm_reference_id').reset_index()

# Per-edge summary: id, cluster label and total activity.
count_edges_clusters = df_ds.merge(
    mean_count_pvt_tbl.reset_index(),
    on='osm_reference_id',
)[['osm_reference_id','labels','total_mean_edge_count']]




In [None]:

# Strava edge geometries joined with their cluster labels.
#
# Only one geometry per `osmId` is kept before the inner join on
# osmId == osm_reference_id, so edges without a label (or without a geometry)
# are dropped.
shp_one_per_osm_id = strava_df_shp.drop_duplicates(subset=['osmId'])

shp_with_labels = pd.merge(
    shp_one_per_osm_id,
    df_ds,
    how='inner',
    left_on=['osmId'],
    right_on=['osm_reference_id'],
)

df_ds_shp = gpd.GeoDataFrame(shp_with_labels)[['geometry','labels','osm_reference_id']]

# Size of the cluster each edge belongs to.
cluster_sizes = df_ds_shp['labels'].value_counts()
df_ds_shp['labels_count'] = df_ds_shp['labels'].map(cluster_sizes)

In [None]:
# Add the per-edge totals to the labelled geometries (inner join on edge id and
# cluster label).
# NOTE: `count_edges_clusters` is both an input and the output of this cell, so
# re-running it alone merges against the already-merged frame; re-run the cell
# that first builds `count_edges_clusters` beforehand.
count_edges_clusters = df_ds_shp.merge(
    count_edges_clusters,
    on=['osm_reference_id','labels'],
)

In [None]:

# Distribution of per-edge total activity within each cluster.
sns.boxplot(
    x="labels",
    y="total_mean_edge_count",
    data=count_edges_clusters,
)

In [None]:
# Number of edges assigned to each cluster.
count_edges_clusters.loc[:, 'labels'].value_counts()

In [None]:
# Map of the Strava edges, colour-coded by cluster label, over a basemap.
# The frame is reprojected to `crs_deg` once and reused for both the plot and
# the basemap CRS string.
edges_in_deg = count_edges_clusters.to_crs(crs_deg)

ax = edges_in_deg.plot(
    column="labels",
    categorical=True,
    cmap='viridis',
    legend=True,
    figsize=(9, 9),
)
cx.add_basemap(ax, crs=edges_in_deg.crs.to_string(), zoom=16)

In [None]:
# Keep only the edges whose cluster size is not extreme.
#
# `labels_count` is the number of edges in the cluster an edge belongs to. An
# edge is kept when the absolute z-score of that value is at most `z_scr`
# (a z-score of 1.0 is one standard deviation away from the mean).
#
# The mask is computed once from `count_edges_clusters` and applied to that
# same frame everywhere. Previously the second histogram indexed `df_ds_shp`
# with a mask derived from `count_edges_clusters`, which is only valid while
# both frames happen to have identical length and row order.

z_scr = 1

cluster_size_z = np.abs(zscore(count_edges_clusters['labels_count']))
is_non_extreme = cluster_size_z <= z_scr

df_ds_shp_non_extreme_edges = count_edges_clusters[is_non_extreme]

# Overlay: cluster sizes of all edges vs. the retained edges only.
count_edges_clusters['labels_count'].hist()
df_ds_shp_non_extreme_edges['labels_count'].hist()

plt.show()

# Cluster labels that survive the filter.
df_ds_shp_non_extreme_edges['labels'].unique()

In [None]:


# Map of the retained (non-extreme) edges, colour-coded by cluster label.
# The frame is reprojected to `crs_deg` once and reused for the basemap CRS.
non_extreme_edges_deg = df_ds_shp_non_extreme_edges.to_crs(crs_deg)

ax = non_extreme_edges_deg.plot(column="labels", categorical=True, cmap='viridis',
                                legend=True, figsize=(9, 9))
cx.add_basemap(ax, crs=non_extreme_edges_deg.crs.to_string(), zoom=16)

# The title used fontsize=1 (one point), which renders it unreadable; use a
# legible size instead.
ax.set_title('Streets clusters', fontsize=14)
plt.tight_layout()
