In [None]:
%load_ext autoreload
%autoreload 2

# Shared package imports come from model_packages
from model_packages import *

# All the variables are defined in the Config file
from model_config import *

# Helper functions come from model_utils
from model_utils import *

In [None]:
# Load the Strava Metro export: edge geometries (.shp) and hourly activity counts (.csv).
# A local copy of the Strava data is kept in `strava_data_loc_hourly`; the data is for a
# specific location with an hourly frequency count of activities.

# The first .csv and the first .shp returned by os.listdir are the ones used.
file_list = os.listdir(strava_data_loc_hourly)
csv_file = [name for name in file_list if name.endswith(".csv")][0]
shp_file = [name for name in file_list if name.endswith(".shp")][0]

# Shape file with the Strava edge ids: keep one row per edgeUID, renumber the rows,
# then reproject to crs_mtr.
strava_df_shp = (
    gpd.read_file(f"{strava_data_loc_hourly}/{shp_file}")
    .drop_duplicates(subset='edgeUID')
    .reset_index(drop=True)
    .to_crs(crs_mtr)
)

# Strava activity count
strava_df_csv = pd.read_csv(f"{strava_data_loc_hourly}/{csv_file}")

In [None]:
# Size of the raw Strava activity table as (rows, columns)
strava_df_csv.shape

In [None]:
# Work on a random sample of the Strava activity rows to speed up the later steps.
# - The sample size is capped at the number of available rows, because
#   DataFrame.sample raises ValueError when asked for more rows than exist
#   (sampling without replacement).
# - A fixed random_state makes "Restart Kernel -> Run All" draw the same sample.
strava_df_csv_sample = strava_df_csv.sample(
    n=min(500000, len(strava_df_csv)),
    random_state=42,
)

# Parse the raw `hour` column once and derive every calendar field from it.
activity_ts = pd.to_datetime(strava_df_csv_sample['hour'])

# `hour` is overwritten last: first reduced to the hour of the day, then passed through
# get_time_day (presumably returning a label such as Morning/Noon/Evening/Night, since
# later cells reindex on those values -- confirm in model_utils).
strava_df_csv_sample = strava_df_csv_sample.assign(
    date=activity_ts.dt.date,
    day=activity_ts.dt.day,
    day_name=activity_ts.dt.day_name(),
    month_name=activity_ts.dt.month_name(),
    hour=activity_ts.dt.hour.map(get_time_day),
)

In [None]:
# Visualisation: summed trip counts per time of day, one line per day of the week
trips_by_time_and_day = (
    strava_df_csv_sample
    .groupby(['hour', 'day_name'])['total_trip_count']
    .sum()
    .unstack()
    # fix the row order (time of day) and the column order (Monday -> Sunday)
    .reindex(['Morning', 'Noon', 'Evening', 'Night'])
    .reindex(columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                      'Friday', 'Saturday', 'Sunday'])
)
trips_by_time_and_day.plot()
plt.ylabel('Activities recorded')

In [None]:
# Focus on specific days and months

# Drop Saturday and Sunday rows
is_weekend = strava_df_csv_sample['day_name'].isin(['Saturday', 'Sunday'])
strava_df_csv_sample_weekday = strava_df_csv_sample[~is_weekend]

# Of the remaining rows, keep June, July and August only
is_summer = strava_df_csv_sample_weekday['month_name'].isin(['June', 'July', 'August'])
strava_df_csv_sample_weekday_summer = strava_df_csv_sample_weekday[is_summer]

In [None]:
# Visualisation: summed trip counts per time of day, one line per summer month
trips_by_time_and_month = (
    strava_df_csv_sample_weekday_summer
    .groupby(['hour', 'month_name'])['total_trip_count']
    .sum()
    .unstack()
    # fix the row order (time of day) and the column order (calendar order)
    .reindex(['Morning', 'Noon', 'Evening', 'Night'])
    .reindex(columns=['June', 'July', 'August'])
)
trips_by_time_and_month.plot()
plt.ylabel('Activities recorded')

In [None]:
# Activity count of each edge grouped by time of the day.
# NOTE(review): the names say "mean", but the aggregation below is a sum of
# total_trip_count per (time of day, edge) -- confirm which one is intended.
mean_count = (
    strava_df_csv_sample_weekday_summer
    .groupby(['hour', 'osm_reference_id'])['total_trip_count']
    .sum()
    .reset_index()
)

# One row per edge, one column per time of day. Each (edge, time of day) pair occurs
# once in mean_count, so pivot_table's default aggregation leaves the values unchanged.
#
# The columns are reindexed BEFORE filling missing values: reindexing afterwards would
# re-introduce an all-NaN column whenever one of the four labels is absent from the
# sample, and those NaNs would flow into the distance computation.
mean_count_pvt_tbl = (
    mean_count
    .pivot_table(index=["osm_reference_id"], columns='hour', values='total_trip_count')
    .reindex(columns=['Morning', 'Noon', 'Evening', 'Night'])
    .fillna(0)
)

In [None]:
# Focus on edges which are not in the extrema of the per-edge activity total

# Row-wise total over the time-of-day columns, one row per edge
x = mean_count_pvt_tbl.sum(axis=1).to_frame(name='total_mean_edge_count')

# Shape before removing the outliers
print("Old Shape: ", x.shape)

# Detection: IQR-style fences.
# Note that Q1 / Q3 hold the 10th and 90th percentiles here, not the quartiles.
Q1 = x['total_mean_edge_count'].quantile(0.1)
Q3 = x['total_mean_edge_count'].quantile(0.9)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR

# Row positions of the values on or beyond each fence
upper_array = np.flatnonzero((x['total_mean_edge_count'] >= upper).to_numpy())
lower_array = np.flatnonzero((x['total_mean_edge_count'] <= lower).to_numpy())

# Drop every edge whose index label belongs to one of the outlier rows
outlier_ids = x.index[np.concatenate([upper_array, lower_array])]
x = x[~x.index.isin(outlier_ids)]

#x.hist()

# Distribution of the remaining totals
sns.displot(x['total_mean_edge_count'], kde=True)

# Shape after removing the outliers
print("New Shape: ", x.shape)

In [None]:
# Calculate the pairwise distance between edges, using each edge's row of
# mean_count_pvt_tbl (its values across the time-of-day columns) as the series.
# NOTE(review): `dtw` arrives through a star import; presumably dtaidistance's dtw
# module, in which case window=1 limits the warping and compact=False asks for the
# full square matrix -- confirm against the library documentation.
# NOTE(review): the matrix is built from the complete mean_count_pvt_tbl; the
# outlier-filtered frame `x` computed earlier is not applied here -- confirm intent.

ds = dtw.distance_matrix_fast(mean_count_pvt_tbl.values,window=1,compact=False)


In [None]:
# Based on the pairwise distance, assign each edge to a cluster.
# NOTE: this is computationally costly.
# https://stackoverflow.com/questions/50695226/how-to-get-the-optimal-number-of-clusters-using-hierarchical-cluster-analysis-au

# The `affinity` argument is deliberately omitted: it was removed from scikit-learn
# (renamed to `metric`) and passing it raises TypeError on current versions. Ward linkage
# only supports the Euclidean metric, which is the default under either parameter name,
# so the fitted model is unchanged.
# n_clusters=None together with distance_threshold lets the threshold decide how many
# clusters are produced.
# NOTE(review): the estimator is fitted on `ds` as a feature matrix (each row of the
# distance matrix is one sample), not as a precomputed distance -- confirm intent.
cluster = AgglomerativeClustering(n_clusters=None, linkage='ward',
                                  compute_full_tree=True, distance_threshold=10000)

# Cluster the data; the labels are read from cluster.labels_ below
cluster.fit(ds)

n_found_clusters = cluster.n_clusters_
print(f"Number of clusters = {n_found_clusters}")

# Display the clustering, assigning cluster label to every datapoint
print("Classifying the points into clusters:")
print(cluster.labels_)

# Display the clustering graphically: distance to the first edge against distance to
# the edge at position 100. The second position is clamped to the last available column
# so the plot does not raise IndexError when there are fewer than 101 edges.
scatter_y_col = min(100, ds.shape[1] - 1)
plt.scatter(ds[:, 0], ds[:, scatter_y_col], c=cluster.labels_, cmap='rainbow')
plt.title(f"SK Learn estimated number of clusters = {n_found_clusters}")
plt.show()

print(" ")

In [None]:

# Assign cluster label to each edge

# Distance matrix as a frame, rows keyed by the edge ids of mean_count_pvt_tbl
df_ds = pd.DataFrame(ds, index=mean_count_pvt_tbl.index.values)

# Append the cluster label of every row as the last column
df_ds = df_ds.assign(labels=cluster.labels_)

# Move the edge ids out of the index into a leading 'osm_reference_id' column
df_ds = df_ds.rename_axis('osm_reference_id').reset_index()

df_ds

In [None]:

# Strava edges with cluster labels

# Only the join key and the cluster label are taken from df_ds. Its remaining columns
# hold the full pairwise distance matrix, which the column selection below discards
# anyway; leaving them out of the merge avoids copying that matrix and does not change
# the result.
df_ds_shp = pd.merge(
    strava_df_shp.drop_duplicates(subset=['osmId']),   # one geometry per osmId
    df_ds[['osm_reference_id', 'labels']],
    left_on=['osmId'],
    right_on=['osm_reference_id'],
    how='inner',                                        # keep edges present on both sides
)

df_ds_shp = gpd.GeoDataFrame(df_ds_shp)[['geometry', 'labels']]

# Number of edges that share each edge's cluster label
df_ds_shp['labels_count'] = df_ds_shp['labels'].map(df_ds_shp['labels'].value_counts())

In [None]:
# Strava edges color coded with cluster labels, drawn over a basemap
# Reproject once to crs_deg and reuse the result for both the plot and the basemap CRS
df_ds_shp_deg = df_ds_shp.to_crs(crs_deg)
ax = df_ds_shp_deg.plot(column="labels", cmap='viridis', legend=True, figsize=(9, 9))
cx.add_basemap(ax, crs=df_ds_shp_deg.crs.to_string())

In [None]:
# We might want to focus on those edges with the most non-extreme grouping
# (i.e. cluster frequency count not in the top/bottom).

# Histogram of the cluster sizes over all edges
df_ds_shp['labels_count'].hist()

# A z-score of 1.0 indicates a value one standard deviation from the mean;
# keep the edges whose cluster size lies within one standard deviation.
within_one_sd = np.abs(zscore(df_ds_shp['labels_count'])) <= 1

# Histogram of the cluster sizes over the retained edges
df_ds_shp[within_one_sd]['labels_count'].hist()

plt.show()

df_ds_shp_non_extreme_edges = df_ds_shp[within_one_sd]

In [None]:
# Non-extreme Strava edges color coded with cluster labels, drawn over a basemap
# Reproject once to crs_deg and reuse the result for both the plot and the basemap CRS
non_extreme_edges_deg = df_ds_shp_non_extreme_edges.to_crs(crs_deg)
ax = non_extreme_edges_deg.plot(column="labels", cmap='viridis', legend=True, figsize=(9, 9))
cx.add_basemap(ax, crs=non_extreme_edges_deg.crs.to_string())