In [17]:
import pandas as pd
import time
import datetime
import glob as mod_glob
import json
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math


# Analyse segment statistics

In [129]:
# Read meta-data for all segments
datadir = 'data/'
segfile = 'segments/segments.csv'

# Drop the unnamed column that read_csv produces from the CSV's first column,
# and keep segment ids as strings for later joins.
md = pd.read_csv(datadir + segfile).drop(columns='Unnamed: 0')
md['id'] = md['id'].map(str)

# Read usage records: every ridelog file holds one JSON document per line.
# Sorting makes the processing (and printed) order deterministic.
ridelog_files = sorted(mod_glob.glob(datadir + 'ridelogs/' + r"*.json"))
ridelog_records = []
for rf in ridelog_files:
    print(rf)
    with open(rf, encoding='utf-8') as ridelog:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        ridelog_records.extend(json.loads(line) for line in ridelog if line.strip())

if not ridelog_records:
    # Fail with a clear message instead of a TypeError on a None frame below.
    raise FileNotFoundError('no ridelog records found under ' + datadir + 'ridelogs/')

# Build the frame once rather than concatenating inside the loop.
rl_ = pd.DataFrame(ridelog_records)
# 'time_retrieved' is interpreted as epoch seconds; keep only the calendar date.
rl_['date'] = pd.to_datetime(rl_['time_retrieved'], unit='s').dt.date

data/ridelogs/segments-20201124.json
data/ridelogs/202012.json
data/ridelogs/202011.json


In [130]:
# Historical average usage per week for every segment.
# Parse both timestamps so their difference is a proper Timedelta.
md['created_at'] = pd.to_datetime(md['created_at'])
md['time_retrieved'] = pd.to_datetime(md['time_retrieved'], utc=True)

# Length of the recorded history, as a Timedelta and in whole days.
md['hist_length'] = md['time_retrieved'].sub(md['created_at'])
md['hist_length_days'] = md['hist_length'].dt.days

# Total efforts spread evenly over the history, scaled to a 7-day week.
md['weekly_avg'] = md['effort_count'].mul(7).div(md['hist_length_days'])

# Static per-segment attributes, reused when building the HTML report.
md_meta = md.loc[:, ['id', 'name', 'distance']]
md.loc[:, ['id', 'name', 'weekly_avg']]

Unnamed: 0,id,name,weekly_avg
0,3808938,קפוצינו,42.46046
1,1248017,העלייה מרון,29.435212
2,4267589,מה עכשיו טיפוס,46.550512
3,18952377,שימרי,53.400507
4,2481821,מעלה רמי,70.917339
5,7774409,fun in the forest,128.915908
6,8574425,החילזון נהיה עצלן,49.897025
7,17421855,ברמים בירידה,101.636459
8,4202076,פאמפ טראק ריש לקיש,49.958379
9,1717839,סוללים צפון מערבה,28.956835


In [131]:
# Derive how rides are distributed across the days of the week.

tf = pd.read_csv('data/trailforks.csv')

# Keep the raw strings, then cut everything after the last am/pm marker
# so the remainder matches the strptime format below.
tf['date_orig'] = tf['date']
meridiem_tail = re.compile('(.*[ap]m).*')
trimmed_dates = tf['date'].map(lambda raw: meridiem_tail.sub('\\1', raw))
tf['date'] = pd.to_datetime(trimmed_dates, format='%b %d, %Y @ %I:%M%p', errors='raise')

# Share of all rows that falls on each weekday (0 = Monday per pandas).
tf['weekday'] = tf['date'].dt.weekday
weekday_share = tf['weekday'].value_counts(normalize=True)
wdist = pd.DataFrame(data={'weekday_weight': weekday_share,
                           'weekday': weekday_share.index.values})

In [132]:
# Apply weekly distribution to per-trail usage
left = md
right = wdist
# Cross join via a constant helper key: every segment row is paired with every
# weekday row. The helper column is dropped by keyword, because pandas 2.0
# no longer accepts the axis as a positional argument to drop().
# NOTE: this rebinds md, so re-running the cell alone cross-joins the
# already-expanded frame again; re-run from the metadata cells instead.
md = left.assign(key=1).merge(right.assign(key=1), on='key').drop(columns='key')
# Expected rides on a given weekday = weekly average * that weekday's share.
md['daily_avg'] = md['weekly_avg'] * md['weekday_weight']


In [133]:
# Expected daily usage per segment and weekday, narrowed to the display columns.
mdmd = md.loc[:, ['id', 'name', 'weekday', 'daily_avg']]
mdmd

Unnamed: 0,id,name,weekday,daily_avg
0,3808938,קפוצינו,3,12.810687
1,3808938,קפוצינו,4,11.168781
2,3808938,קפוצינו,5,6.701269
3,3808938,קפוצינו,1,4.925719
4,3808938,קפוצינו,0,2.443767
...,...,...,...,...
72,17443790,Last Climb Carmel Cup,5,20.099278
73,17443790,Last Climb Carmel Cup,1,14.773829
74,17443790,Last Climb Carmel Cup,0,7.329651
75,17443790,Last Climb Carmel Cup,6,7.043337


In [134]:
# Wide table of effort_count: one row per date, one column per segment.
rl2 = rl_.pivot_table(index='date', columns='segment_id', values='effort_count')
# Swap the plain date labels for a DatetimeIndex so the table can be resampled.
rl2.index = pd.DatetimeIndex(rl2.index.values)

In [135]:
# Resample to one row per day, interpolate the missing days, back-fill any
# leading gaps, and diff against the previous day to get per-day changes.
# .bfill() replaces the deprecated fillna(method='bfill') spelling.
daily = rl2.resample('1D').interpolate().bfill().diff()


In [136]:
# Show the per-segment day-over-day change in effort_count computed above.
daily

segment_id,1248017,1717839,17421855,17443790,18952377,2481821,3808938,4202076,4267589,7774409,8574425
2020-11-24,,,,,,,,,,,
2020-11-25,13.0,8.0,15.0,11.0,5.0,37.0,3.0,27.0,36.0,54.0,15.0
2020-11-26,0.0,0.0,0.0,2.0,0.0,-1.0,1.0,-1.0,1.0,-2.0,1.0
2020-11-27,1.0,1.0,6.0,16.0,0.0,27.0,20.0,0.0,4.0,16.0,4.0
2020-11-28,3.0,1.0,5.0,16.0,6.0,46.0,85.0,4.0,13.0,68.0,53.0
2020-11-29,4.0,0.0,3.0,1.0,6.0,26.0,12.0,6.0,9.0,36.0,3.0
2020-11-30,13.0,7.0,5.0,10.0,3.0,32.0,9.0,8.0,15.0,33.0,30.0
2020-12-01,7.0,2.0,13.0,5.0,0.0,16.0,24.0,12.0,17.0,22.0,0.0
2020-12-02,13.0,8.0,27.0,4.0,5.0,31.0,20.0,11.0,14.0,34.0,6.0
2020-12-03,5.0,12.0,18.0,10.0,2.0,18.0,5.0,9.0,10.0,23.0,9.0


In [137]:
# Attach the expected daily average to every (date, segment) observation.

# Reshape the wide daily table into long format: one row per date and segment.
all_segs = daily.columns
d2 = daily.reset_index()
d3 = pd.melt(d2, id_vars='index', value_vars=all_segs)

# The unnamed index came through as 'index'; call it 'date' and derive the weekday.
d4 = d3.rename(columns={'index': 'date'}).assign(
    weekday=lambda frame: frame['date'].dt.weekday
)

# Look up the expected usage for the same segment on the same weekday.
md_short = md.loc[:, ['id', 'weekday', 'daily_avg']]
d5 = pd.merge(d4, md_short, how='left',
              left_on=['segment_id', 'weekday'], right_on=['id', 'weekday'])

d5

Unnamed: 0,date,segment_id,value,weekday,id,daily_avg
0,2020-11-24,1248017,,1,1248017,3.414696
1,2020-11-25,1248017,13.0,2,1248017,1.429408
2,2020-11-26,1248017,0.0,3,1248017,8.880857
3,2020-11-27,1248017,1.0,4,1248017,7.742625
4,2020-11-28,1248017,3.0,5,1248017,4.645575
...,...,...,...,...,...,...
116,2020-11-30,8574425,30.0,0,8574425,2.871771
117,2020-12-01,8574425,0.0,1,8574425,5.788414
118,2020-12-02,8574425,6.0,2,8574425,2.423057
119,2020-12-03,8574425,9.0,3,8574425,15.054363


In [138]:

# Observed daily change expressed as a fraction of the expected daily average.
d5['relative_usage'] = d5['value'].div(d5['daily_avg'])

d6 = d5.loc[:, ['date', 'segment_id', 'relative_usage']].copy()

# Back to wide format: dates as rows, segments as columns.
d7 = d6.pivot_table(index='date', columns='segment_id', values='relative_usage')

# Bound the ratio to [0, 1]: negative diffs become 0, above-average days become 1.
d8 = d7.clip(lower=0, upper=1)
d8

segment_id,1248017,1717839,17421855,17443790,18952377,2481821,3808938,4202076,4267589,7774409,8574425
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-11-25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-11-26,0.0,0.0,0.0,0.052052,0.0,0.0,0.07806,0.0,0.071201,0.0,0.066426
2020-11-27,0.129155,0.131289,0.22443,0.477629,0.0,1.0,1.0,0.0,0.326674,0.471837,0.304764
2020-11-28,0.645776,0.218815,0.311708,0.796048,0.711924,1.0,1.0,0.507317,1.0,1.0,1.0
2020-11-29,1.0,0.0,0.533705,0.141978,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-11-30,1.0,1.0,0.854762,1.0,0.976114,1.0,1.0,1.0,1.0,1.0,1.0
2020-12-01,1.0,0.59538,1.0,0.338436,0.0,1.0,1.0,1.0,1.0,1.0,0.0
2020-12-02,1.0,1.0,1.0,0.646789,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-12-03,0.563009,1.0,0.586996,0.260258,0.124136,0.841264,0.390299,0.597099,0.712013,0.591335,0.597833
2020-12-04,1.0,1.0,1.0,0.208963,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
from matplotlib.dates import DateFormatter

# One line per segment; lineplot draws on (and returns) the current axes.
ax = sns.lineplot(data=d8, dashes=False, marker='o')
# Anchor the legend just outside the right edge of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Month-day tick labels, tilted for readability.
ax.xaxis.set_major_formatter(DateFormatter("%m-%d"))
plt.xticks(rotation=45)
# Trailing None keeps the cell from echoing the last call's return value.
None

In [140]:
# Average the clipped relative usage across all segments for each date.
d8.mean(axis=1)

date
2020-11-25    1.000000
2020-11-26    0.024340
2020-11-27    0.369616
2020-11-28    0.744690
2020-11-29    0.788698
2020-11-30    0.984625
2020-12-01    0.721256
2020-12-02    0.967890
2020-12-03    0.569477
2020-12-04    0.928088
dtype: float64

In [150]:
def link2(a, id):
    """Return an HTML anchor pointing at a Strava segment page.

    Parameters:
        a: segment identifier, placed at the end of the segment URL.
        id: text shown as the link label (the caller passes the segment name).

    Returns:
        The ``<a href=...>label</a>`` markup as a string.

    Both values are HTML-escaped: the result is written into a table that is
    rendered with ``escape=False``, so markup characters in a segment name
    would otherwise be injected into the page verbatim.
    """
    from html import escape

    return f'<a href="https://www.strava.com/segments/{escape(str(a))}">{escape(str(id))}</a>'

def rideability_color(x):
    """Wrap a rideability score in a <div> whose background reflects the score.

    Scores above 80 are 'Chartreuse', scores above 30 (up to 80) are
    'DarkOrange', and everything else is 'OrangeRed'.
    """
    if x > 80:
        shade = 'Chartreuse'
    elif x > 30:
        shade = 'DarkOrange'
    else:
        shade = 'OrangeRed'
    return '<div style="background-color: {}">{}</div>'.format(shade, x)

# Build the HTML report from the last row of d8.
latest = d8.iloc[-1]
# The row label is the date of that row; format it day/month/year.
dateout = latest.name.strftime('%d/%m/%Y')
dfout = latest.to_frame(name='rideability')

# Bring in the segment names and distances, then turn each name into a link.
dfout = dfout.merge(md_meta, how='inner', left_on='segment_id', right_on='id')
dfout['link'] = dfout.apply(lambda row: link2(row['id'], row['name']), axis=1)
dfout['distance'] = dfout['distance'].map(lambda metres : "מטר %.0f" % metres)

# Express rideability as a whole-number percentage (rounded down).
dfout['rideability'] = dfout['rideability'].map(lambda share : math.floor(100*share))

# Keep only the report columns, in display order (this also drops id and name).
dfout = dfout[['rideability', 'link', 'distance']]

# escape=False lets the generated <a>/<div> markup through unchanged.
# NOTE(review): the HTML is rendered before the Hebrew column rename below,
# so the written file carries the English headers - confirm this is intended.
htmlout = dfout.to_html(formatters={'rideability': rideability_color},
                        render_links=True, escape=False, index=False, border=1)

# Hebrew column headers for the frame displayed in the notebook.
dfout = dfout.rename(columns={'rideability' : 'מדד רכיבות', 'link' : 'מקטע', 'distance' : 'אורך'})

fileout = "data/out/rides.html"
with open(fileout, "w", encoding="utf-8") as file:
    file.write(htmlout)

dfout

Unnamed: 0,מדד רכיבות,מקטע,אורך
0,100,"<a href=""https://www.strava.com/segments/12480...",מטר 799
1,100,"<a href=""https://www.strava.com/segments/17178...",מטר 3103
2,100,"<a href=""https://www.strava.com/segments/17421...",מטר 578
3,20,"<a href=""https://www.strava.com/segments/17443...",מטר 334
4,100,"<a href=""https://www.strava.com/segments/18952...",מטר 1354
5,100,"<a href=""https://www.strava.com/segments/24818...",מטר 1664
6,100,"<a href=""https://www.strava.com/segments/38089...",מטר 259
7,100,"<a href=""https://www.strava.com/segments/42020...",מטר 1285
8,100,"<a href=""https://www.strava.com/segments/42675...",מטר 758
9,100,"<a href=""https://www.strava.com/segments/77744...",מטר 948
