In [1]:
import pandas as pd
import time
import datetime
import json
import re
import math
import ims
import utils

# Analyse segment statistics and generate an HTML table for public consumption

In [2]:
# gather data
# Segment metadata; used throughout the notebook as a pandas DataFrame
# (columns are added to it and it is merged with other tables).
md = utils.get_segment_metadata()
# Ride-log records; pivoted further down on 'date' / 'segment_id' / 'effort_count'.
# NOTE(review): the file paths echoed under this cell are presumably printed
# by the utils helpers while loading -- confirm in utils.
rl_ = utils.get_ridelogs()

data/ridelogs/segments-20201124.json
data/ridelogs/202012.json
data/ridelogs/202011.json


In [3]:
# Add closest climate station 

#TODO: cache the result and store back in segments file
from stations import closest_station

def find_closest(x):
    """Return the climate station closest to a coordinate pair.

    Parameters
    ----------
    x : str or sequence
        Either the text form of a two-element list, e.g. ``"[32.1, 34.8]"``
        (any spacing around the comma is accepted), or an actual
        list/tuple holding the two numbers.

    Returns
    -------
    Whatever ``closest_station`` returns when called with the first and
    second coordinate, in that order.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to ``float``.
    IndexError
        If fewer than two coordinates are supplied.
    """
    if isinstance(x, str):
        # Split on the bare comma: float() ignores surrounding whitespace, so
        # "[1,2]", "[1, 2]" and " [1 ,  2] " are all parsed the same way.
        parts = x.strip().strip('][').split(',')
    else:
        parts = list(x)
    xl = [float(p) for p in parts]
    return closest_station(xl[0], xl[1])

#md_meta.drop(columns='closest', inplace=True)
# Create the column only when it is missing, so station ids that are already
# present in the metadata are kept instead of being looked up again.
if 'closest_ims' not in md.columns:
    md['closest_ims'] = None
# Fill the closest station for rows that do not have one yet. The fallback
# reads 'closest_ims' (the column being filled); there is no 'closest' column.
md['closest_ims'] = md.apply(
    lambda r: find_closest(r['start_latlng']) if pd.isnull(r['closest_ims']) else r['closest_ims'],
    axis=1,
)

# save a table aside: the per-segment descriptive columns joined back in later
md_meta = md[['id', 'name', 'distance', 'region_name', 'region_url', 'closest_ims']].copy()

In [4]:
# Estimate the historical average usage per week from the number of days each
# segment has existed and its total effort count.
# BUGBUG: Strava also counts efforts performed before the creation date
# Parse both timestamps as UTC so the subtraction below never mixes tz-naive
# and tz-aware values (which pandas refuses to subtract).
for c in ('created_at', 'time_retrieved'):
    md[c] = pd.to_datetime(md[c], utc=True)
md['hist_length'] = md['time_retrieved'] - md['created_at']
md['hist_length_days'] = md['hist_length'].dt.days
# NOTE: a segment younger than one day has hist_length_days == 0, which makes
# weekly_avg non-finite (inf or NaN) rather than raising.
md['weekly_avg'] = 7*md['effort_count'] / md['hist_length_days']

#md[['id', 'name', 'weekly_avg']]

In [5]:
# Add the distribution across days of the week


def _trim_after_ampm(s):
    """Keep the text up to and including the last 'am'/'pm'; drop the rest."""
    return re.sub('(.*[ap]m).*', '\\1', s)


tf = pd.read_csv('data/trailforks.csv')
# keep the raw strings, then parse the cleaned ones into real timestamps
tf['date_orig'] = tf['date']
trimmed_dates = tf['date'].apply(_trim_after_ampm)
tf['date'] = pd.to_datetime(trimmed_dates, format='%b %d, %Y @ %I:%M%p', errors='raise')

# weekday number of every row (pandas convention: Monday is 0)
tf['weekday'] = tf['date'].dt.weekday
# fraction of the rows in trailforks.csv that fall on each weekday
wdist = tf['weekday'].value_counts(normalize=True)
wdist = pd.DataFrame(data={'weekday_weight': wdist, 'weekday': wdist.index.values})

In [6]:
# Apply weekly distribution to per-trail usage
# NOTE: this cell rebinds `md`; running it twice without re-running the cells
# above would cross-join an already expanded table.
left = md
right = wdist
# cross join: a constant helper key pairs every segment row with every weekday row.
# drop(columns=...) is used because the positional axis argument of drop()
# is no longer accepted by pandas 2.x.
md = left.assign(key=1).merge(right.assign(key=1), on='key').drop(columns='key')
# expected efforts on a given weekday = weekly average * that weekday's share
md['daily_avg'] = md['weekly_avg'] * md['weekday_weight']
#md['id'] = md['id'].map(str)


In [7]:
#mdmd = md[['id', 'name', 'weekday', 'daily_avg']]
#md_meta.query('id == "17443790"')
#mdmd

In [8]:
# Tabulate ridelog data: one row per date, one column per segment id,
# cells holding effort_count (pivot_table's default aggregation is the mean)
rl2 = rl_.pivot_table(index='date', columns='segment_id', values='effort_count')
# swap in an unnamed DatetimeIndex so the table can be resampled by day
rl2.index = pd.DatetimeIndex(rl2.index.values)

In [9]:
# resample daily, interpolate missing values, and diff against the previous day
# (.bfill() is the non-deprecated equivalent of fillna(method='bfill'); it fills
# leading gaps that interpolation cannot reach)
daily = rl2.resample('1D').interpolate().bfill().diff()


In [10]:
# add the daily average in each segment

# first, convert the table to long format: one row per (date, segment) pair,
# with the weekday number alongside so it can be matched to the averages
all_segs = daily.columns
d4 = (
    daily.reset_index()
    .melt(id_vars='index', value_vars=all_segs)
    .rename(columns={'index': 'date'})
    .assign(weekday=lambda t: t['date'].dt.weekday)
)

#print(d4.dtypes)
#print(md.dtypes)
# tack on the daily averages, then express usage relative to that average
md_short = md[['id', 'weekday', 'daily_avg', 'closest_ims']]
d5 = (
    d4.merge(md_short, how='left', left_on=['segment_id', 'weekday'], right_on=['id', 'weekday'])
    .assign(relative_usage=lambda t: t['value'] / t['daily_avg'])
)

In [11]:
# Rain data for the rows in d5, obtained from the project's utils helper.
# NOTE(review): the result is merged further down on ['closest_ims', 'date']
# and appears to be the source of the 'rain_mm' column -- confirm in utils.
rain_days = utils.get_rain_days(d5)

42##2020/12/01
42##2020/11/30
42##2020/12/06
42##2020/12/05
42##2020/11/26
42##2020/11/25
42##2020/11/27
42##2020/11/26
42##2020/11/28
42##2020/11/27
42##2020/11/29
42##2020/11/28
42##2020/11/30
42##2020/11/29
42##2020/12/02
42##2020/12/01
42##2020/12/03
42##2020/12/02
42##2020/12/04
42##2020/12/03
42##2020/12/05
42##2020/12/04


In [12]:
# start from just the columns needed for the output
d6 = d5[['date', 'segment_id', 'relative_usage']].copy()

# Add rain measurements: first attach the per-segment metadata (which carries
# the closest station), then look up rain by station and date
station_day = ['closest_ims', 'date']
segments_with_stations = (
    d6.merge(md_meta, how='left', left_on='segment_id', right_on='id')
    .drop(columns='id')
)
segments_with_rainfall = segments_with_stations.merge(
    rain_days, how='left', left_on=station_day, right_on=station_day
)

d6 = segments_with_rainfall.copy()


In [14]:
# add rainfall weekly running sum
d6.sort_values('date', inplace=True)
# transform() always returns a result aligned to d6's own index. groupby.apply()
# prepends the group key to the index on pandas >= 2.0, which makes the column
# assignment fail.
# Windows with fewer than 7 rows, or containing a missing rain value, come out
# as NaN from rolling().sum() and are reported as 0 by the fillna.
d6['rain_7d'] = (
    d6.groupby('segment_id')['rain_mm']
    .transform(lambda x: x.rolling(7).sum())
    .fillna(0)
)

# Add soil moisture
# group_keys=False keeps a like-indexed result aligned to d6 on every pandas version.
# NOTE(review): assumes utils.bathtub returns one value per input row -- confirm.
d6['soil_moisture'] = d6.groupby('segment_id', group_keys=False)['rain_mm'].apply(utils.bathtub)

In [15]:
# trim to just most recent day
lastdate = d6['date'].max()
d7 = d6[d6['date'] == lastdate].copy()

# re-tabulate
#d7 = pd.pivot_table(d6, index='date', values='relative_usage', columns='segment_id')

#pd.to_datetime(d7.index.values).weekday
# cap relative usage to the [0, 1] range before it is turned into a percentage
ru = d7['relative_usage'].clip(lower=0, upper=1)
d7['relative_usage'] = ru

In [16]:
# Disabled exploratory line plot; the `if False` guard means none of it runs.
# NOTE(review): `sns` and `plt` are not imported in the import cell of this
# notebook and `d8` is only assigned in a later cell, so enabling this block
# as-is would raise NameError.
if False:
    from matplotlib.dates import DateFormatter
    sns.lineplot(data=d8, dashes=False, marker='o')
    # place the legend outside the axes, to the right of the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # month-day tick labels, rotated for readability
    plt.gca().xaxis.set_major_formatter(DateFormatter("%m-%d"))
    plt.xticks(rotation=45)
    # presumably a trailing None to avoid echoing the last call's return value
    None

In [17]:
# prepare for display as nice HTML
def link2(a, id):
    """Render an HTML anchor element.

    Both values are HTML-escaped because the resulting table is rendered with
    ``escape=False``: a name containing ``<``, ``&`` or a quote would
    otherwise be injected into the page as markup.

    Parameters
    ----------
    a : object
        Link target; converted with ``str()`` and placed in ``href``.
    id : object
        Link text; converted with ``str()``.

    Returns
    -------
    str
        ``<a href="...">...</a>`` with both parts escaped.
    """
    import html  # local import: only this helper needs it

    href = html.escape(str(a), quote=True)
    label = html.escape(str(id), quote=True)
    return f'<a href="{href}">{label}</a>'

def rideability_color(x):
    """Wrap a rideability score in a <div> whose background reflects the score.

    Scores above 80 get Chartreuse, scores above 30 get DarkOrange, and
    everything else gets OrangeRed. The score itself is the div's content.
    """
    if x > 80:
        colour = 'Chartreuse'
    elif x > 30:
        colour = 'DarkOrange'
    else:
        colour = 'OrangeRed'
    return '<div style="background-color: {}">{}</div>'.format(colour, x)

d8 = d7
# Take a copy for the output table so that formatting applied to dfout later
# cannot alter d7/d8 through a shared reference.
dfout = d8.copy()

In [19]:
#format the date
dateout = lastdate.strftime('%d/%m/%Y')

# Build the display table on a fresh frame: rename() without inplace returns a
# new object, so the frame that dfout refers to on entry is left untouched.
table = dfout.rename(columns={'relative_usage': 'rideability'})

# clickable segment and region names
table['link'] = table.apply(lambda x: link2(f"https://www.strava.com/segments/{x['segment_id']}", x['name']), axis=1)
table['region_link'] = table.apply(lambda x: link2(x['region_url'], x['region_name']), axis=1)

# numeric columns as display strings
table['distance'] = table['distance'].map(lambda x: "%.0f" % x)
# rideability as a whole percentage
# NOTE: math.floor raises ValueError on NaN, i.e. when a segment has no usable relative usage.
table['rideability'] = table['rideability'].map(lambda x: math.floor(100*x))
table['rain_mm'] = table['rain_mm'].map(lambda x: "%.1f" % x)
table['rain_7d'] = table['rain_7d'].map(lambda x: "%.0f" % x)

# re-order columns (selecting them also discards date, segment_id, name and
# every other helper column), then switch to the Hebrew display headers
dfout = table[['rideability', 'link', 'distance', 'region_link', 'rain_mm', 'rain_7d']].rename(
    columns={'rideability' : 'מדד רכיבות', 'link' : 'מקטע', 'distance' : 'אורך (מטר)', 'region_link' : 'איזור', 'rain_mm' : 'גשם יומי מ״מ', 'rain_7d' : 'גשם מצטבר שבועי מ״מ'}
)

# escape=False because the link columns already contain HTML
htmlout = dfout.to_html(formatters={'מדד רכיבות': rideability_color},
                        render_links=True, classes="table",
                        escape=False, index=False, border=1)


# Add decorations and save to file

title = 'מדד רכיבות'
# "last updated" line; a closing </br> tag is not valid HTML, so use <br> on both sides
update_ts = '<br>' +  "עדכון אחרון: {}".format(dateout) + '<br>\n'

# Read the page fragments as UTF-8 explicitly: the page is declared and written
# as UTF-8, and the platform default encoding is not UTF-8 everywhere.
with open('preamble.txt', encoding="utf-8") as f:
    preamble = " ".join([l.rstrip() for l in f])

with open('epilog.txt', encoding="utf-8") as f:
    epilog = "\n".join([l.rstrip() for l in f])

# document head (Bootstrap stylesheet, right-to-left body), then the preamble,
# the update line and the opening of the container that holds the table
html_preamble = (
    '<html><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">\n'
    '<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">\n'
    '<title>' + title + '</title>\n</head><body dir=rtl>\n'
    + preamble + "\n" + update_ts + '<div class="container">\n'
)
# NOTE(review): nothing here closes <body>/<html>; presumably epilog.txt does -- confirm.
htmlout = html_preamble + htmlout + "</div>\n" + epilog

fileout = "data/out/rides.html"

with open(fileout, "w", encoding="utf-8") as file:
    file.write(htmlout)