In [None]:
# Model the usage of MTB trails as a function of weather conditions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import utils
import math
from sklearn.linear_model import LinearRegression

In [None]:
# gather data
# Load per-segment metadata through the project-local `utils` helper. Later
# cells read the 'id', 'name' and 'closest_ims' columns of this frame.
md = utils.get_segment_metadata()
#md['closest_ims'] = md['closest_ims'].astype(int)

In [None]:
# Raw ride logs, as returned by the project-local helper
rl_ = utils.get_ridelogs()

# Keep only the metadata columns that later cells use; copy so that `md`
# is an independent frame rather than a slice of the loaded one
md = md.loc[:, ['id', 'name', 'closest_ims']].copy()

In [None]:
# Build a date x segment table of effort counts (one column per segment_id)
rl2 = pd.pivot_table(rl_, values='effort_count', index='date', columns='segment_id')
rl2 = rl2.set_index(pd.DatetimeIndex(rl2.index.values))

# Put the table on a daily grid, fill the gaps by interpolation and take the
# day-over-day difference. Differences can go negative if Strava removes
# rides, so floor them at zero.
daily = rl2.resample('1D').interpolate().diff().clip(lower=0)


In [None]:
# normalize by day-of-week average
# Reshape the wide daily table (one column per segment) into long form, with
# the observation date in 'date' and the ride count in 'rides'.
all_segs = daily.columns
d2 = daily.reset_index()
d3 = d2.melt(id_vars = 'index', value_vars=all_segs)
d4 = d3.rename(columns = {'index' : 'date', 'value' : 'rides'})

# Add a pseudo-segment 'ALL' holding the per-day total over every segment.
all_rides = pd.DataFrame(d4.groupby('date')['rides'].sum()).reset_index()
all_rides['segment_id'] = 'ALL'
#all_rides['closest_ims'] = '000'

# DataFrame.append was removed in pandas 2.0. pd.concat is the supported
# equivalent and, like append, keeps the original row labels.
d4 = pd.concat([d4, all_rides])

d4['weekday'] = d4['date'].dt.weekday

In [None]:
# Mean rides per (segment, weekday). Only 'rides' is averaged: taking the mean
# of the whole frame would also average 'date' on newer pandas, and the extra
# column would then collide with d4['date'] in the merge (date_x / date_y).
by_dow = (
    d4.groupby(['segment_id', 'weekday'])[['rides']]
    .mean()
    .rename(columns={'rides' : 'rides_dow'})
)
d5 = d4.merge(by_dow, how='left', left_on=['segment_id', 'weekday'], right_on=['segment_id', 'weekday'])
# normalize (nrides = normalized rides)
d5['nrides'] = d5['rides'] / d5['rides_dow']

# negative values might come up if Strava removes rides
# positive values which are too high are not useful for the analysis
# Assign the clipped column back: an inplace clip on d5['nrides'] acts on a
# temporary Series and is not guaranteed to update d5 (it does not under
# pandas copy-on-write).
d5['nrides'] = d5['nrides'].clip(lower=0, upper=1.5)

In [None]:
# add the closest IMS station
# Right join on the metadata: every row of `md` is kept, while ride rows whose
# segment_id has no matching md['id'] are dropped.
# NOTE(review): that presumably drops the synthetic 'ALL' rows added earlier,
# unless md contains an 'ALL' id -- confirm this is intended.
d6 = d5.merge(md[['id', 'closest_ims', 'name']], how='right', left_on=['segment_id'], right_on=['id'])

In [None]:

#d6['closest_ims'] = d6['closest_ims'].astype(int)
#d6['closest_ims'].fillna('000')
#d6['closest_ims'] = d6['closest_ims'].astype(str)
# Fetch rain data for the rows in d6 through the project-local helper; the
# result is merged back onto d6 on ('closest_ims', 'date') in the next cell.
rain_days = utils.get_rain_days(d6)
#d6

In [None]:
# Add rain measurements
data = d6.merge(rain_days, how='left', left_on=['closest_ims', 'date'], right_on=['closest_ims', 'date'])

# cumulative measures of rainfall

# Sort chronologically so the rolling window runs over rows in date order.
data = data.sort_values('date')
# Rolling 7-row rain total per segment. transform() returns a result aligned
# to data's own index; groupby.apply() prepends the group key to the index on
# pandas >= 2.0, which makes the column assignment fail.
data['rain_7d'] = (
    data.groupby('segment_id')['rain_mm']
    .transform(lambda x : x.rolling(7).sum())
    .fillna(0)
)
#data['soil_moisture'] = data.groupby('segment_id')['rain_mm'].apply(utils.bathtub)
df_orig = data.copy()

In [None]:
#data = df_orig.query("segment_id == '7774409'").copy()

In [None]:
#data.groupby('segment_id').sum().sort_values('rides')
#df_orig

In [None]:
def bathtub_set(data_, capacity, drainage):
    """Run the `utils` bathtub model over the rain series of a frame.

    Passes the raw values of ``data_['rain_mm']`` to ``utils.bathtub_``
    together with the given ``capacity`` and ``drainage`` and returns
    whatever that helper returns.
    """
    rain_values = data_['rain_mm'].values
    return utils.bathtub_(rain_values, capacity=capacity, drainage=drainage)

def regress(X, y):
    """Fit a one-variable linear regression of ``y`` on ``X``.

    Pairs in which either value is missing are dropped before fitting.

    Returns a dict with the keys 'coef', 'intercept' and 'score' (the value
    of ``LinearRegression.score`` on the fitted data). If fitting raises
    ``ValueError`` the coefficients are ``None`` and the score is -1.
    """
    paired = pd.DataFrame(data={'X' : X, 'y' : y}).dropna()
    n_obs = len(paired)
    # scikit-learn expects 2-D inputs: one row per observation, one column
    features = paired['X'].values.reshape(n_obs, 1)
    targets = paired['y'].values.reshape(n_obs, 1)
    try:
        model = LinearRegression().fit(features, targets)
        return {
            'coef' : model.coef_[0][0],
            'intercept' : model.intercept_[0],
            'score' : model.score(features, targets),
        }
    except ValueError:   # probably not enough data
        return {'coef' : None, 'intercept' : None, 'score' : -1}

def best_bathtub(data_, capacities=range(0, 80, 4), drainages=range(0, 25)):
    """Grid-search the bathtub parameters that best explain normalized rides.

    For every (capacity, drainage) pair the bathtub series computed by
    ``bathtub_set`` is regressed against ``data_['nrides']`` with ``regress``.

    Parameters
    ----------
    data_ : DataFrame
        Must provide the 'rain_mm' and 'nrides' columns.
    capacities, drainages : iterable, optional
        Candidate parameter values. The defaults reproduce the original
        hard-coded grid.

    Returns
    -------
    Series
        The row ('capacity', 'drainage', 'coef', 'intercept', 'score') with
        the highest score, or the first grid row if no score is usable.
    """
    mydata = data_.copy()
    out = []

    for c in capacities:
        for d in drainages:
            # TODO:
            # 1. downsample the observations with no moisture and many rides
            p = regress(bathtub_set(mydata, c, d), mydata['nrides'])
            out.append([c, d, p['coef'], p['intercept'], p['score']])

    cdf = pd.DataFrame(out, columns=['capacity','drainage', 'coef', 'intercept', 'score'])
    scores = cdf['score']
    # idxmax() on an all-NaN column returns NaN on older pandas and raises on
    # newer ones, so test for usable scores explicitly instead of inspecting
    # the result afterwards.
    best = scores.idxmax() if scores.notna().any() else 0
    return cdf.iloc[best]

In [None]:
# Fit the bathtub parameters separately for every segment and collect one
# result record per segment.
df = df_orig.copy()
out = []
for seg in df['segment_id'].unique():
    mydata = df[df['segment_id'] == seg]
    res = {**best_bathtub(mydata).to_dict(), 'segment_id': seg}
    out.append(res)

In [None]:
params = pd.DataFrame(out)
# Compute the moisture value needed to reach nrides=1.
# The fitted line is nrides = coef * moisture + intercept, so solving for
# nrides = 1 gives (1 - intercept) / coef. (-intercept / coef would be the
# moisture at which the fitted nrides is 0.)
params['x1'] = (1 - params['intercept'])/params['coef']
# Maximum days to reach this moisture value
params['dfactor_lr'] = (params['capacity'] - params['x1'])/params['drainage']
params['dfactor'] = params['capacity']/params['drainage']
# Attach the segment metadata (name, closest_ims) to each parameter row
params = params.merge(md, how='left', left_on='segment_id', right_on='id')
params[['capacity', 'drainage', 'score', 'segment_id']].to_csv('data/segments/params.csv', index=False)

In [None]:
# Best-fit capacity vs. drainage per segment, sized and coloured by fit score.
# The style is set before the axes are created: seaborn styles only apply to
# axes created afterwards.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(data=params, x='capacity', y='drainage', size='score', hue='score', ax=ax)


In [None]:

# One scatter panel per segment: normalized rides against modelled soil
# moisture, using the best-fit bathtub parameters stored in `params`.
# Missing segment ids are skipped: a NaN never matches in query(), so the
# lookups below would come back empty and .iloc[0] would raise.
names = df['segment_id'].dropna().unique()
# squeeze=False always returns a 2-D array of axes, so the indexing below
# also works when there is only a single segment.
fig, ax = plt.subplots(figsize=(10,50), nrows=len(names), ncols=1, squeeze=False)

for vi, seg in enumerate(names):
    mydata = df.query("segment_id == @seg").copy()
    p = params.query("id == @seg")
    capacity = p.iloc[0].capacity
    drainage = p.iloc[0].drainage
    score = p.iloc[0]['score']
    name = mydata.iloc[0]['name']
    mydata['soil_moisture'] = bathtub_set(mydata, capacity, drainage)
    sns.scatterplot(data=mydata,
                    y='nrides', x='soil_moisture',
                    hue='segment_id', marker='o',
                    ax=ax[vi, 0]).set_title('%s c=%d d=%d score=%.2f' % (name, capacity, drainage, score))

In [None]:
# Select a single segment for the detail plots in the following cells
# ('ALL' is the alternative noted in the trailing comment).
seg = '5230474' # 'ALL'
mydata = df.query("segment_id == @seg")
#mydata



In [None]:
# Normalized rides against the rain measurement merged in for the same date,
# for the segment selected above
sns.scatterplot(data = mydata, y ='nrides', x='rain_mm')

In [None]:
# Time series of normalized rides for the selected segment
nrides_by_date = mydata.set_index('date')[['nrides']]
sns.lineplot(data=nrides_by_date)

In [None]:
# Share of each segment's average rides that falls on each weekday, in percent
data = by_dow.reset_index()
#sns.barplot(data=data, hue='segment_id', x='rides_dow', y='weekday', orient='h')
#data.plot.barh()
weekday_means = data.pivot_table(index='weekday', columns='segment_id', values='rides_dow')
# Scale every segment column so that its weekdays add up to 100
data = weekday_means.apply(lambda col: col*100/sum(col), axis=0)
# One stacked bar per segment, split by weekday
data.T.plot(kind="bar", stacked=True)
data