In [None]:
# Model the usage of MTB trails as a function of weather conditions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import utils
import math
from sklearn.linear_model import LinearRegression, Ridge
import numpy as np
from datetime import date, timedelta

In [None]:
# Load the per-segment metadata and keep only the rows whose 'active_modeling' flag is set
md_all = utils.get_segment_metadata()
is_active = md_all['active_modeling']
md = md_all[is_active]

# Optional cast of the station id, currently disabled:
#md['closest_ims'] = md['closest_ims'].astype(int)

In [None]:
# Ride logs as returned by utils
rl_ = utils.get_ridelogs()

# Of the metadata only the id, the display name and the station key are used from here on
keep_cols = ['id', 'name', 'closest_ims']
md = md[keep_cols].copy()

In [None]:
# Continue on a deep copy so that rl_ keeps the ride logs exactly as loaded
d5 = rl_.copy(deep=True)

In [None]:
# Attach the segment name and its closest IMS station to the ride-log rows.
# how='right' keeps every segment listed in md (segments without ride logs appear
# with empty ride-log columns) and drops ride logs of segments that are not in md.
segment_cols = md[['id', 'closest_ims', 'name']]
d6 = pd.merge(d5, segment_cols, how='right', left_on='segment_id', right_on='id')
#md[['name','closest_ims']]

In [None]:
# Weather records for the merged ride logs. Judging by the merge in the next cell the
# result is keyed by 'closest_ims' and 'date' — TODO confirm against utils.get_weather_days
weather_days = utils.get_weather_days(d6)

In [None]:
# Join the weather measurements onto the ride logs by station and date
weather_keys = ['closest_ims', 'date']
data = d6.merge(weather_days, how='left', left_on=weather_keys, right_on=weather_keys)

# Chronological order: the cumulative rainfall measures computed later roll over rows
data = data.sort_values('date')

In [None]:
# Lockdown indicator: 1 for every row dated after 2021-01-07, 0 for all other rows
after_cutoff = data['date'] > '2021-01-07'
data['lockdown'] = after_cutoff.astype('int64')

In [None]:
# Rolling rain total over the last 7 rows of each segment (missing values count as 0).
# The window counts rows, not calendar days, and the first six rows of every segment
# stay NaN because rolling(7) needs a full window.
# transform() hands back a result aligned with data's own index. groupby.apply() adds
# the group key as an extra index level for this kind of function since pandas 2.0,
# which makes the column assignment fail.
rain_filled = data[['segment_id', 'rain_mm']].fillna(0)
data['rain_7d'] = rain_filled.groupby('segment_id')['rain_mm'].transform(
    lambda x: x.rolling(7).sum().clip(lower=0))
#data['soil_moisture'] = data.groupby('segment_id')['rain_mm'].apply(utils.bathtub)

# Snapshot used as the starting point of the modelling cells below
df_orig = data.copy()

In [None]:
def bathtub_set(data_, soilmodel, **kwargs):
    """Assemble the regression feature matrix for one soil-moisture model.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must provide the columns 'rain_mm', 'wind_ms' and 'lockdown'.
    soilmodel : callable
        Called as ``soilmodel(data_[['rain_mm', 'wind_ms']], **kwargs)``; whatever it
        returns is flattened into the soil column.
    **kwargs
        Forwarded to ``soilmodel`` unchanged.

    Returns
    -------
    tuple
        ``(X, cmap)`` where X is a 2-D array with the columns soil, rain, lockdown
        (in that order) and cmap is the list of those column names.
    """
    column_names = ['soil', 'rain', 'lockdown']

    weather = data_[['rain_mm', 'wind_ms']]
    soil_col = np.array(soilmodel(weather, **kwargs)).reshape(-1)

    # rain indicator: 1 where rain_mm of that row is above 5, otherwise 0
    # (a missing rain_mm compares as False and therefore gives 0)
    rain_col = (np.array(data_['rain_mm']) > 5).astype(int)

    lockdown_col = np.array(data_['lockdown']).reshape(-1)

    features = np.column_stack((soil_col, rain_col, lockdown_col))
    return (features, column_names)

def regress(X, y):
    """Fit a sample-weighted, practically unregularised linear model of y on X.

    Rows with a NaN in X or y are dropped first. All rows whose soil value
    (column 0) is exactly 0 share a total weight of one; every other row has
    weight one.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Column 0 is read as soil moisture; the sanity checks treat
        columns 1 and 2 as the rain and lockdown indicators (the layout produced by
        ``bathtub_set``).
    y : pandas.Series
        Target values, matched to the rows of X by position.

    Returns
    -------
    dict
        Keys 'coef', 'intercept' and 'score'.
        * Fit skipped (two or fewer usable rows, or no soil value <= 1) or failed
          with ValueError: 'coef' and 'intercept' are None, 'score' is -1.
        * Otherwise 'coef' holds one coefficient per column of X, 'intercept' the
          intercept and 'score' the value of ``reg.score`` under the same weights,
          replaced by -1 when any of the first three coefficients is positive.
    """
    # remove NaNs. We do this by stuffing everything into a dataframe first
    dfXy = pd.DataFrame(X)
    dfXy['y'] = y.values
    dfXy = dfXy.dropna()
    # now unpack
    Xy = np.array(dfXy.values)
    X = Xy[:, :-1]
    y = Xy[:, -1].reshape(-1, 1)

    nrows = X.shape[0]
    # skip if there's too little data
    # skip if the moisture model didn't give us examples of dry soil (nothing at or below 1)
    if (nrows <= 2) or (X[:, 0].min() > 1):
        return {'coef' : None, 'intercept' : None, 'score' : -1}

    # We typically get many many examples with X=0 (dry soil), down-weigh them
    Xsoil = X[:, 0].reshape(-1)
    weights = np.ones(Xsoil.shape)
    nzeros = np.count_nonzero(Xsoil == 0)
    if nzeros > 0:
        # we collapse all the zero points to have a total weight of one
        weights[Xsoil == 0] = 1/nzeros

    try:
        # Ridge lost its `normalize` argument in scikit-learn 1.2, so it is not passed.
        # alpha is tiny and only guards against a singular system; the penalty is
        # negligible with or without feature scaling.
        reg = Ridge(alpha=1E-8).fit(X, y, sample_weight=weights)
        coef = reg.coef_[0]
        score = reg.score(X, y, sample_weight=weights)
        # apply some sanity checks: more soil moisture, rain today or lockdown must
        # mean fewer rides, never more
        if (coef[:3] > 0).any():
            score = -1
        return {'coef' : coef, 'intercept' : reg.intercept_[0], 'score' : score}
    except ValueError:   # probably not enough data
        return {'coef' : None, 'intercept' : None, 'score' : -1}

def best_bathtub(data_):
    """Grid-search both soil-moisture models and return the best-scoring fit.

    Every parameter combination is turned into features with ``bathtub_set`` and
    fitted with ``regress`` against ``data_['nrides']``; only fits with a score
    above -1 are kept.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Rows of one segment. Needs 'nrides' plus the columns read by ``bathtub_set``.

    Returns
    -------
    pandas.Series or None
        Series with the entries 'score' and 'par' of the best candidate, where
        'par' is a dict holding the model label 'f', the model parameters, the
        intercept and one 'c_<feature>' coefficient per feature. None when no
        combination produced an acceptable fit. Ties go to the candidate that was
        tried first (geometric model before the basic one).
    """
    mydata = data_.copy()
    out = []

    # Try the geometric model
    # (a coarser grid such as np.arange(4, 50, 8) with wlist = [0, 1] is handy for quick runs)
    clist = list(np.arange(4, 80, 4))

    # NOTE(review): the three ranges abut at 0.65 and 0.8; depending on float rounding
    # np.arange may emit those end points twice, which would only repeat work — confirm
    dlist = list(np.arange(0.5, 0.65, 0.05))
    dlist.extend(np.arange(0.65, 0.8, 0.05))
    dlist.extend(np.arange(0.8, 1, 0.05))

    wlist = list(np.arange(0, 3, 0.25))

    for c in clist:
        for d in dlist:
            for w in wlist:
                candidate = _score_candidate(mydata, utils.bathtub_geom_, 'bathtub_geom',
                                             capacity=c, drainage_factor=d, fwind=w)
                if candidate is not None:
                    out.append(candidate)

    # Try the basic model
    clist = list(np.arange(1, 10, 0.5))
    clist.extend(range(10,80,4))

    dlist = list(np.arange(5, 10, 0.5))
    dlist.extend(range(10,25))

    wlist = list(np.arange(0, 3, 0.25))

    for c in clist:
        for d in dlist:
            # we want to ensure some margin between capacity and drainage
            if not (c > d + 2):
                continue
            for w in wlist:
                candidate = _score_candidate(mydata, utils.bathtub_, 'bathtub',
                                             capacity=c, drainage=d, fwind=w)
                if candidate is not None:
                    out.append(candidate)

    if not out:
        return None
    cdf = pd.DataFrame(out, columns=['score', 'par'])
    # every stored score is > -1 and therefore not NaN, so idxmax always names a row
    return cdf.iloc[cdf['score'].idxmax()]


def _score_candidate(data_, soilmodel, label, **model_kwargs):
    """Fit one parameter combination; return [score, parameter dict] or None if rejected."""
    X, cmap = bathtub_set(data_, soilmodel, **model_kwargs)
    p = regress(X, data_['nrides'])
    if not (p['score'] > -1):
        return None
    outdict = {'f': label}
    outdict.update(model_kwargs)
    outdict['intercept'] = p['intercept']
    for ci in range(len(cmap)):
        outdict['c_' + cmap[ci]] = p['coef'][ci]
    return [p['score'], outdict]

In [None]:
# Fit every segment on its own and collect the winning parameter set of each one
df = df_orig.copy()
out = []
seglist = df['segment_id'].unique()
#seglist = ['17421855']
for seg in seglist:
    print(seg, end="...")   # progress indicator, one entry per segment
    mydata = df.query("segment_id == @seg")
    res = best_bathtub(mydata)
    if res is None:
        # no acceptable fit for this segment
        continue
    res = {**res.to_dict(), 'segment_id': seg}
    out.append(res)
print("")

In [None]:
# One row per fitted segment: the winning model, its parameters and the derived
# "days to dry" (dtd)
params = pd.DataFrame(out)
# Compute the days to dry; start from NaN (not None) so the column stays numeric
params['dtd'] = np.nan

# unpack the parameter dicts into one column per parameter
exploded = pd.DataFrame.from_records(params['par'])
params = pd.concat([params, exploded], axis='columns')

# check we have all needed columns. A model that never won has no column of its own;
# fill with NaN rather than None so the arithmetic below cannot hit a NoneType
for c in ['drainage', 'drainage_factor']:
    if c not in params.columns:
        params[c] = np.nan


# We compute the moisture value which yields 0.9 of the intercept (soil is 90% dry)
params['y90'] = 0.9*exploded['intercept']
params['x90'] = (params['y90'] - exploded['intercept'])/exploded['c_soil']

# float view of everything that enters the dtd formulas
num = params[['capacity', 'drainage', 'drainage_factor', 'x90']].astype(float)

# compute for the additive model: drain from full capacity down to x90 at `drainage` per step
rows = (params['f'] == 'bathtub')
params.loc[rows, 'dtd'] = ((num['capacity'] - num['x90'])/num['drainage'])[rows]

# compute for the geometric model
rows = (params['f'] == 'bathtub_geom')
# how many times to multiply by the factor until we reach a value of x90?
# capacity * factor**n == x90  =>  n = log(x90 / capacity) / log(factor)
# (x90 is the moisture value; y90 is a ride count and does not belong in this formula)
with np.errstate(divide='ignore', invalid='ignore'):
    # non-positive ratios give NaN/inf instead of aborting the whole cell
    n_steps = np.log(num['x90']/num['capacity'])/np.log(num['drainage_factor'])
params.loc[rows, 'dtd'] = n_steps[rows]

params = params.merge(md, how='left', left_on='segment_id', right_on='id').drop(columns='id')
params[['segment_id', 'score', 'par']].to_csv('data/segments/params.csv', float_format='%.3g', index=False)

params[['name', 'segment_id', 'dtd', 'score', 'closest_ims', 'capacity', 'drainage', 'drainage_factor', 'fwind', 'c_lockdown']].sort_values('dtd')

In [None]:
# Where do the winning parameters sit in the search grid? One panel per soil model.
fig, ax = plt.subplots(2, 1, figsize=(8,8))
sns.set_style('ticks')

exploded = pd.DataFrame.from_records(params['par'])
exploded['score'] = params['score']

# (model label, column shown on the y axis): additive model on top, geometric below
panels = [('bathtub', 'drainage'), ('bathtub_geom', 'drainage_factor')]
for axis, (model, ycol) in zip(ax, panels):
    rows = (exploded['f'] == model)
    if rows.any():
        sns.scatterplot(data=exploded[rows], x='capacity', y=ycol, size='score', hue='fwind', ax=axis)


In [None]:
# One panel per segment: rides against the soil moisture of that segment's winning model
names = df['segment_id'].unique()
# squeeze=False keeps the axes in an array even when there is only one segment
fig, ax = plt.subplots(figsize=(10,50), nrows=len(names), ncols=1, squeeze=False)
ax = ax[:, 0]

for vi in range(len(names)):
    seg = names[vi]
    mydata = df.query("segment_id == @seg").copy()
    p = params.query("segment_id == @seg")
    if len(p) == 0:
        # no model was fitted for this segment; its panel stays empty
        continue

    par = p.iloc[0]['par']
    f = par['f']
    name = mydata.iloc[0]['name']
    score = p.iloc[0]['score']
    capacity = par['capacity']
    fwind = par['fwind']

    # rebuild the features with the winning parameters of the winning model
    if f == 'bathtub':
        drainage = par['drainage']
        par_str = 'c=%g d=%g w=%g' % (capacity, drainage, fwind)
        X, cmap = bathtub_set(mydata, utils.bathtub_, capacity=capacity,
                              drainage=drainage, fwind=fwind)
    elif f == 'bathtub_geom':
        drainage_factor = par['drainage_factor']
        par_str = 'c=%g d_f=%g w=%g' % (capacity, drainage_factor, fwind)
        X, cmap = bathtub_set(mydata, utils.bathtub_geom_, capacity=capacity,
                              drainage_factor=drainage_factor, fwind=fwind)
    else:
        # unknown model label: there is nothing to plot, so skip instead of falling
        # through with an undefined (or stale) par_str and soil column
        print("Uh")
        continue

    mydata['soil_moisture'] = X[:, 0]
    sns.scatterplot(data=mydata,
                    y='nrides', x='soil_moisture',
                    marker='o',
                    ax=ax[vi]).set_title('%s score=%.2f %s' % (name, score, par_str))

In [None]:
# Ride count against rain_mm, all segments pooled
sns.scatterplot(x='rain_mm', y='nrides', data=df)

In [None]:
# Ride counts of the last two weeks: 'nrides_raw' in the upper panel, 'rides' in the lower one
mydf = utils.get_ridelogs()

today = date.today()
epoch = today - timedelta(weeks=2)
fig, ax = plt.subplots(2, 1, figsize=(8,8))

# NOTE(review): epoch is a datetime.date; if 'date' is a datetime64 column, recent pandas
# versions may refuse this comparison — confirm the dtype of mydf['date']
for axis, col in zip(ax, ['nrides_raw', 'rides']):
    recent = mydf[['date', col]].query("date >= @epoch").set_index('date')
    sns.lineplot(data=recent, ci='sd', ax=axis)
#plt.xticks(rotation=-45)
# the pyplot call acts on the current axes only
plt.xticks(rotation=-45)

# keep the cell from echoing the return value of the last call
None

In [None]:
# Disabled cell (the `if False:` guard means it never runs): share of rides per weekday
# as a stacked bar chart with one bar per segment.
# `by_dow` is not defined in the visible part of this notebook; enabling the cell as-is
# would raise a NameError unless it is created elsewhere.
if False:
    data=by_dow.reset_index()
    #sns.barplot(data=data, hue='segment_id', x='rides_dow', y='weekday', orient='h')
    #data.plot.barh()
    # weekday x segment table, each segment column rescaled to percentages summing to 100
    data = data.pivot_table(index='weekday', columns='segment_id', values='rides_dow').apply(lambda x: x*100/sum(x), axis=0)
    data.T.plot(kind="bar", stacked=True)
    data

In [None]:
# Rows dated after yesterday, lowest raw ride count first.
# `today` comes from the cell that plots the last two weeks.
epoch = today - timedelta(days=1)
df.query("date > @epoch").sort_values(by='nrides_raw')