In [None]:
# Model the usage of MTB trails as a function of weather conditions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import utils
import math
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Gather data: load the per-segment metadata table through the project helper.
# Later cells rely on its 'id', 'name' and 'closest_ims' columns.
md = utils.get_segment_metadata()
# Disabled: integer cast of 'closest_ims', kept for reference.
#md['closest_ims'] = md['closest_ims'].astype(int)

In [None]:
# Ride logs, as returned by the project helper
rl_ = utils.get_ridelogs()

# Keep only the metadata columns that the rest of the notebook uses
md = md.loc[:, ['id', 'name', 'closest_ims']].copy()

In [None]:
# Work on a copy so the ride logs loaded into rl_ stay untouched
d5 = rl_.copy()

In [None]:
# Attach each segment's closest IMS station (and its name) to the ride logs.
# This is a right join: every metadata row is kept, even one that has no
# matching ride log.
d6 = pd.merge(
    d5,
    md[['id', 'closest_ims', 'name']],
    how='right',
    left_on='segment_id',
    right_on='id',
)

In [None]:
# Rain records for the merged frame, obtained from the project helper; the
# next cell joins them back on 'closest_ims' and 'date'.
rain_days = utils.get_rain_days(d6)

In [None]:
# Attach the rain measurements by station and date, then order the rows by
# date so the cumulative rainfall measures below see them in sequence.
data = (
    d6.merge(rain_days, how='left', on=['closest_ims', 'date'])
      .sort_values('date')
)

In [None]:

# Rainfall summed over the 7 most recent rows of each segment (missing values
# count as 0). The window is 7 *rows*; it equals 7 days only if every segment
# has exactly one row per consecutive day -- TODO confirm.
#
# transform() (rather than apply()) guarantees the result is indexed like
# `data`. With pandas >= 2.0, groupby(...).apply(...) prepends the group key
# to the index, and the column assignment then fails on the mismatched index.
data['rain_7d'] = (
    data.fillna(0)
        .groupby('segment_id')['rain_mm']
        .transform(lambda x: x.rolling(7).sum())
)
#data['soil_moisture'] = data.groupby('segment_id')['rain_mm'].apply(utils.bathtub)

# Snapshot of the prepared frame; the modelling cells start from a copy of it
df_orig = data.copy()

In [None]:
def bathtub_set(data_, soilmodel, **kwargs):
    """Run a soil-moisture model over the rainfall column of ``data_``.

    Parameters
    ----------
    data_ : frame-like with a ``'rain_mm'`` column exposing ``.values``
    soilmodel : callable taking the rainfall array as its first argument
    **kwargs : forwarded unchanged to ``soilmodel`` (model parameters)

    Returns
    -------
    Whatever ``soilmodel`` returns for that rainfall array.
    """
    rainfall = data_['rain_mm'].values
    return soilmodel(rainfall, **kwargs)

def regress(X, y):
    """Fit a one-variable linear regression of ``y`` on ``X``.

    Pairs containing a missing value are dropped first. The fit is reported
    as unusable (score ``-1``) when:

    * two or fewer pairs remain,
    * the smallest ``X`` is above 1 (the moisture model produced no
      dry-soil examples), or
    * the fitted slope is not negative (more moisture must mean fewer rides).

    Returns
    -------
    dict with keys ``'coef'``, ``'intercept'`` and ``'score'``. ``'coef'``
    and ``'intercept'`` are ``None`` when no fit was attempted or the fit
    raised ``ValueError``.
    """
    no_fit = dict(coef=None, intercept=None, score=-1)

    paired = pd.DataFrame(data={'X' : X, 'y' : y}).dropna()
    n_obs = len(paired)

    # too little data, or no dry-soil observations to anchor the line
    if n_obs <= 2 or paired['X'].min() > 1:
        return no_fit

    # scikit-learn expects column vectors
    features = paired['X'].values.reshape(n_obs, 1)
    target = paired['y'].values.reshape(n_obs, 1)

    try:
        model = LinearRegression().fit(features, target)
        slope = model.coef_[0][0]
        fit_quality = model.score(features, target)
        # reject fits where rides do not fall as moisture rises
        if slope >= 0:
            fit_quality = -1
        return {'coef' : slope, 'intercept' : model.intercept_[0], 'score' : fit_quality}
    except ValueError:   # probably not enough data
        return no_fit

def best_bathtub(data_):
    """Grid-search both soil-moisture models and return the best-scoring fit.

    For every parameter combination the moisture series is computed with
    ``bathtub_set`` and regressed against ``data_['nrides']`` with
    ``regress``. The geometric model (``utils.bathtub_geom_``) is tried
    first, then the basic model (``utils.bathtub_``); on tied scores the
    earliest combination wins.

    Parameters
    ----------
    data_ : frame with ``'rain_mm'`` and ``'nrides'`` columns (one segment)

    Returns
    -------
    pandas.Series with ``'score'`` and ``'par'``; ``'par'`` is a dict holding
    the model name (``'f'``), its parameters, and the fitted ``'coef'`` and
    ``'intercept'``.
    """
    segment = data_.copy()
    rides = segment['nrides']
    trials = []

    # Capacities tried for both models: 1 to 9.5 in steps of 0.5, then
    # 10 to 78 in steps of 4.
    capacities = [*np.arange(1, 10, 0.5), *range(10, 80, 4)]

    # Geometric model: drainage factors from 0.5 up to (excluding) 1, step 0.05
    # TODO:
    # 1. downsample the observations with no moisture and many rides
    geom_factors = [*np.arange(0.5, 1, 0.05)]
    for cap in capacities:
        for factor in geom_factors:
            moisture = bathtub_set(segment, utils.bathtub_geom_,
                                   capacity=cap, drainage_factor=factor)
            fit = regress(moisture, rides)
            trials.append([
                fit['score'],
                {'f': 'bathtub_geom', 'capacity': cap, 'drainage_factor' : factor,
                 'coef': fit['coef'], 'intercept' : fit['intercept']},
            ])

    # Basic model: drainage from 5 to 9.5 in steps of 0.5, then 10 to 24
    # TODO:
    # 1. downsample the observations with no moisture and many rides
    basic_drainages = [*np.arange(5, 10, 0.5), *range(10, 25)]
    for cap in capacities:
        for drain in basic_drainages:
            moisture = bathtub_set(segment, utils.bathtub_,
                                   capacity=cap, drainage=drain)
            fit = regress(moisture, rides)
            trials.append([
                fit['score'],
                {'f': 'bathtub', 'capacity': cap, 'drainage' : drain,
                 'coef': fit['coef'], 'intercept' : fit['intercept']},
            ])

    ranking = pd.DataFrame(trials, columns=['score', 'par'])
    winner = ranking['score'].idxmax()
    # fall back to the first trial when no maximum could be determined
    return ranking.iloc[0 if math.isnan(winner) else winner]

In [None]:
# Fit the best soil-moisture model separately for every segment and collect
# one result record per segment in `out`.
df = df_orig.copy()
out = []
for seg in df['segment_id'].unique():
    print(seg, end="...")   # progress trace, one entry per segment
    mydata = df[df['segment_id'] == seg]
    res = {**best_bathtub(mydata).to_dict(), 'segment_id': seg}
    out.append(res)
print("")

In [None]:
params = pd.DataFrame(out)
# Days to dry (dtd), derived from the winning model's parameters
params['dtd'] = None

# One column per entry of the 'par' dicts. A parameter that none of the
# winning models uses would be absent altogether, so make sure both drainage
# columns exist (as NaN) before they are read.
exploded = pd.DataFrame.from_records(params['par'].tolist())
exploded = exploded.reindex(
    columns=exploded.columns.union(['drainage', 'drainage_factor'], sort=False)
)

# compute for the additive model: capacity divided by the drainage
rows = (exploded['f'] == 'bathtub')
if rows.any():
    params.loc[rows, 'dtd'] = (
        exploded.loc[rows, 'capacity'] / exploded.loc[rows, 'drainage']
    )

# compute for the geometric model
# how many times to multiply by the factor until we reach a value of 1mm?
# capacity * factor**n == 1  =>  n = -log(capacity) / log(factor)
rows = (exploded['f'] == 'bathtub_geom')
if rows.any():
    params.loc[rows, 'dtd'] = (
        -np.log(exploded.loc[rows, 'capacity'])
        / np.log(exploded.loc[rows, 'drainage_factor'])
    )

# add the segment metadata and persist the fitted parameters
params = params.merge(md, how='left', left_on='segment_id', right_on='id')
params[['segment_id', 'score', 'par']].to_csv('data/segments/params.csv', index=False)

params.sort_values('score')

In [None]:
# The style must be set before the axes are created: it works through
# matplotlib rcParams, which are read when an Axes is constructed.
sns.set_style('ticks')
fig, ax = plt.subplots(2, 1, figsize=(8,8))

exploded = pd.DataFrame.from_records(params['par'].tolist())
# A parameter that none of the winning models uses would be missing as a
# column; add it as NaN so the selections below never raise.
exploded = exploded.reindex(
    columns=exploded.columns.union(['drainage', 'drainage_factor'], sort=False)
)
exploded['score'] = params['score']

# plot for the additive model
rows = (exploded['f'] == 'bathtub')
ax[0].set_title('Additive model (bathtub)')
if rows.any():
    sns.scatterplot(data=exploded[rows], x='capacity', y='drainage', size='score', hue='score', ax=ax[0])

# plot for the geometric model
rows = (exploded['f'] == 'bathtub_geom')
ax[1].set_title('Geometric model (bathtub_geom)')
if rows.any():
    sns.scatterplot(data=exploded[rows], x='capacity', y='drainage_factor', size='score', hue='score', ax=ax[1])




In [None]:
# One panel per segment: ride counts against the soil moisture produced by
# that segment's best-scoring model.
names = df['segment_id'].unique()
# squeeze=False keeps `ax` two-dimensional even when there is a single
# segment (otherwise subplots() returns a bare Axes that cannot be indexed).
fig, ax = plt.subplots(figsize=(10,50), nrows=len(names), ncols=1, squeeze=False)

for vi, seg in enumerate(names):
    mydata = df.query("segment_id == @seg").copy()
    p = params.query("id == @seg")
    if len(p) == 0:
        # no fitted parameters for this segment: leave its panel empty
        continue

    row = p.iloc[0]
    f = row.par['f']
    name = mydata.iloc[0]['name']
    score = row['score']
    if f == 'bathtub':
        capacity = row.par['capacity']
        drainage = row.par['drainage']
        par_str = 'c=%g d=%g' % (capacity, drainage)
        mydata['soil_moisture'] = bathtub_set(mydata, utils.bathtub_, capacity=capacity, drainage=drainage)
    elif f == 'bathtub_geom':
        capacity = row.par['capacity']
        drainage_factor = row.par['drainage_factor']
        par_str = 'c=%g d_f=%g' % (capacity, drainage_factor)
        mydata['soil_moisture'] = bathtub_set(mydata, utils.bathtub_geom_, capacity=capacity, drainage_factor=drainage_factor)
    else:
        # Unknown model name: there is no moisture column to plot, so skip
        # the panel instead of failing on (or reusing) par_str.
        print("Uh")
        continue

    sns.scatterplot(data=mydata,
                    y='nrides', x='soil_moisture',
                    marker='o',
                    ax=ax[vi, 0]).set_title('%s score=%.2f %s' % (name, score, par_str))

In [None]:
# Ride counts (nrides) against rainfall (rain_mm), all segments pooled
sns.scatterplot(data = df, y ='nrides', x='rain_mm')

In [None]:
# Ride counts over time: nrides indexed by date, with ci='sd' requesting a
# standard-deviation band around the line.
# NOTE(review): recent seaborn releases deprecate `ci` in favour of
# errorbar='sd' -- confirm against the installed version before changing.
sns.lineplot(data=df[['date','nrides']].set_index('date'), ci='sd')
#plt.xticks(rotation=-45)

In [None]:
# Disabled cell (guarded by `if False`): stacked bar chart with one bar per
# segment, split by weekday, where each segment's 'rides_dow' values are
# rescaled to percentages of that segment's total.
# NOTE(review): `by_dow` is not defined in the cells visible here, so this
# would fail if enabled as-is; it would also rebind the name `data`.
if False:
    data=by_dow.reset_index()
    #sns.barplot(data=data, hue='segment_id', x='rides_dow', y='weekday', orient='h')
    #data.plot.barh()
    # weekday x segment table, each column scaled so it sums to 100
    data = data.pivot_table(index='weekday', columns='segment_id', values='rides_dow').apply(lambda x: x*100/sum(x), axis=0)
    data.T.plot(kind="bar", stacked=True)
    data