In [1]:
# Model the usage of MTB trails as a function of weather conditions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import utils


In [2]:
# Load the segment metadata and the ride logs through the project's utils helpers
md = utils.get_segment_metadata()
rl_ = utils.get_ridelogs()

# Trim junk: keep only the id, name and closest_ims columns of the metadata
md = md.loc[:, ['id', 'name', 'closest_ims']].copy()

data/ridelogs/segments-20201124.json
data/ridelogs/202012.json
data/ridelogs/202011.json


In [3]:
# Tabulate ridelog data: one row per date, one column per segment_id.
# pivot_table uses its default aggregation (mean) when a (date, segment_id)
# pair has several effort_count entries.
rl2 = pd.pivot_table(rl_, index='date', values='effort_count', columns='segment_id')
# Replace the index with a DatetimeIndex so that resample() can be used below.
rl2.index = pd.DatetimeIndex(rl2.index.values)

# Resample daily, interpolate missing values, back-fill whatever is still
# missing, and diff against the previous day.
# NOTE(review): presumably effort_count is a running total, so the diff is the
# number of rides per day -- confirm against the ridelog source.
# .bfill() replaces fillna(method='bfill'), which is deprecated in recent pandas.
daily = rl2.resample('1D').interpolate().bfill().diff()

In [None]:
# Prepare the ride table for joining with rainfall data.

# Step 1: wide -> long. Every segment column of `daily` becomes rows of
# (index, segment_id, value).
all_segs = daily.columns
d2 = daily.reset_index()
d3 = pd.melt(d2, id_vars='index', value_vars=all_segs)

# Step 2: give the melted columns meaningful names.
d4 = d3.rename(columns={'index': 'date', 'value': 'rides'})

# Step 3: attach the closest IMS station of each segment (left join on the segment id).
d5 = pd.merge(d4, md[['id', 'closest_ims']], how='left', left_on='segment_id', right_on='id')

# Step 4: fetch the rain days for the rows in d5 via the project helper.
rain_days = utils.get_rain_days(d5)

In [6]:
# Left-join the rain measurements onto the rides by (closest_ims, date).
# NOTE(review): the same merge is repeated at the top of the following cell.
data = d5.merge(rain_days, how='left', on=['closest_ims', 'date'])

In [26]:
# Add rain measurements: left-join on (closest_ims, date) so every ride row is kept.
data = d5.merge(rain_days, how='left', on=['closest_ims', 'date'])

# Cumulative measures of rainfall, computed per segment in date order.
data = data.sort_values('date')

# group_keys=False keeps the result indexed like `data`; without it, newer
# pandas versions prepend the group key to the index and the column
# assignment no longer aligns.
# rain_7d: rolling 7-row sum of rain_mm within each segment; windows that are
# incomplete or contain NaN produce NaN and are then set to 0.
data['rain_7d'] = (
    data.groupby('segment_id', group_keys=False)['rain_mm']
    .apply(lambda x: x.rolling(7).sum())
    .fillna(0)
)
# soil_moisture: utils.bathtub applied to each segment's rain series.
# NOTE(review): assumes utils.bathtub returns a Series indexed like its input -- confirm.
data['soil_moisture'] = (
    data.groupby('segment_id', group_keys=False)['rain_mm']
    .apply(utils.bathtub)
)

# Keep an untouched copy of the feature table for the parameter search below.
df_orig = data.copy()

In [8]:
#data = data_orig.query("segment_id == '7774409'").copy()

In [9]:
#data.groupby('segment_id').sum().sort_values('rides')

In [39]:
def bathtub_set(data, capacity, drainage):
    """Recompute data['soil_moisture'] in place, segment by segment.

    utils.bathtub is applied to the 'rain_mm' series of every 'segment_id'
    group, with `capacity` and `drainage` forwarded as keyword arguments.

    Parameters
    ----------
    data : DataFrame with 'segment_id' and 'rain_mm' columns; modified in place.
    capacity, drainage : forwarded unchanged to utils.bathtub.

    Returns
    -------
    None
    """
    # The bare name `bathtub` is not defined in this notebook; the model lives
    # in utils. group_keys=False keeps the result indexed like `data` so the
    # assignment aligns row by row.
    # NOTE(review): assumes utils.bathtub accepts capacity/drainage keywords
    # and returns a Series indexed like its input -- confirm.
    data['soil_moisture'] = (
        data.groupby('segment_id', group_keys=False)['rain_mm']
        .apply(utils.bathtub, capacity=capacity, drainage=drainage)
    )

def bathtub_set2(mydata, capacity, drainage):
    """Overwrite mydata['soil_moisture'] in place, without grouping.

    The whole 'rain_mm' column is handed to utils.bathtub_ together with
    `capacity` and `drainage`; whatever it returns becomes the new
    'soil_moisture' column. Nothing is returned.
    """
    rain = mydata['rain_mm']
    moisture = utils.bathtub_(rain, capacity=capacity, drainage=drainage)
    mydata['soil_moisture'] = moisture

def best_bathtub(data_, capacities=range(0, 80, 4), drainages=range(0, 25)):
    """Grid-search the bathtub parameters that best track ride counts.

    For every (capacity, drainage) pair, 'soil_moisture' is recomputed on a
    private copy of `data_` via bathtub_set2 and its Spearman rank correlation
    with 'rides' is measured (NaN pairs are omitted). The pair with the
    largest absolute correlation wins.

    Parameters
    ----------
    data_ : DataFrame with 'rides' and 'rain_mm' columns; it is not modified.
    capacities : iterable of capacity values to try (default: 0, 4, ..., 76).
    drainages : iterable of drainage values to try (default: 0, 1, ..., 24).

    Returns
    -------
    Series with entries 'capacity', 'drainage', 'corr' and 'abscorr' for the
    best pair. All entries are NaN when no correlation could be computed
    (for example an empty grid or a constant 'rides' column).
    """
    mydata = data_.copy()
    # bathtub_set2 only rewrites 'soil_moisture', so the rides vector is loop-invariant.
    rides = mydata['rides'].values
    out = []

    for c in capacities:
        for d in drainages:
            bathtub_set2(mydata, c, d)
            corr, _ = spearmanr(rides, mydata['soil_moisture'].values, nan_policy='omit')
            out.append([c, d, corr])

    cdf = pd.DataFrame(out, columns=['capacity', 'drainage', 'corr'])
    cdf['abscorr'] = cdf['corr'].abs()

    # idxmax has no meaningful answer when every correlation is NaN; return an
    # explicit all-NaN row instead of failing on the lookup.
    if cdf['abscorr'].isna().all():
        return pd.Series(float('nan'), index=cdf.columns)

    # idxmax returns an index label, so look the row up by label.
    return cdf.loc[cdf['abscorr'].idxmax()]

In [None]:
# Find the best bathtub parameters separately for every segment.
df = df_orig.copy()
# best_bathtub needs the whole sub-frame of a segment (it reads several
# columns) and returns a Series, so it must go through apply(); agg() is meant
# for reducers that turn a single column into a scalar.
df.groupby('segment_id').apply(best_bathtub)


In [41]:
#df.groupby('segment_id').sum()

In [None]:
# Grid-search the bathtub parameters on `data` as a whole (all segments in one frame).
# NOTE(review): best_bathtub hands the entire 'rain_mm' column to utils.bathtub_
# without grouping by segment_id -- confirm that pooling segments is intended here.
best_bathtub(data)

In [None]:
# Scatter the (capacity, drainage) grid, with marker size and colour driven by
# the correlation columns.
# NOTE(review): `cdf` is only a local variable inside best_bathtub and no
# 'negcorr' column is created anywhere in the visible cells, so this cell
# cannot run on a fresh kernel -- the grid table needs to be exposed (and
# 'negcorr' defined) before plotting.
fig, ax = plt.subplots(figsize=(10,10))
sns.set_style('ticks')
sns.scatterplot(data=cdf, x='capacity', y='drainage', size='negcorr', hue='abscorr', ax=ax)


In [None]:
# Hand-picked (capacity, drainage) pairs to compare visually, one panel each.
v = [[12, 7], [4, 3], [10, 2], [20, 4]]

fig, ax = plt.subplots(figsize=(10,40), nrows=len(v), ncols=1)

for vi, (capacity, drainage) in enumerate(v):
    # bathtub_set takes the frame as its first argument and rewrites
    # data['soil_moisture'] in place for this parameter pair.
    bathtub_set(data, capacity, drainage)
    sns.scatterplot(data=data,
                    y='rides', x='soil_moisture',
                    hue='segment_id', marker='o',
                    ax=ax[vi]).set_title(f'c={capacity} d={drainage}')
