In [2]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from prophet import Prophet
from datetime import datetime, timedelta

In [3]:
pd.options.plotting.backend = 'plotly'

In [4]:
def clean_fig(fig):
    """Apply the notebook's shared look to a figure and return it.

    Calls ``fig.update_layout`` with a white plot background, a horizontally
    centred title and Arial for both body and title fonts, then calls
    ``fig.update_coloraxes`` to hide the colour scale.

    Args:
        fig: Object exposing ``update_layout`` and ``update_coloraxes``
            (the notebook passes Plotly figures).

    Returns:
        The same ``fig`` object that was passed in.
    """
    layout_options = {
        'plot_bgcolor': '#FFF',
        'title': {'x': .5},
        'font_family': 'Arial',
        'title_font_family': "Arial",
    }
    fig.update_layout(**layout_options)
    fig.update_coloraxes(showscale=False)
    return fig

In [5]:
# The 15 consecutive days, 2022-03-11 through 2022-03-25, that the
# evaluation loops in later cells iterate over.
check_dates = [datetime(2022, 3, day) for day in range(11, 26)]

In [6]:
# Seed NumPy's global generator before any random draw in the notebook.
np.random.seed(42)

# Colour palette referenced by the charts in this and later cells.
black, orange, pink = '#363833', '#c86a3e', '#d38bba'
lgreen, dgreen, white = '#c8d7ab', '#4d5e38', '#f0ece1'

# Ten noisy observations; the one at position `idx` is pushed up by 80 so it
# stands out as the highlighted point.
idx = 6
vals = [np.random.normal(1000, 50) for _ in range(10)]
vals[idx] = vals[idx] + 80

# Jittered band around the sample mean.  The mean is the same on every
# iteration, so it is computed once; the order of random draws (all upper
# values first, then all lower values) is unchanged.
centre = np.mean(vals)
upper = [centre + 100 * np.random.normal(1, .1) for _ in vals]
lower = [centre - 100 * np.random.normal(1, .1) for _ in vals]
x = np.arange(10)

# NOTE(review): the alpha component of the fill colour (20) is outside the
# usual 0-1 range for rgba(); confirm whether 0.2 was intended.
upper_trace = go.Scatter(x=x, y=upper, marker={'color': black}, mode='lines')
lower_trace = go.Scatter(
    x=x, y=lower, marker={'color': black}, fill='tonexty', mode='lines',
    fillcolor='rgba(240,236,225,20)',
)
series_trace = go.Scatter(
    x=x, y=vals, mode='lines+markers',
    line={'width': 3, 'color': dgreen}, marker={'size': 10},
)
highlight_trace = go.Scatter(
    x=[idx], y=[vals[idx]], mode='markers+text',
    marker={'size': 20, 'color': orange},
)
fig = go.Figure([upper_trace, lower_trace, series_trace, highlight_trace])

# NOTE(review): arrow options are passed although showarrow=False, so they
# are presumably unused here.
fig.add_annotation(
    x=idx,
    y=vals[idx] + 30,
    text='<b>Anomaly!</b>',
    showarrow=False,
    arrowhead=2,
    arrowsize=2,
    arrowcolor='black',
    font={'family': 'Times New Roman', 'size': 20, 'color': orange},
)

# Shared styling, then strip tick labels and the legend for a schematic look.
fig = clean_fig(fig)
fig.update_yaxes(showticklabels=False)
fig.update_xaxes(showticklabels=False)
fig.update_layout(showlegend=False)
fig.show()

In [7]:
def generate_df(day_map, n_days=100, start=None):
    """Build a synthetic daily-sessions DataFrame with weekday effects.

    One row is produced per day.  ``nbr_sessions`` is drawn from a normal
    distribution (mean 6000, standard deviation 500) plus a linear trend of
    20 per day, then multiplied by the factor for its weekday (if any) and
    truncated to an integer.  ``mean_seconds`` is drawn from a normal
    distribution (mean 180, standard deviation 10) and ``total_seconds`` is
    the product of the two.

    Random numbers come from NumPy's global generator, in the same order as
    before (one draw per day for the sessions, then one vector draw for the
    mean seconds), so seeding with ``np.random.seed`` gives repeatable data.

    Args:
        day_map: Mapping of weekday name (as returned by
            ``Series.dt.day_name()``, e.g. ``'Thursday'``) to the multiplier
            applied to ``nbr_sessions`` on that weekday.
        n_days: Number of consecutive days to generate (positive integer).
            Defaults to 100.
        start: First date of the series.  Defaults to ``datetime(2022, 1, 1)``.

    Returns:
        DataFrame with columns ``date``, ``nbr_sessions`` (integer),
        ``weekday``, ``mean_seconds`` and ``total_seconds``.
    """
    if start is None:
        start = datetime(2022, 1, 1)

    rows = [
        [start + timedelta(days=offset), np.random.normal(loc=6000, scale=500) + offset * 20]
        for offset in range(n_days)
    ]
    df = pd.DataFrame(rows, columns=['date', 'nbr_sessions'])
    df['weekday'] = df['date'].dt.day_name()

    for day, factor in day_map.items():
        on_day = df['weekday'] == day
        df.loc[on_day, 'nbr_sessions'] = df.loc[on_day, 'nbr_sessions'] * factor

    # Plain column assignment replaces the column, so the integer dtype
    # sticks.  ``df.loc[:, col] = ...`` writes into the existing float column
    # on pandas >= 2.0, which would leave the dtype as float64.
    df['nbr_sessions'] = df['nbr_sessions'].astype(int)
    df['mean_seconds'] = np.random.normal(180, 10, len(df))
    df['total_seconds'] = df['nbr_sessions'] * df['mean_seconds']
    return df

# Wednesday and Thursday are scaled down, Saturday and Sunday scaled up.
df = generate_df(dict(Wednesday=.9, Thursday=.8, Saturday=1.1, Sunday=1.2))
df.head(3)

Unnamed: 0,date,nbr_sessions,weekday,mean_seconds,total_seconds
0,2022-01-01,6269,Saturday,164.493366,1031209.0
1,2022-01-02,8335,Sunday,180.68563,1506015.0
2,2022-01-03,6033,Monday,169.376963,1021851.0


# Notes
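* Built from a normal distribution (mean 6000, standard deviation 500) plus a linear upward trend of 20 sessions per day
* Artificially decreased two weekdays (Wednesday and Thursday), multiplying by 0.9 and 0.8
* Artificially increased two weekdays (Saturday and Sunday), multiplying by 1.1 and 1.2
* Basically impossible to spot by eye, right?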

In [8]:
# Grid colour; later cells read this module-level name as well.
# NOTE(review): the alpha component (10) is outside the usual 0-1 range for
# rgba(); confirm whether 0.1 was intended.
gridcolor = 'rgba(165,160,155,10)'

# Plot the last 50 days of the synthetic series with the y-axis anchored at 0.
sdf = df.iloc[-50:].copy()
fig = go.Figure([
    go.Scatter(x=sdf['date'], y=sdf['nbr_sessions'], line={'color': dgreen}),
])
fig.update_yaxes(range=[0, sdf['nbr_sessions'].max() + 100])
fig = clean_fig(fig)

# Same axis line and grid styling on both axes.
axis_style = dict(showline=True, linewidth=2, linecolor=black, gridcolor=gridcolor)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)
fig.show()

In [9]:
# Distribution of daily sessions per weekday, shown Monday through Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig = px.box(
    df,
    x='weekday',
    y='nbr_sessions',
    category_orders={'weekday': weekday_order},
    color_discrete_sequence=[dgreen],
)
fig = clean_fig(fig)
# `gridcolor` is the module-level colour string assigned in an earlier cell.
fig.update_yaxes(showline=False, linewidth=2, linecolor=black, gridcolor=gridcolor)
fig.show()

In [10]:
def ema_list(values, span=None):
    """Return the running exponential moving average of ``values``.

    The first output equals the first input; each later output is
    ``v * alpha + previous * (1 - alpha)`` with ``alpha = 2 / (span + 1)``.

    Args:
        values: Sequence of numbers, oldest first.
        span: Smoothing span.  When omitted, ``len(values)`` is used.

    Returns:
        List with one smoothed value per input value (empty for empty input).
    """
    if span is None:
        span = len(values)
    alpha = 2 / (span + 1)
    result = []
    for v in values:
        if not result:
            # Seed the recursion with the first observation.
            result.append(v)
        else:
            result.append(v * alpha + result[-1] * (1 - alpha))
    return result


def ema(values, span=None):
    """Return the most recent exponential moving average of ``values``.

    This is the last element of ``ema_list(values, span)``.  The notebook
    previously defined ``ema`` twice, so only the later single-value
    definition was ever in effect; that behaviour is kept here and the
    duplicated loop is shared with ``ema_list``.

    Args:
        values: Non-empty sequence of numbers, oldest first.
        span: Smoothing span.  When omitted, ``len(values)`` is used.

    Returns:
        The final smoothed value.

    Raises:
        ValueError: If ``values`` is empty.
    """
    if len(values) == 0:
        raise ValueError('ema() needs at least one value')
    return ema_list(values, span=span)[-1]

def sma(values):
    """Simple moving average: the arithmetic mean of ``values``."""
    total = sum(values)
    count = len(values)
    return total / count

def calc_beta(values):
    """Fit a straight line to ``values`` by ordinary least squares.

    The x-coordinates are ``0, 1, ..., len(values) - 1``.  The fit is solved
    with ``np.linalg.lstsq`` instead of explicitly inverting the Gram matrix,
    which avoids the singular-matrix failure for a single observation and is
    numerically better behaved.

    Args:
        values: Non-empty one-dimensional sequence of numbers (list, NumPy
            array or pandas Series).

    Returns:
        NumPy array ``[intercept, slope]``.

    Raises:
        ValueError: If ``values`` is empty.
    """
    y = np.asarray(values, dtype=float)
    if y.size == 0:
        raise ValueError('calc_beta() needs at least one value')

    # Design matrix: a column of ones (intercept) next to the positions.
    X_ext = np.c_[np.ones(len(y)), np.arange(len(y))]
    beta, *_ = np.linalg.lstsq(X_ext, y, rcond=None)
    return beta

def ols(values):
    """Evaluate the least-squares line through ``values`` at x = len(values).

    ``calc_beta`` fits the line against x = 0 .. len(values) - 1, so the
    point returned here lies one step beyond the last fitted position.

    NOTE(review): the callers in this notebook pass windows that already
    contain the day being checked; confirm that evaluating at len(values)
    rather than len(values) - 1 is intended.
    """
    intercept, slope = calc_beta(values)
    next_x = len(values)
    return intercept + slope * next_x

def std(values):
    """Population standard deviation of ``values`` (divides by n, not n - 1)."""
    n = len(values)
    mu = sum(values) / n
    squared_deviation = sum((x - mu) ** 2 for x in values)
    return math.sqrt(squared_deviation / n)

def calc_bounds(values, estimator_function, std_multiplier=2):
    """Return an estimate for ``values`` with a symmetric band around it.

    Args:
        values: Sequence of numbers passed to both the estimator and ``std``.
        estimator_function: Callable taking ``values`` and returning the
            central estimate (for example ``sma``, ``ema`` or ``ols``).
        std_multiplier: Band half-width expressed in standard deviations.

    Returns:
        Tuple ``(estimate, lower, upper)`` where ``lower`` and ``upper`` are
        the estimate minus/plus ``std(values) * std_multiplier``.
    """
    centre = estimator_function(values)
    half_width = std(values) * std_multiplier
    return centre, centre - half_width, centre + half_width

# Building a predictor for our expected value
We will compare three approaches:
* Simple Moving Average (SMA)
* Exponential Moving Average (EMA)
* Ordinary Least Squares (OLS)

In [11]:
# Illustrate the OLS estimator: fit a line to the first `br` of the last `r`
# days and mark the fitted value at the end of that window.
r = 15
br = r - 3
sample_df = df[-r:].sort_values('date').copy()
beta = calc_beta(sample_df.nbr_sessions[:br])
bline = [beta[0] + beta[1] * v for v in np.arange(br)]
fig = go.Figure([
    go.Scatter(x=np.arange(r), y=sample_df.nbr_sessions, mode='lines', line={'color': dgreen, 'width': 3}, name='True Values'),
    go.Scatter(x=np.arange(br), y=bline, mode='lines', line={'color': orange, 'width': 3}, name='OLS Line'),
    go.Scatter(x=[br - 1], y=[bline[-1]], marker={'color': orange, 'size': 12}, showlegend=False)
])
fig = clean_fig(fig)
# `gridcolor` is the module-level colour string assigned in an earlier cell.
fig.update_xaxes(showline=True, linewidth=2, linecolor=black, gridcolor=gridcolor)
fig.update_yaxes(showline=True, linewidth=2, linecolor=black, gridcolor=gridcolor)
# The bold markup belongs in the annotation text; the font family must be a
# plain font name (it previously contained the <b> tags by mistake).
fig.add_annotation(
    x=br - 1, y=bline[-1] + 50, text='<b>Expected Value</b>',
    ax=10, ay=-50,
    showarrow=True, arrowhead=2, arrowsize=2, arrowcolor=black,
    font={'family': 'Times New Roman', 'size': 20, 'color': orange}
)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()

In [12]:
# For every check date, take the 15 most recent days up to and including that
# date and record the SMA, EMA and OLS estimates next to the observed value.
# NOTE(review): the window contains the day being evaluated, so each estimate
# has already seen the value it is compared against.
dates, orig, sma_l, ema_l, ols_l = [], [], [], [], []

for d in check_dates:
    subset_df = df.loc[df['date'] <= d].tail(15).copy()
    values = subset_df['nbr_sessions'].values
    dates.append(d)
    orig.append(values[-1])
    for collected, estimator in ((sma_l, sma), (ema_l, ema), (ols_l, ols)):
        collected.append(estimator(values))

# Dashed lines for the three estimators, a solid line for the observations.
estimator_traces = [
    go.Scatter(
        x=dates, y=series, name=label, mode='lines',
        line={'dash': 'dash', 'width': 3, 'color': colour},
    )
    for label, series, colour in (
        ('SMA', sma_l, pink),
        ('EMA', ema_l, dgreen),
        ('OLS', ols_l, lgreen),
    )
]
observed_trace = go.Scatter(
    x=dates, y=orig, name='Nbr Sessions', mode='lines',
    line={'width': 3, 'color': black},
)
fig = go.Figure(estimator_traces + [observed_trace])
fig = clean_fig(fig)
fig.show()

In [13]:
# Inject a synthetic dip of 1500 sessions on one date, then check whether an
# EMA +/- 2 standard deviation band over the trailing 15 days exposes it.
anomaly_df = df.copy()
anomaly_date = '2022-03-17'
anomaly_df.loc[anomaly_df.date == anomaly_date, 'nbr_sessions'] -= 1500

dates, orig, lower, upper = [], [], [], []

for d in check_dates:
    # Trailing window of up to 15 days ending on (and including) `d`.
    subset_df = anomaly_df[anomaly_df.date <= d].tail(15).copy()
    values = subset_df.nbr_sessions.values
    dates.append(subset_df.date.tolist()[-1])
    orig.append(values[-1])
    e = ema(values)
    s = std(values)
    lower.append(e - s * 2)
    upper.append(e + s * 2)

fig = go.Figure([
    # The lower-bound trace was previously labelled 'upper'.
    go.Scatter(x=dates, y=lower, mode='lines', marker={'color': black}, name='lower', showlegend=False),
    go.Scatter(x=dates, y=upper, mode='lines', marker={'color': black}, name='upper', fill='tonexty', fillcolor='rgba(240,236,225,20)', showlegend=False),
    go.Scatter(
        x=dates, y=orig, name='Nbr Sessions', line={'width': 4, 'color': dgreen},
        text=df[df.date.isin(dates)].weekday
        ),
])
fig = clean_fig(fig)
fig.add_annotation(
    x=anomaly_date, y=anomaly_df.loc[anomaly_df.date == anomaly_date, 'nbr_sessions'].values[0], text='<b>Anomaly!</b>',
    ax=-100, ay=-20,
    showarrow=True, arrowhead=2, arrowsize=2, arrowcolor=black,
    font={'family': 'Times New Roman', 'size': 20, 'color': orange}
)
fig.show()

In [14]:
# Same injected dip as before, but each day is compared only against earlier
# occurrences of the same weekday.
anomaly_df = df.copy()
anomaly_df.loc[anomaly_df.date == '2022-03-17', 'nbr_sessions'] -= 1500

dates, orig, lower, upper = [], [], [], []

for d in check_dates:
    # Up to 15 most recent rows that fall on the same weekday as `d`.
    subset_df = anomaly_df.loc[(anomaly_df.date <= d) & (anomaly_df.weekday == d.strftime('%A'))].tail(15).copy()
    values = subset_df.nbr_sessions.values
    dates.append(subset_df.date.tolist()[-1])
    orig.append(values[-1])
    e = ema(values)
    s = std(values)
    lower.append(e - s * 2)
    upper.append(e + s * 2)

fig = go.Figure([
    # The lower-bound trace was previously labelled 'upper'.
    go.Scatter(x=dates, y=lower, mode='lines', marker={'color': black}, name='lower', showlegend=False),
    go.Scatter(x=dates, y=upper, mode='lines', marker={'color': black}, name='upper', fill='tonexty', fillcolor='rgba(240,236,225,20)', showlegend=False),
    go.Scatter(
        x=dates, y=orig, name='Nbr Sessions', line={'width': 4, 'color': dgreen},
        text=df[df.date.isin(dates)].weekday
        ),
])
# NOTE(review): the dip is injected on 2022-03-17 but the annotation points
# at 2022-03-18; confirm that highlighting the following day is intended.
anomaly_date = '2022-03-18'
fig.add_annotation(
    x=anomaly_date, y=anomaly_df.loc[anomaly_df.date == anomaly_date, 'nbr_sessions'].values[0], text='<b>Anomaly!</b>',
    ax=-100, ay=-20,
    showarrow=True, arrowhead=2, arrowsize=2, arrowcolor=black,
    font={'family': 'Times New Roman', 'size': 20, 'color': orange}
)
fig = clean_fig(fig)
fig.show()

In [16]:
# Restrict to the check dates, apply the dip to both the session count and the
# total time (1500 sessions at 180 seconds each), and plot sessions against
# total minutes on two y-axes.
anomaly_df = df.loc[df['date'].isin(check_dates)].copy()
dip_day = anomaly_df['date'] == '2022-03-17'
anomaly_df.loc[dip_day, 'nbr_sessions'] -= 1500
anomaly_df.loc[dip_day, 'total_seconds'] -= 1500 * 180
anomaly_df['total_minutes'] = anomaly_df['total_seconds'] / 60

fig = make_subplots(specs=[[{"secondary_y": True}]])
sessions_trace = go.Scatter(
    x=anomaly_df['date'], y=anomaly_df['nbr_sessions'], mode='lines',
    line={'color': dgreen, 'width': 3}, name='Nbr Sessions',
)
minutes_trace = go.Scatter(
    x=anomaly_df['date'], y=anomaly_df['total_minutes'], mode='lines',
    line={'color': orange, 'width': 3}, name='Total Minutes',
)
fig.add_trace(sessions_trace, secondary_y=False)
fig.add_trace(minutes_trace, secondary_y=True)
fig = clean_fig(fig)
fig.show()

In [17]:
# Same dip in the session count, this time plotted against the per-session
# mean duration, which is left untouched.
anomaly_df = df.loc[df['date'].isin(check_dates)].copy()
anomaly_df.loc[anomaly_df['date'] == '2022-03-17', 'nbr_sessions'] -= 1500

fig = make_subplots(specs=[[{"secondary_y": True}]])
sessions_trace = go.Scatter(
    x=anomaly_df['date'], y=anomaly_df['nbr_sessions'], mode='lines',
    line={'color': dgreen, 'width': 3}, name='Nbr Sessions',
)
mean_seconds_trace = go.Scatter(
    x=anomaly_df['date'], y=anomaly_df['mean_seconds'], mode='lines',
    line={'color': orange, 'width': 3}, name='Mean Seconds',
)
fig.add_trace(sessions_trace, secondary_y=False)
fig.add_trace(mean_seconds_trace, secondary_y=True)
fig = clean_fig(fig)
fig.show()

In [18]:
# EMA +/- 2 standard deviation band over the trailing 15 days, applied to the
# mean session duration.  No dip is injected into this copy of the data.
anomaly_df = df.copy()

dates, orig, lower, upper = [], [], [], []

for d in check_dates:
    # Trailing window of up to 15 days ending on (and including) `d`.
    subset_df = anomaly_df[anomaly_df.date <= d].tail(15).copy()
    values = subset_df.mean_seconds.values
    dates.append(subset_df.date.tolist()[-1])
    orig.append(values[-1])
    e = ema(values)
    s = std(values)
    lower.append(e - s * 2)
    upper.append(e + s * 2)

fig = go.Figure([
    # The lower-bound trace was previously labelled 'upper'.
    go.Scatter(x=dates, y=lower, mode='lines', marker={'color': black}, name='lower', showlegend=False),
    go.Scatter(x=dates, y=upper, mode='lines', marker={'color': black}, name='upper', fill='tonexty', fillcolor='rgba(240,236,225,20)', showlegend=False),
    go.Scatter(
        x=dates, y=orig, name='Mean Seconds per User', line={'width': 4, 'color': dgreen},
        text=df[df.date.isin(dates)].weekday
        ),
])
fig = clean_fig(fig)
fig.show()

In [19]:
# For every check date, fit Prophet on the preceding rows of a 45-day window
# and record its prediction interval for the window's last day.
# NOTE(review): `anomaly_df` is whatever the previously executed cell left
# behind; in the cell directly above it is a plain copy of `df` with no
# injected dip - confirm that is the intended input.
prophet_columns = {'date': 'ds', 'nbr_sessions': 'y'}
dates, orig, lower, upper = [], [], [], []

for d in check_dates:
    subset_df = (
        anomaly_df.loc[anomaly_df['date'] <= d]
        .tail(45)
        .rename(columns=prophet_columns)
    )
    history = subset_df.iloc[:-1]
    target_day = subset_df.iloc[-1:]

    # `m` and `pred` stay module-level names; a later cell reads `pred`.
    m = Prophet(daily_seasonality=False, yearly_seasonality=False)
    m.fit(history)
    pred = m.predict(target_day[['ds']])

    dates.append(d)
    orig.append(target_day['y'].values[0])
    lower.append(pred['yhat_lower'].values[0])
    upper.append(pred['yhat_upper'].values[0])

DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/vqjs28mh.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/zetiucb3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.7/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=67267', 'data', 'file=/tmp/tmp9s_smn_g/vqjs28mh.json', 'init=/tmp/tmp9s_smn_g/zetiucb3.json', 'output', 'file=/tmp/tmp9s_smn_g/prophet_model7n1awyre/prophet_model-20221103115645.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:56:45 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:56:45 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/p_t1gkp8.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/8_f6jlcc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

In [25]:
# Show the point forecast and interval held in `pred`.
# NOTE(review): `pred` is not defined in this cell; presumably it is the value
# left over from the final iteration of the Prophet loop - confirm the cell
# order, since the execution counts are not consecutive.
pred[['yhat', 'yhat_lower', 'yhat_upper']]

Unnamed: 0,yhat,yhat_lower,yhat_upper
0,7746.833133,7197.334023,8267.534829


In [26]:
# Plot the collected lower/upper bounds as a filled band with the observed
# values on top.  `dates`, `lower`, `upper` and `orig` are the lists filled by
# a previously executed cell.
fig = go.Figure([
    # The lower-bound trace was previously labelled 'upper'.
    go.Scatter(x=dates, y=lower, mode='lines', marker={'color': black}, name='lower', showlegend=False),
    go.Scatter(x=dates, y=upper, mode='lines', marker={'color': black}, name='upper', fill='tonexty', fillcolor='rgba(240,236,225,20)', showlegend=False),
    go.Scatter(
        x=dates, y=orig, name='Nbr Sessions', line={'width': 4, 'color': dgreen},
        text=df[df.date.isin(dates)].weekday
        ),
])
fig = clean_fig(fig)
fig.show()

In [34]:
# Fit Prophet with default settings on everything except the last 15 days,
# predict those 15 days, and plot the forecast over the last 30 observed days.
forecast_df = df.rename(columns={'date': 'ds', 'nbr_sessions': 'y'}).copy()
train_df = forecast_df.iloc[:-15]
holdout_df = forecast_df.iloc[-15:]

m = Prophet()
m.fit(train_df)
pred_df = m.predict(holdout_df)

vis_df = forecast_df.iloc[-30:].copy()

observed_trace = go.Scatter(
    x=vis_df['ds'], y=vis_df['y'], mode='lines',
    line={'color': dgreen, 'width': 3}, name='True Values',
)
forecast_trace = go.Scatter(
    x=pred_df['ds'], y=pred_df['yhat'], mode='lines',
    line={'color': orange, 'width': 3, 'dash': 'dash'}, name='Prophet Forecast',
)
fig = go.Figure([observed_trace, forecast_trace])
fig = clean_fig(fig)
fig.show()

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/3ox6nyk9.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp9s_smn_g/blyf57qj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.7/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=89990', 'data', 'file=/tmp/tmp9s_smn_g/3ox6nyk9.json', 'init=/tmp/tmp9s_smn_g/blyf57qj.json', 'output', 'file=/tmp/tmp9s_smn_g/prophet_modelpj4387nz/prophet_model-20221103131621.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
13:16:21 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
13:16:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
