# About bucketing

This notebook explains why we need bucketing and proposes one approach to keep smartboards responsive while preserving a good understanding of the data.

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go

## Defining time interval

In [None]:
# Both bounds share the same ISO-like layout; the trailing "Z" is matched as a
# literal character, so the resulting datetimes are naive (no tzinfo attached).
_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

# Start and end of the simulated observation window.
_t1, _t2 = (
    datetime.strptime(stamp, _TIMESTAMP_FORMAT)
    for stamp in ("2022-09-14T09:00:00.000000Z", "2022-10-14T09:22:22.000000Z")
)

## Defining time sampling interval

In [None]:
# NOTE: despite its name, `frequency` is used throughout the notebook as the
# sampling *period* in seconds (one sample every 0.5 s), not as a rate in Hz.
frequency = 0.5

# Total length of the observation window.
deltat = _t2 - _t1

# Number of sampling periods that fit in the window (displayed as cell output).
deltat.total_seconds() / frequency

## Creating the time series

In [None]:
# Seeded generator so that "Restart Kernel -> Run All" always rebuilds the same
# curve (the original drew from the unseeded global NumPy state).
rng = np.random.default_rng(42)

# One timestamp every `frequency` seconds, expressed in milliseconds for pandas.
dates = pd.date_range(_t1, _t2, freq=f"{int(1000*frequency)}ms")

# Normalised abscissa in [0, 1], one entry per timestamp.
time = np.linspace(0, 1, dates.size)

# Four scalar draws in [0, 1): angular-speed and phase parameters of the two
# sine components below.
freq1, freq2, offset1, offset2 = rng.random(4)

# Signal = slow sine (amplitude 0.5) + faster sine (amplitude 0.2)
#          + uniform noise in [-0.05, 0.05).
df = pd.DataFrame(
    {
        "date": dates,
        "value": 0.5 * np.sin((time - offset1) * (freq1 * 10 + 10))
        + 0.2 * np.sin((time - offset2) * (freq2 * 20 + 20))
        + 0.1 * (rng.random(dates.size) - 0.5),
    }
)

In [None]:
# (rows, columns) of the generated frame: one row per timestamp in `dates`.
df.shape

## Display time series length and size

In [None]:
# Column dtypes, row count and memory footprint of the full-resolution frame.
df.info()

Plotting the full time series is slow (> 1 sec) and adds no value, since nothing can be made out: there is no point in plotting 5 million points :-)

In [None]:
%%time
plt.plot(df["value"])
plt.ylabel("Value")
plt.xlabel("Date")
plt.title("Random Values")
plt.show()

In [None]:
# Index the frame by timestamp so that `.resample()` can be used below.
# - `pd.date_range` already produced datetime64 values, so no conversion is needed.
# - Guarded and non-inplace: running this cell a second time is a no-op instead
#   of a KeyError on the (by then missing) "date" column.
if "date" in df.columns:
    df = df.set_index("date")

## Downsample the time series

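Compute 3 curves (min, max and mean) instead of one. On this chart we clearly see that the mean alone makes no sense for the end user, since the data is really spread over roughly the [-0.75, 0.75] interval (the summed amplitudes of the generated signal: 0.5 + 0.2 + 0.05).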

In [None]:
# Target number of buckets (i.e. points per curve) to send to the chart.
nb_points = 5_000

# Bucket width in whole seconds: (rows / target points) rows per bucket, each
# row spanning `frequency` seconds. Clamped to 1 s because `int()` truncates and
# a zero-width bucket is not a valid resampling rule.
resampling_rate = max(1, int(df.shape[0] / nb_points * frequency))

print(f"Resampling rate: {resampling_rate}")

# A Timedelta rule is used instead of the "S" offset alias, which is deprecated
# since pandas 2.2.
df_resampler = df["value"].resample(pd.Timedelta(seconds=resampling_rate))

In [None]:
# Collapse every bucket into three statistics: its envelope (min / max) and its
# average. The result has one row per bucket and one column per statistic.
df_small = df_resampler.aggregate(["min", "max", "mean"])

# (number of buckets, 3)
df_small.shape

In [None]:
%%time
fig = go.Figure()

x = df_small.index.to_numpy(copy=True)
y_upper = df_small["max"].to_numpy(copy=True)
y_lower = df_small["min"].to_numpy(copy=True)

fig = go.Figure(
    [
        go.Scatter(
            name="value",
            x=df_small.index,
            y=df_small["mean"],
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="Upper Bound",
            x=df_small.index,
            y=df_small["max"],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=df_small.index,
            y=df_small["min"],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(31, 119, 180, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(yaxis_title="Value", title="Bucketing", hovermode="x")

## Downsample the time series (larger number of points)

Compute 3 curves instead of one. This chart is a bit better but adds little value. Is the increase in render time worth it?

In [None]:
# Ten times more buckets than in the previous section.
nb_points = 50_000

# Bucket width in whole seconds: (rows / target points) rows per bucket, each
# row spanning `frequency` seconds. Clamped to 1 s because `int()` truncates and
# a zero-width bucket is not a valid resampling rule.
resampling_rate = max(1, int(df.shape[0] / nb_points * frequency))

print(f"Resampling rate: {resampling_rate}")

# A Timedelta rule is used instead of the "S" offset alias, which is deprecated
# since pandas 2.2.
df_resampler = df["value"].resample(pd.Timedelta(seconds=resampling_rate))

In [None]:
# Same three per-bucket statistics as before, computed on the finer buckets.
df_medium = df_resampler.aggregate(["min", "max", "mean"])

# (number of buckets, 3)
df_medium.shape

In [None]:
%%time
fig = go.Figure()

x = df_medium.index.to_numpy(copy=True)
y_upper = df_medium["max"].to_numpy(copy=True)
y_lower = df_medium["min"].to_numpy(copy=True)

fig = go.Figure(
    [
        go.Scatter(
            name="value",
            x=df_medium.index,
            y=df_medium["mean"],
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="Upper Bound",
            x=df_medium.index,
            y=df_medium["max"],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=df_medium.index,
            y=df_medium["min"],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(31, 119, 180, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(yaxis_title="Value", title="Bucketing", hovermode="x")

## Zooming in on the time series (and resampling to get approximately 5k points)

In [None]:
# Simulate a zoom: keep only the first 100 000 rows (positional selection) as an
# independent copy of the full-resolution frame.
sub_df = df.iloc[:100_000].copy()

In [None]:
# Same on-screen budget as the first chart, applied to the zoomed window.
nb_points = 5_000

# Bucket width in whole seconds: (rows / target points) rows per bucket, each
# row spanning `frequency` seconds. Clamped to 1 s because `int()` truncates and
# a zero-width bucket is not a valid resampling rule.
resampling_rate = max(1, int(sub_df.shape[0] / nb_points * frequency))

print(f"Resampling rate: {resampling_rate}")

# A Timedelta rule is used instead of the "S" offset alias, which is deprecated
# since pandas 2.2.
sub_df_resampler = sub_df["value"].resample(pd.Timedelta(seconds=resampling_rate))

In [None]:
# Per-bucket envelope (min / max) and average of the zoomed window.
# Note that this rebinds `df_small`, which previously held the full-range buckets.
df_small = sub_df_resampler.aggregate(["min", "max", "mean"])

# (number of buckets, 3)
df_small.shape

In [None]:
%%time
fig = go.Figure()

x = df_small.index.to_numpy(copy=True)
y_upper = df_small["max"].to_numpy(copy=True)
y_lower = df_small["min"].to_numpy(copy=True)

fig = go.Figure(
    [
        go.Scatter(
            name="value",
            x=df_small.index,
            y=df_small["mean"],
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="Upper Bound",
            x=df_small.index,
            y=df_small["max"],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=df_small.index,
            y=df_small["min"],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(31, 119, 180, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(yaxis_title="Value", title="Bucketing", hovermode="x")

### Zooming more
Render time stays constant while the level of detail increases.

In [None]:
# Zoom further: keep only the first 10 000 rows (positional selection) as an
# independent copy of the full-resolution frame.
sub_df = df.iloc[:10_000].copy()

In [None]:
# Maximum number of points we are willing to send to the chart.
nb_points = 5_000

# Fallback used when the window is already small enough to be plotted without
# resampling. The plotting cell reads the "min", "max" and "mean" columns, so
# they must exist here too: with one sample per point, all three equal the raw
# value. A plain copy of `sub_df` (which only has "value") would make the plot
# fail with a KeyError. `assign` returns a new frame and keeps "value" as well.
raw_values = sub_df["value"]
df_small = sub_df.assign(min=raw_values, max=raw_values, mean=raw_values)

In [None]:
%%time
if sub_df.shape[0] > nb_points:
    resampling_rate = int(sub_df.shape[0] / nb_points * frequency)

    print(f"Resampling rate: {resampling_rate}")

    sub_df_resampler = sub_df["value"].resample(f"{resampling_rate}S")

    df_small = sub_df_resampler.agg(["min", "max", "mean"])
    df_small.shape

In [None]:
%%time
fig = go.Figure()

x = df_small.index.to_numpy(copy=True)
y_upper = df_small["max"].to_numpy(copy=True)
y_lower = df_small["min"].to_numpy(copy=True)

fig = go.Figure(
    [
        go.Scatter(
            name="value",
            x=df_small.index,
            y=df_small["mean"],
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="Upper Bound",
            x=df_small.index,
            y=df_small["max"],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=df_small.index,
            y=df_small["min"],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(31, 119, 180, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(yaxis_title="Value", title="Bucketing", hovermode="x")