# The `group_by_dynamic` window

In [1]:
from datetime import datetime

import polars as pl

In [2]:
# Demo frame: one timestamp per hour between `start` and `stop`, plus an explicit
# row number so it is easy to see which rows end up in which window.
start = datetime(2022, 1, 1)
stop = datetime(2022, 1, 2)

df = pl.DataFrame(
    {"date": pl.datetime_range(start, stop, interval="1h", eager=True)}
).with_row_index(name="index")
df.head(5)

index,date
u32,datetime[μs]
0,2022-01-01 00:00:00
1,2022-01-01 01:00:00
2,2022-01-01 02:00:00
3,2022-01-01 03:00:00
4,2022-01-01 04:00:00


In [3]:
# Inspect the sortedness flags polars tracks for the `date` column.
df.get_column("date").flags

{'SORTED_ASC': True, 'SORTED_DESC': False}

### Specifying the window with `group_by_dynamic`

- `every`: how often a window starts
- `period`: how long a window lasts
- `offset`: when does the first window start

The first window starts at midnight by default.

In [7]:
# Count the rows in each two-hour window. `include_boundaries=True` adds the
# `_lower_boundary` / `_upper_boundary` columns so the span of every window is
# visible; note that it makes parallelism more difficult.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        include_boundaries=True,
    )
    .agg([pl.col("index").count()])
    .head(5)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],u32
2022-01-01 00:00:00,2022-01-01 02:00:00,2022-01-01 00:00:00,2
2022-01-01 02:00:00,2022-01-01 04:00:00,2022-01-01 02:00:00,2
2022-01-01 04:00:00,2022-01-01 06:00:00,2022-01-01 04:00:00,2
2022-01-01 06:00:00,2022-01-01 08:00:00,2022-01-01 06:00:00,2
2022-01-01 08:00:00,2022-01-01 10:00:00,2022-01-01 08:00:00,2


In [None]:
# A new window starts every two hours (`every`) but each one lasts four hours
# (`period`), so consecutive windows overlap.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        period="4h",
        include_boundaries=True,
    )
    .agg([pl.col("index").count()])
    .head(5)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],u32
2022-01-01 00:00:00,2022-01-01 04:00:00,2022-01-01 00:00:00,4
2022-01-01 02:00:00,2022-01-01 06:00:00,2022-01-01 02:00:00,4
2022-01-01 04:00:00,2022-01-01 08:00:00,2022-01-01 04:00:00,4
2022-01-01 06:00:00,2022-01-01 10:00:00,2022-01-01 06:00:00,4
2022-01-01 08:00:00,2022-01-01 12:00:00,2022-01-01 08:00:00,4


In [None]:
# Same overlapping windows, but `offset` shifts every window start by one hour.
# The first window in the output therefore begins at 23:00 on the previous day
# and holds only three rows.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        period="4h",
        offset="1h",
        include_boundaries=True,
    )
    .agg([pl.col("index").count()])
    .head(5)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],u32
2021-12-31 23:00:00,2022-01-01 03:00:00,2021-12-31 23:00:00,3
2022-01-01 01:00:00,2022-01-01 05:00:00,2022-01-01 01:00:00,4
2022-01-01 03:00:00,2022-01-01 07:00:00,2022-01-01 03:00:00,4
2022-01-01 05:00:00,2022-01-01 09:00:00,2022-01-01 05:00:00,4
2022-01-01 07:00:00,2022-01-01 11:00:00,2022-01-01 07:00:00,4


The `offset` parameter only works with values less than or equal to `every` as it slides the starting position *within* a window. 

So if `every="2h"` passing `offset="3h"` is equivalent to passing `offset="1h"`. 

In [None]:
# Drop the midnight row first so the earliest datapoint is 01:00; with the
# one-hour offset the first window then starts exactly on that datapoint.
(
    df.filter(pl.col("date") >= datetime(2022, 1, 1, 1))
    .group_by_dynamic(
        index_column="date",
        every="2h",
        period="4h",
        offset="1h",
        include_boundaries=True,
    )
    .agg([pl.col("index").count()])
    .head(3)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],u32
2022-01-01 01:00:00,2022-01-01 05:00:00,2022-01-01 01:00:00,4
2022-01-01 03:00:00,2022-01-01 07:00:00,2022-01-01 03:00:00,4
2022-01-01 05:00:00,2022-01-01 09:00:00,2022-01-01 05:00:00,4


`offset` can be negative

In [12]:
# A negative offset moves the window starts earlier: here each window begins
# ten minutes before the even hour.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        period="4h",
        offset="-10m",
        include_boundaries=True,
    )
    .agg([pl.col("index").count()])
    .head(3)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],u32
2021-12-31 23:50:00,2022-01-01 03:50:00,2021-12-31 23:50:00,4
2022-01-01 01:50:00,2022-01-01 05:50:00,2022-01-01 01:50:00,4
2022-01-01 03:50:00,2022-01-01 07:50:00,2022-01-01 03:50:00,4


### Closure and boundaries of windows
By default the windows are closed on the `left`: datetimes on the left boundary are included, while datetimes on the right boundary are not.

In [13]:
# Collect the row indices of each window (no aggregation function, so the
# result is a list column) to show which rows the default closure includes.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        include_boundaries=True,
    )
    .agg([pl.col("index")])
    .head(5)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],list[u32]
2022-01-01 00:00:00,2022-01-01 02:00:00,2022-01-01 00:00:00,"[0, 1]"
2022-01-01 02:00:00,2022-01-01 04:00:00,2022-01-01 02:00:00,"[2, 3]"
2022-01-01 04:00:00,2022-01-01 06:00:00,2022-01-01 04:00:00,"[4, 5]"
2022-01-01 06:00:00,2022-01-01 08:00:00,2022-01-01 06:00:00,"[6, 7]"
2022-01-01 08:00:00,2022-01-01 10:00:00,2022-01-01 08:00:00,"[8, 9]"


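With `closed="both"` the right boundary is included as well, so a row that sits on a boundary appears in two adjacent windows.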

In [14]:
# Close the windows on both sides: rows on a window's upper boundary are now
# part of that window too, as the index lists in the output show.
(
    df.group_by_dynamic(
        index_column="date",
        every="2h",
        include_boundaries=True,
        closed="both",
    )
    .agg([pl.col("index")])
    .head(5)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],list[u32]
2022-01-01 00:00:00,2022-01-01 02:00:00,2022-01-01 00:00:00,"[0, 1, 2]"
2022-01-01 02:00:00,2022-01-01 04:00:00,2022-01-01 02:00:00,"[2, 3, 4]"
2022-01-01 04:00:00,2022-01-01 06:00:00,2022-01-01 04:00:00,"[4, 5, 6]"
2022-01-01 06:00:00,2022-01-01 08:00:00,2022-01-01 06:00:00,"[6, 7, 8]"
2022-01-01 08:00:00,2022-01-01 10:00:00,2022-01-01 08:00:00,"[8, 9, 10]"


## Setting the window boundaries

In [15]:
# With a 55-minute step the window starts no longer line up with midnight:
# the first window shown starts at 23:45 on the previous day.
(
    df.group_by_dynamic(index_column="date", every="55m")
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2021-12-31 23:45:00,[0]
2022-01-01 00:40:00,[1]
2022-01-01 01:35:00,[2]
2022-01-01 02:30:00,[3]
2022-01-01 03:25:00,[4]


Use `offset` to set the start of the first window to 00:00:00

In [16]:
# Shifting the 55-minute windows forward by 15 minutes moves the first window
# start from 23:45 to 00:00:00.
(
    df.group_by_dynamic(index_column="date", every="55m", offset="15m")
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2022-01-01 00:00:00,[0]
2022-01-01 00:55:00,[1]
2022-01-01 01:50:00,[2]
2022-01-01 02:45:00,[3]
2022-01-01 03:40:00,[4]


## Controlling the displayed datetime

In [17]:
# Show only the first window together with its boundaries: by default the
# `date` label equals the lower boundary of the window.
(
    df.group_by_dynamic(
        index_column="date",
        every="55m",
        include_boundaries=True,
    )
    .agg([pl.col("index")])
    .head(1)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],list[u32]
2021-12-31 23:45:00,2022-01-01 00:40:00,2021-12-31 23:45:00,[0]


Use `label` argument to control what datetime value is used to label the window.

- `label = "left"` uses the lower bound of the window
- `label = "right"` uses the upper bound of the window
- `label = "datapoint"` uses the first datapoint in the window

In [19]:
# `label="right"` makes the `date` column report the upper boundary of the
# window instead of the lower one; the window contents are unchanged.
(
    df.group_by_dynamic(
        index_column="date",
        every="55m",
        include_boundaries=True,
        label="right",
    )
    .agg([pl.col("index")])
    .head(1)
)

_lower_boundary,_upper_boundary,date,index
datetime[μs],datetime[μs],datetime[μs],list[u32]
2021-12-31 23:45:00,2022-01-01 00:40:00,2022-01-01 00:40:00,[0]


## Exercises

### Exercise 1
Create a `DataFrame` that runs over 2020 at 2 minute intervals. 

Add a column for the row count

In [21]:
# Exercise 1 setup: `start`/`stop` span the whole of 2020 and are reused by the
# following cells. Build one row every two minutes and add a row-number column.
start = datetime(2020, 1, 1, 0, 0, 0)
stop = datetime(2020, 12, 31, 23, 59, 59)

(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="2m", eager=True)}
    )
    .with_row_index()
    .head(5)
)

index,date
u32,datetime[μs]
0,2020-01-01 00:00:00
1,2020-01-01 00:02:00
2,2020-01-01 00:04:00
3,2020-01-01 00:06:00
4,2020-01-01 00:08:00


Do a dynamic groupby with windows that start every hour and last one hour. 

Aggregate the `index` column into the list of row indices for each window

In [24]:
# One-hour windows starting every hour; each window collects the indices of the
# rows that fall inside it.
(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="2m", eager=True)}
    )
    .with_row_index()
    .group_by_dynamic(index_column="date", every="1h")
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2020-01-01 00:00:00,"[0, 1, … 29]"
2020-01-01 01:00:00,"[30, 31, … 59]"
2020-01-01 02:00:00,"[60, 61, … 89]"
2020-01-01 03:00:00,"[90, 91, … 119]"
2020-01-01 04:00:00,"[120, 121, … 149]"


Do a dynamic groupby again with windows that start every hour and last two hours. 

In [25]:
# Windows still start every hour but now last two hours, so each row index
# shows up in two consecutive windows.
(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="2m", eager=True)}
    )
    .with_row_index()
    .group_by_dynamic(index_column="date", every="1h", period="2h")
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2020-01-01 00:00:00,"[0, 1, … 59]"
2020-01-01 01:00:00,"[30, 31, … 89]"
2020-01-01 02:00:00,"[60, 61, … 119]"
2020-01-01 03:00:00,"[90, 91, … 149]"
2020-01-01 04:00:00,"[120, 121, … 179]"


Offset the start of the first window to 30 minutes *before* midnight

In [27]:
# A negative offset of 30 minutes makes the first window start at 23:30 on
# 2019-12-31, i.e. half an hour before the first datapoint.
(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="2m", eager=True)}
    )
    .with_row_index()
    .group_by_dynamic(
        index_column="date",
        every="1h",
        period="2h",
        offset="-30m",
    )
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2019-12-31 23:30:00,"[0, 1, … 44]"
2020-01-01 00:30:00,"[15, 16, … 74]"
2020-01-01 01:30:00,"[45, 46, … 104]"
2020-01-01 02:30:00,"[75, 76, … 134]"
2020-01-01 03:30:00,"[105, 106, … 164]"


Adapt the earlier steps to:
- create the `DataFrame` over 2020 again but this time at **7 minute intervals**
- add a row count column
- do a groupby with one-hour windows
- set the displayed date for each window to be the first datapoint in the window

In [30]:
# Seven-minute data does not line up with the hourly window starts, so
# `label="datapoint"` is used to label each window with its first datapoint
# (e.g. 01:03 rather than 01:00 in the output).
(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="7m", eager=True)}
    )
    .with_row_index()
    .group_by_dynamic(index_column="date", every="1h", label="datapoint")
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2020-01-01 00:00:00,"[0, 1, … 8]"
2020-01-01 01:03:00,"[9, 10, … 17]"
2020-01-01 02:06:00,"[18, 19, … 25]"
2020-01-01 03:02:00,"[26, 27, … 34]"
2020-01-01 04:05:00,"[35, 36, … 42]"


Set the windows of this `DataFrame` to be closed on the right

In [31]:
# Same hourly windows, now closed on the right. In the output the very first
# row (exactly 00:00:00) ends up in a window of its own, and the next window
# starts with the 00:07 datapoint.
(
    pl.DataFrame(
        {"date": pl.datetime_range(start, stop, interval="7m", eager=True)}
    )
    .with_row_index()
    .group_by_dynamic(
        index_column="date",
        every="1h",
        label="datapoint",
        closed="right",
    )
    .agg([pl.col("index")])
    .head(5)
)

date,index
datetime[μs],list[u32]
2020-01-01 00:00:00,[0]
2020-01-01 00:07:00,"[1, 2, … 8]"
2020-01-01 01:03:00,"[9, 10, … 17]"
2020-01-01 02:06:00,"[18, 19, … 25]"
2020-01-01 03:02:00,"[26, 27, … 34]"


### Exercise 2
Create the query to generate the following optimized plan with a groupby window that is one week long

Note that the `group_by_dynamic` arguments do not appear in the optimized plan

```python
SORT BY [col("mean")]
  AGGREGATE
  	[col("trip_distance").count().alias("count"), col("trip_distance").mean().alias("mean"), col("trip_distance").max().alias("max")] BY [] FROM
     WITH_COLUMNS:
     [col("pickup").set_sorted()]

        Csv SCAN ../data/nyc_trip_data_1k.csv
        PROJECT 2/7 COLUMNS
```

In [34]:
csv_file = "data/nyc_trip_data_1k.csv"

# Exercise 2: build the lazy query whose optimized plan is shown above.
# The exercise asks for windows that are one week long, so `every` is "1w".
# The window arguments do not appear in the printed plan, which means a wrong
# value here cannot be spotted from the `explain()` output alone.
# `print` is needed so the plan's newlines are rendered instead of escaped.
print(
    pl.scan_csv(csv_file, try_parse_dates=True)
    .group_by_dynamic(
        # The window column must be marked as sorted before it can be grouped on.
        pl.col("pickup").set_sorted(),
        every="1w",
    )
    .agg(
        pl.col("trip_distance").count().alias("count"),
        pl.col("trip_distance").mean().alias("mean"),
        pl.col("trip_distance").max().alias("max"),
    )
    .sort("mean")
    .explain()
)

SORT BY [col("mean")]
  AGGREGATE[maintain_order: false]
    [col("trip_distance").count().alias("count"), col("trip_distance").mean().alias("mean"), col("trip_distance").max().alias("max")] BY []
    FROM
     WITH_COLUMNS:
     [col("pickup").set_sorted()] 
      Csv SCAN [data/nyc_trip_data_1k.csv]
      PROJECT 2/7 COLUMNS
      ESTIMATED ROWS: 984


Evaluate the full query and inspect the data. 

Modify the query so the first date is 2022-01-01 00:00:00.

You will need to `collect()` the query to view the data for the second point.

In [45]:
# Weekly windows shifted back by two days with `offset`, so that a window starts
# on 2022-01-01 00:00:00 (the first label in the collected result).
# NOTE(review): `period="5d"` is shorter than `every="1w"`, so each window appears
# to cover only five of its seven days - confirm that leaving the remaining rows
# out of the statistics is intended.
(
    pl.scan_csv(csv_file, try_parse_dates=True)
    .group_by_dynamic(
        pl.col("pickup").set_sorted(),
        every="1w",
        period="5d",
        offset="-2d",
    )
    .agg(
        [
            pl.col("trip_distance").count().alias("count"),
            pl.col("trip_distance").mean().alias("mean"),
            pl.col("trip_distance").max().alias("max"),
        ]
    )
    .collect()
)

pickup,count,mean,max
datetime[μs],u32,f64,f64
2022-01-01 00:00:00,374,3.712781,24.75
2022-01-08 00:00:00,364,3.516896,70.78
2022-01-29 00:00:00,1,2.62,2.62
