# Adjusting datetimes

In [1]:
from datetime import date,datetime,timedelta

import polars as pl

In [None]:
# Monthly timestamps from 1 January 2020 through 1 April 2020.
start = datetime(2020, 1, 1)
stop = datetime(2020, 4, 1)

monthly_datetimes = pl.datetime_range(start, stop, interval="1mo", eager=True)
df = pl.DataFrame({"datetime": monthly_datetimes})

df

datetime
datetime[μs]
2020-01-01 00:00:00
2020-02-01 00:00:00
2020-03-01 00:00:00
2020-04-01 00:00:00


Adjust a datetime using `pl.duration`, a `datetime.timedelta` or the `dt.offset_by` expression

In [4]:
# The same 1 hour 10 minute shift written three ways; each new column is
# named through a keyword argument rather than `.alias`.
df.with_columns(
    with_duration=pl.col("datetime") + pl.duration(hours=1, minutes=10),
    with_offset_by=pl.col("datetime").dt.offset_by("1h10m"),
    with_timedelta=pl.col("datetime") + timedelta(hours=1, minutes=10),
)

datetime,with_duration,with_offset_by,with_timedelta
datetime[μs],datetime[μs],datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-01-01 01:10:00,2020-01-01 01:10:00,2020-01-01 01:10:00
2020-02-01 00:00:00,2020-02-01 01:10:00,2020-02-01 01:10:00,2020-02-01 01:10:00
2020-03-01 00:00:00,2020-03-01 01:10:00,2020-03-01 01:10:00,2020-03-01 01:10:00
2020-04-01 00:00:00,2020-04-01 01:10:00,2020-04-01 01:10:00,2020-04-01 01:10:00


The main differences between `pl.duration` and `dt.offset_by`:
- `pl.duration` is a fixed amount of time so `pl.duration(days=1)` is 24 hours
- `dt.offset_by` works with the calendar, so on time-zone-aware datetimes `dt.offset_by("1d")` can be 23 or 25 hours long depending on daylight saving time


In [None]:
# Single timestamp at midnight on the day the Europe/London clocks change,
# shifted by "one day" in four different ways for comparison.
before_change = pl.col("datetime_before_clocks_change")

(
    pl.DataFrame({"datetime_before_clocks_change": [datetime(2020, 3, 29, 0)]})
    .with_columns(before_change.dt.convert_time_zone("Europe/London"))
    .with_columns(
        # Calendar-aware offset versus an explicit 24 hour offset.
        offset_by_one_day=before_change.dt.offset_by("1d"),
        offset_by_24_hours=before_change.dt.offset_by("24h"),
        # Fixed-length durations expressed in days and in hours.
        duration_one_day=before_change + pl.duration(days=1),
        duration_24_hours=before_change + pl.duration(hours=24),
    )
)

datetime_before_clocks_change,offset_by_one_day,offset_by_24_hours,duration_one_day,duration_24_hours
"datetime[μs, Europe/London]","datetime[μs, Europe/London]","datetime[μs, Europe/London]","datetime[μs, Europe/London]","datetime[μs, Europe/London]"
2020-03-29 00:00:00 GMT,2020-03-30 00:00:00 BST,2020-03-30 01:00:00 BST,2020-03-30 01:00:00 BST,2020-03-30 01:00:00 BST


The largest unit supported by both `pl.duration` and `datetime.timedelta` is weeks, so neither of them can, for example, move a datetime forward by a calendar month.

Instead, we can use the `dt.offset_by` expression

In [6]:
# Shift every timestamp forward by one calendar month.
df.with_columns(add_month=pl.col("datetime").dt.offset_by("1mo"))

datetime,add_month
datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-02-01 00:00:00
2020-02-01 00:00:00,2020-03-01 00:00:00
2020-03-01 00:00:00,2020-04-01 00:00:00
2020-04-01 00:00:00,2020-05-01 00:00:00


To move a datetime series to a month-end basis we use the `dt.month_end` expression; there is a `dt.month_start` expression as well.

In [8]:
# Step 1: move each timestamp to the last day of its month.
# Step 2: push that month-end value forward by one calendar month.
df.with_columns(month_end=pl.col("datetime").dt.month_end()).with_columns(
    offset=pl.col("month_end").dt.offset_by("1mo")
)

datetime,month_end,offset
datetime[μs],datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-01-31 00:00:00,2020-02-29 00:00:00
2020-02-01 00:00:00,2020-02-29 00:00:00,2020-03-29 00:00:00
2020-03-01 00:00:00,2020-03-31 00:00:00,2020-04-30 00:00:00
2020-04-01 00:00:00,2020-04-30 00:00:00,2020-05-30 00:00:00


Offsetting a `month_end` value by one month with `offset_by` does not always land on the last day of the next month: above, `2020-02-29` becomes `2020-03-29` rather than `2020-03-31`.

Hence, we have to call `month_end` again.

In [9]:
# As before, but re-apply `month_end` after the one-month offset.
df.with_columns(month_end=pl.col("datetime").dt.month_end()).with_columns(
    offset=pl.col("month_end").dt.offset_by("1mo").dt.month_end()
)

datetime,month_end,offset
datetime[μs],datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-01-31 00:00:00,2020-02-29 00:00:00
2020-02-01 00:00:00,2020-02-29 00:00:00,2020-03-31 00:00:00
2020-03-01 00:00:00,2020-03-31 00:00:00,2020-04-30 00:00:00
2020-04-01 00:00:00,2020-04-30 00:00:00,2020-05-31 00:00:00


## Binning datetime

In [11]:
# Timestamps every 20 minutes between 00:00 and 01:30 on 1 January 2020.
start = datetime(2020, 1, 1)
stop = datetime(2020, 1, 1, 1, 30)

twenty_minute_steps = pl.datetime_range(start, stop, interval="20m", eager=True)

# Bin each timestamp into its 30-minute window.
pl.DataFrame({"datetime": twenty_minute_steps}).with_columns(
    truncate=pl.col("datetime").dt.truncate(every="30m")
)

datetime,truncate
datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-01-01 00:00:00
2020-01-01 00:20:00,2020-01-01 00:00:00
2020-01-01 00:40:00,2020-01-01 00:30:00
2020-01-01 01:00:00,2020-01-01 01:00:00
2020-01-01 01:20:00,2020-01-01 01:00:00


## Rounding datetime

`dt.round` rounds datetimes to the nearest boundary: down to the window start or up to the window end.

In [12]:
# Uses the `start` and `stop` values currently defined in the notebook.
twenty_minute_steps = pl.datetime_range(start, stop, interval="20m", eager=True)

# Show `truncate` and `round` side by side for the same 30-minute window size.
pl.DataFrame({"datetime": twenty_minute_steps}).with_columns(
    truncate=pl.col("datetime").dt.truncate(every="30m"),
    round=pl.col("datetime").dt.round(every="30m"),
)

datetime,truncate,round
datetime[μs],datetime[μs],datetime[μs]
2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00
2020-01-01 00:20:00,2020-01-01 00:00:00,2020-01-01 00:30:00
2020-01-01 00:40:00,2020-01-01 00:30:00,2020-01-01 00:30:00
2020-01-01 01:00:00,2020-01-01 01:00:00,2020-01-01 01:00:00
2020-01-01 01:20:00,2020-01-01 01:00:00,2020-01-01 01:30:00


## Exercises

### Exercise 1
Use `truncate` to map the values in the `pickup` column to the start of weekly intervals.

Apply an `offset` to ensure that the first mapped datetime is `2021-12-31 00:00:00`

In [18]:
csv_file = "data/nyc_trip_data_1k.csv"

trips = pl.read_csv(csv_file, try_parse_dates=True)

# Replace `pickup` with its weekly truncation, then shift that value by four days.
trips.with_columns(
    pickup=pl.col("pickup").dt.truncate("1w").dt.offset_by("4d")
).head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2021-12-31 00:00:00,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2021-12-31 00:00:00,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2021-12-31 00:00:00,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2021-12-31 00:00:00,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2021-12-31 00:00:00,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


Map the values in the `pickup` column into weekly windows based on the closest window boundary using `round`

In [21]:
csv_file = "data/nyc_trip_data_1k.csv"

trips = pl.read_csv(csv_file, try_parse_dates=True)

# Replace `pickup` with the nearest weekly window boundary.
trips.with_columns(pickup=pl.col("pickup").dt.round("1w")).head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-03 00:00:00,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-03 00:00:00,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-03 00:00:00,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2022-01-03 00:00:00,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2022-01-03 00:00:00,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


### Exercise 2
Add 12 hours to each date so the datetimes are midday **on the last day of the month** instead of midnight

In [23]:
start = datetime(2020, 1, 1)
stop = datetime(2021, 1, 1)

month_starts = pl.datetime_range(start, stop, interval="1mo", eager=True)

# Move each date to its month end, then add 12 hours.
pl.DataFrame({"date": month_starts}).with_columns(
    date=pl.col("date").dt.month_end().dt.offset_by("12h")
)

date
datetime[μs]
2020-01-31 12:00:00
2020-02-29 12:00:00
2020-03-31 12:00:00
2020-04-30 12:00:00
2020-05-31 12:00:00
…
2020-09-30 12:00:00
2020-10-31 12:00:00
2020-11-30 12:00:00
2020-12-31 12:00:00
