## Introduction to datetime dtypes  

In [1]:
from datetime import date,datetime

import polars as pl
import pandas as pd

In [3]:
# Limit displayed DataFrames to four rows so the outputs stay compact.
pl.Config.set_tbl_rows(4)

start = date(2026, 1, 1)
stop = date(2026, 1, 2)

# One column of hourly datetimes running from `start` to `stop`.
df = pl.DataFrame(
    {"date": pl.datetime_range(start, stop, interval="1h", eager=True)}
)
df

date
datetime[μs]
2026-01-01 00:00:00
2026-01-01 01:00:00
…
2026-01-01 23:00:00
2026-01-02 00:00:00


The dtype of this column is `datetime[μs]`. 

This means it has a `pl.Datetime` dtype where the underlying representation is microseconds since the start of the Unix epoch on 1st January 1970.

## Datetime dtypes

As well as `pl.Datetime`, Polars has `pl.Date`, `pl.Time` and `pl.Duration` dtypes.


| dtype | Example | Time unit |
|---|---|---|
| `pl.Datetime` | 2026-01-27 22:30:17 | `Microseconds` since UNIX epoch |
| `pl.Date` | 2022-03-18 | `Days` since UNIX epoch |
| `pl.Time` | 19:24:49 | `Nanoseconds` since midnight |
| `pl.Duration` | 1d 5h | `Microseconds` |


In [None]:
start = datetime(2020, 1, 1)
stop = datetime(2020, 1, 2)
interval = "6h"

# Start from a six-hourly datetime column, then derive the date and time
# parts by casting that column to `pl.Date` and `pl.Time`.
df_datetimes = pl.DataFrame(
    {"datetime": pl.datetime_range(start, stop, interval=interval, eager=True)}
).with_columns(
    date=pl.col("datetime").cast(pl.Date),
    time=pl.col("datetime").cast(pl.Time),
)
df_datetimes.head(5)

datetime,date,time
datetime[μs],date,time
2020-01-01 00:00:00,2020-01-01,00:00:00
2020-01-01 06:00:00,2020-01-01,06:00:00
…,…,…
2020-01-01 18:00:00,2020-01-01,18:00:00
2020-01-02 00:00:00,2020-01-02,00:00:00


We get a `pl.Duration` column by taking the difference between consecutive values of the `datetime` column.

In [6]:
# Differences between consecutive datetimes come back as a duration column;
# the first row has no predecessor, so its value is null.
df_datetimes = df_datetimes.with_columns(duration=pl.col("datetime").diff())

df_datetimes

datetime,date,time,duration
datetime[μs],date,time,duration[μs]
2020-01-01 00:00:00,2020-01-01,00:00:00,
2020-01-01 06:00:00,2020-01-01,06:00:00,6h
…,…,…,…
2020-01-01 18:00:00,2020-01-01,18:00:00,6h
2020-01-02 00:00:00,2020-01-02,00:00:00,6h


### Integer representations

In [7]:
# Expose the integer stored behind each temporal column; the suffix in each
# new column name records the unit of that integer.
df_datetimes_physical = df_datetimes.select(
    pl.col("datetime").to_physical().alias("datetime_us"),
    pl.col("date").to_physical().alias("date_days"),
    pl.col("duration").to_physical().alias("duration_us"),
    pl.col("time").to_physical().alias("time_ns"),
)
df_datetimes_physical

datetime_us,date_days,duration_us,time_ns
i64,i32,i64,i64
1577836800000000,18262,,0
1577858400000000,18262,21600000000,21600000000000
…,…,…,…
1577901600000000,18262,21600000000,64800000000000
1577923200000000,18263,21600000000,0


### Changing the underlying time unit & conversion from Pandas/Numpy

In Polars a `pl.Datetime` is represented as `microseconds` by default. 

However, in Pandas and Numpy the underlying representation is `nanoseconds`.

In [None]:
# The same six-hourly range built with pandas, so we can inspect its dtype.
df_datetimes_pandas = pd.DataFrame(
    {"datetime": pd.date_range(start=start, end=stop, freq="6h")}
)
df_datetimes_pandas.dtypes

datetime    datetime64[ns]
dtype: object

Convert the Pandas `DataFrame` to a Polars `DataFrame`: the nanosecond time unit is preserved.

In [9]:
# Convert the pandas frame to Polars and look at the first two rows.
pl.from_pandas(df_datetimes_pandas).head(2)

datetime
datetime[ns]
2020-01-01 00:00:00
2020-01-01 06:00:00


A nanosecond-precision datetime column will not join with a microsecond-precision column, so we have to cast it to microseconds first.

In [10]:
# Convert as before, then switch the column's time unit to microseconds.
(
    pl.from_pandas(df_datetimes_pandas)
    .with_columns(pl.col("datetime").dt.cast_time_unit("us"))
    .head(2)
)

datetime
datetime[μs]
2020-01-01 00:00:00
2020-01-01 06:00:00


### Timestamp

In [11]:
# Put the physical integers next to `dt.timestamp` at two time units.
df_datetimes.select(
    pl.col("datetime"),
    datetime_to_phys=pl.col("datetime").to_physical(),
    timestamp_us=pl.col("datetime").dt.timestamp(),
    timestamp_ns=pl.col("datetime").dt.timestamp(time_unit="ns"),
)

datetime,datetime_to_phys,timestamp_us,timestamp_ns
datetime[μs],i64,i64,i64
2020-01-01 00:00:00,1577836800000000,1577836800000000,1577836800000000000
2020-01-01 06:00:00,1577858400000000,1577858400000000,1577858400000000000
…,…,…,…
2020-01-01 18:00:00,1577901600000000,1577901600000000,1577901600000000000
2020-01-02 00:00:00,1577923200000000,1577923200000000,1577923200000000000


## Exercises
 
### Exercise 1
Create a `DataFrame` with a column called `datetime` that has datetimes from the start of 2020 to 30th June 2022 at 6-monthly intervals

In [None]:
start = datetime(2020, 1, 1)
stop = datetime(2022, 6, 30)

# Six-monthly datetimes beginning at `start` and not going past `stop`.
df = pl.DataFrame(
    {"datetime": pl.datetime_range(start, stop, interval="6mo", eager=True)}
)

df

datetime
datetime[μs]
2020-01-01 00:00:00
2020-07-01 00:00:00
…
2021-07-01 00:00:00
2022-01-01 00:00:00


Extend your query by copying your existing code in each subsequent part of this exercise.

Create this date range again but including the end date and excluding the start date

In [19]:
# closed="right" leaves out the start of the range and keeps the end.
df = (
    pl.datetime_range(start, stop, interval="6mo", closed="right", eager=True)
    .alias("datetime")
    .to_frame()
)

df

datetime
datetime[μs]
2020-07-01 00:00:00
2021-01-01 00:00:00
2021-07-01 00:00:00
2022-01-01 00:00:00


Add columns that encode the `datetime` column as a:
- date
- time

In [21]:
# Split the datetime into its calendar-date and time-of-day parts.
df = df.with_columns(
    date=pl.col("datetime").dt.date(),
    time=pl.col("datetime").dt.time(),
)
df

datetime,date,time
datetime[μs],date,time
2020-07-01 00:00:00,2020-07-01,00:00:00
2021-01-01 00:00:00,2021-01-01,00:00:00
2021-07-01 00:00:00,2021-07-01,00:00:00
2022-01-01 00:00:00,2022-01-01,00:00:00


Add three new columns that have the physical representation for the `datetime`, `date` and `time` columns. 

Each new column name should end with `_physical`.

Challenge: do this as a single expression inside an additional `with_columns`

In [24]:
# Select the three temporal columns by name rather than with `pl.all()`.
# Because this cell reassigns `df`, `pl.all()` would also match any
# `*_physical` columns already present, so re-running the cell would append
# `*_physical_physical` columns. Naming the inputs keeps the cell idempotent:
# a re-run simply overwrites the same three `*_physical` columns.
df = df.with_columns(
    pl.col("datetime", "date", "time").to_physical().name.suffix("_physical")
)

df

datetime,date,time,datetime_physical,date_physical,time_physical
datetime[μs],date,time,i64,i32,i64
2020-07-01 00:00:00,2020-07-01,00:00:00,1593561600000000,18444,0
2021-01-01 00:00:00,2021-01-01,00:00:00,1609459200000000,18628,0
2021-07-01 00:00:00,2021-07-01,00:00:00,1625097600000000,18809,0
2022-01-01 00:00:00,2022-01-01,00:00:00,1640995200000000,18993,0


Add a new column that calculates the differences between the `datetime` column entries

In [25]:
# Gap between each datetime and the one before it (null for the first row).
df.with_columns(duration=pl.col("datetime").diff())

datetime,date,time,datetime_physical,date_physical,time_physical,duration
datetime[μs],date,time,i64,i32,i64,duration[μs]
2020-07-01 00:00:00,2020-07-01,00:00:00,1593561600000000,18444,0,
2021-01-01 00:00:00,2021-01-01,00:00:00,1609459200000000,18628,0,184d
2021-07-01 00:00:00,2021-07-01,00:00:00,1625097600000000,18809,0,181d
2022-01-01 00:00:00,2022-01-01,00:00:00,1640995200000000,18993,0,184d
