## Extracting datetime components

In [1]:
from datetime import datetime

import polars as pl

In [2]:
# Relative path to the NYC trip-data sample CSV read by the cells below.
csv_file = "data/nyc_trip_data_1k.csv"

In [3]:
# Load the whole CSV eagerly. try_parse_dates=True asks Polars to parse
# date-like text columns, so `pickup` and `dropoff` arrive as Datetime.
df = pl.read_csv(
    csv_file,
    try_parse_dates=True,
)
df.head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2022-01-01 00:55:13,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


## Extracting date and time

Extract date via casting `datetime` to `date`

In [4]:
# Casting Datetime -> Date keeps only the calendar date. The expression keeps
# the name "pickup", so with_columns overwrites that column in the result.
(
    df
    .with_columns(pl.col("pickup").cast(pl.Date))
    .head(3)
)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,date,datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01,2022-01-01 01:00:59,4.0,8.44,25.5,0.0


In [5]:
# Extract the date with the dedicated `dt.date()` expression instead of a cast.
(
    df
    .select("pickup")
    .with_columns(pl.col("pickup").dt.date())
    .head(3)
)

pickup
date
2022-01-01
2022-01-01
2022-01-01


Extract time via casting `datetime` to `time`

In [6]:
# Two ways to get the time of day from a Datetime column, shown side by side:
# a cast to pl.Time and the `dt.time()` expression.
(
    df
    .select("pickup")
    .with_columns(
        cast_time=pl.col("pickup").cast(pl.Time),
        dt_time=pl.col("pickup").dt.time(),
    )
    .head(3)
)

pickup,cast_time,dt_time
datetime[μs],time,time
2022-01-01 00:04:14,00:04:14,00:04:14
2022-01-01 00:32:17,00:32:17,00:32:17
2022-01-01 00:40:58,00:40:58,00:40:58


Note that the `dt.date` and `dt.time` methods give a different result from `cast(pl.Date)` and `cast(pl.Time)` when a timezone is specified!

In [8]:
# Attach a time zone to the naive pickup timestamps, then compare three ways
# of extracting the time of day from the naive and the zoned column.
# NOTE: the aliases end in "_date", but every derived column holds a Time.
(
    df
    .select("pickup")
    .with_columns(
        local_datetime=pl.col("pickup").dt.replace_time_zone("America/New_York"),
    )
    .with_columns(
        pickup_date=pl.col("pickup").dt.time(),
        local_datetime_date=pl.col("local_datetime").dt.time(),
        cast_local_datetime_date=pl.col("local_datetime").cast(pl.Time()),
    )
    .head(1)
)

pickup,local_datetime,pickup_date,local_datetime_date,cast_local_datetime_date
datetime[μs],"datetime[μs, America/New_York]",time,time,time
2022-01-01 00:04:14,2022-01-01 00:04:14 EST,00:04:14,00:04:14,05:04:14


`cast(pl.Time)` takes the time from the underlying UTC timestamp, whereas `dt.time` returns the local (wall-clock) time in the column's time zone

## Extracting datetime features

Use expressions in the `dt` namespace to extract date features

In [9]:
# Pull the calendar and clock components out of `pickup` with the `dt` namespace.
# The sample is seeded so that Restart & Run All shows the same five rows each
# time; without a seed the displayed rows change on every execution.
(
    df
    .select(
        pl.col("pickup"),
        pl.col("pickup").dt.quarter().alias("quarter"),
        pl.col("pickup").dt.month().alias("month"),
        pl.col("pickup").dt.day().alias("day"),
        pl.col("pickup").dt.hour().alias("hour"),
        pl.col("pickup").dt.minute().alias("minute"),
        pl.col("pickup").dt.second().alias("second"),
        pl.col("pickup").dt.millisecond().alias("millisecond"),
        pl.col("pickup").dt.microsecond().alias("microsecond"),
        pl.col("pickup").dt.nanosecond().alias("nanosecond"),
    )
    .sample(5, seed=0)
    .sort("pickup")
)

pickup,quarter,month,day,hour,minute,second,millisecond,microsecond,nanosecond
datetime[μs],i8,i8,i8,i8,i8,i8,i32,i32,i32
2022-01-03 06:43:51,1,1,3,6,43,51,0,0,0
2022-01-04 14:27:23,1,1,4,14,27,23,0,0,0
2022-01-06 11:44:23,1,1,6,11,44,23,0,0,0
2022-01-08 18:35:17,1,1,8,18,35,17,0,0,0
2022-01-11 07:17:09,1,1,11,7,17,9,0,0,0


There are two year features: `year` and `iso_year`.

- The `year` is the literal year from the `calendar year`
- The `iso_year` is the ISO 8601 week-numbering year: it always consists of whole weeks (52 or 53) and starts on the Monday of the week that contains the first Thursday of January

For datetimes in the first or last few days of a calendar year these two values may differ

In [11]:
# Put the calendar year next to the ISO year for the earliest pickups.
# In the data shown, 1 January 2022 has year 2022 but iso_year 2021.
(
    df
    .select(
        pl.col("pickup"),
        year=pl.col("pickup").dt.year(),
        iso_year=pl.col("pickup").dt.iso_year(),
    )
    .sort("pickup")
    .head(3)
)

pickup,year,iso_year
datetime[μs],i32,i32
2022-01-01 00:04:14,2022,2021
2022-01-01 00:32:17,2022,2021
2022-01-01 00:40:58,2022,2021


## Ordinal week and day numbers

- `.dt.week` gives the <a href="https://en.wikipedia.org/wiki/ISO_week_date" target="_blank">ISO week of the year</a>
- `.dt.weekday` gives the ISO day of week where Monday = 1 and Sunday = 7
- `.dt.day` gives the day of month from 1 to 31
- `.dt.ordinal_day` gives the day of year from 1 to 365 or 366

In [12]:
# Show the week number, weekday, day of month and day of year for the first
# two rows of the frame, ordered by pickup time.
(
    df
    .select(
        pl.col("pickup"),
        week=pl.col("pickup").dt.week(),
        weekday=pl.col("pickup").dt.weekday(),
        day_of_month=pl.col("pickup").dt.day(),
        ordinal_day=pl.col("pickup").dt.ordinal_day(),
    )
    .head(2)
    .sort("pickup")
)

pickup,week,weekday,day_of_month,ordinal_day
datetime[μs],i8,i8,i8,i16
2022-01-01 00:04:14,52,6,1,1
2022-01-01 00:32:17,52,6,1,1


## Extracting datetime components in lazy mode

In [13]:
# Build the same projection lazily with scan_csv and print the query plan
# instead of executing it; the plan reports how many CSV columns are read.
print(
    pl.scan_csv(csv_file, try_parse_dates=True)
    .select(
        [
            pl.col("pickup"),
            pl.col("pickup").dt.week().alias("week"),
            pl.col("pickup").dt.weekday().alias("weekday"),
            pl.col("pickup").dt.day().alias("day_of_month"),
            pl.col("pickup").dt.ordinal_day().alias("ordinal_day"),
        ]
    )
    .explain()
)

SELECT [col("pickup"), col("pickup").dt.week().alias("week"), col("pickup").dt.weekday().alias("weekday"), col("pickup").dt.day().alias("day_of_month"), col("pickup").dt.ordinal_day().alias("ordinal_day")]
  Csv SCAN [data/nyc_trip_data_1k.csv]
  PROJECT 1/7 COLUMNS
  ESTIMATED ROWS: 984


## Exercises

### Exercise 1
Count the number of records for each date (by pickup)

In [16]:
# Group on the pickup date computed inside group_by; the key keeps the name
# "pickup", which is why the result can be sorted by that column.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .group_by(pl.col("pickup").dt.date())
    .len()
    .sort("pickup")
)

pickup,len
date,u32
2022-01-01,68
2022-01-02,63
2022-01-03,86
2022-01-04,80
2022-01-05,77
…,…
2022-01-11,73
2022-01-12,73
2022-01-13,69
2022-01-14,58


### Exercise 2

Add a `day_of_year` column to get the number of records per ordinal day of the year

In [17]:
# Append the ordinal day of the year of each pickup as `day_of_year`.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .with_columns(day_of_year=pl.col("pickup").dt.ordinal_day())
)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount,day_of_year
str,datetime[μs],datetime[μs],f64,f64,f64,f64,i16
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0,1
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66,1
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0,1
"""id0""",2022-01-01 00:55:13,2022-01-01 01:25:49,1.0,12.61,37.5,12.39,1
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0,1
…,…,…,…,…,…,…,…
"""id0""",2022-01-14 18:20:51,2022-01-14 18:34:09,1.0,1.07,9.0,0.0,14
"""id4""",2022-01-14 18:32:26,2022-01-14 18:50:36,2.0,5.57,18.0,5.58,14
"""id2""",2022-01-14 18:34:11,2022-01-14 18:39:18,3.0,0.92,5.5,2.45,14
"""id0""",2022-01-14 18:49:08,2022-01-14 18:54:08,0.0,0.8,5.0,2.3,14


Continue by counting how many records there are for each day-of-year

In [21]:
# Take the `day_of_year` column as a Series, count how often each value
# occurs, and list the counts in day order.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .with_columns(day_of_year=pl.col("pickup").dt.ordinal_day())["day_of_year"]
    .value_counts()
    .sort("day_of_year")
)

day_of_year,count
i16,u32
1,68
2,63
3,86
4,80
5,77
…,…
11,73
12,73
13,69
14,58


Add columns with the day-of-week and hour of the day based on the pickup time

In [22]:
# Keep only `pickup` and derive its day of week and hour of day.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .select("pickup")
    .with_columns(
        day_of_week=pl.col("pickup").dt.weekday(),
        hour=pl.col("pickup").dt.hour(),
    )
    .head(3)
)

pickup,day_of_week,hour
datetime[μs],i8,i8
2022-01-01 00:04:14,6,0
2022-01-01 00:32:17,6,0
2022-01-01 00:40:58,6,0


Continue by counting the number of records for each (day-of-week,hour-of-the-day) pair.

Sort the output from largest number of records to smallest

In [27]:
# Count records per (day_of_week, hour) pair, busiest pairs first.
# Group order is not maintained, so pairs tied on `len` may be listed in a
# different relative order from one execution to the next.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .select("pickup")
    .with_columns(
        day_of_week=pl.col("pickup").dt.weekday(),
        hour=pl.col("pickup").dt.hour(),
    )
    .group_by("day_of_week", "hour")
    .len()
    .sort("len", descending=True)
)

day_of_week,hour,len
i8,i8,u32
1,15,18
4,15,17
5,13,15
2,8,15
3,8,15
…,…,…
3,3,1
1,22,1
5,0,1
3,1,1


Do the count of records by (day-of-week, hour-of-the-day) again, but this time extract the day-of-week and hour-of-the-day **inside the `group_by`**

In [28]:
# Same count as before, but the grouping keys are computed directly inside
# group_by rather than being added as columns first.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .select("pickup")
    .group_by(
        pl.col("pickup").dt.weekday().alias("day_of_week"),
        pl.col("pickup").dt.hour().alias("hour"),
    )
    .len()
    .sort("len", descending=True)
)

day_of_week,hour,len
i8,i8,u32
1,15,18
4,15,17
2,8,15
3,8,15
5,13,15
…,…,…
4,5,1
4,2,1
2,0,1
1,3,1


Do the same operation but this time in lazy mode

In [30]:
# Lazy version of the grouped count: build the query with scan_csv and print
# its plan rather than collecting the result.
print(
    pl.scan_csv(csv_file, try_parse_dates=True)
    .select("pickup")
    .group_by(
        pl.col("pickup").dt.weekday().alias("day_of_week"),
        pl.col("pickup").dt.hour().alias("hour"),
    )
    .len()
    .sort("len", descending=True)
    .explain()
)

SORT BY [descending: [true]] [col("len")]
  AGGREGATE[maintain_order: false]
    [len()] BY [col("pickup").dt.weekday().alias("day_of_week"), col("pickup").dt.hour().alias("hour")]
    FROM
    Csv SCAN [data/nyc_trip_data_1k.csv]
    PROJECT 1/7 COLUMNS
    ESTIMATED ROWS: 984
