# Filtering time series

In [4]:
from datetime import datetime,date,time

import polars as pl

In [5]:
# Relative path to the NYC taxi trip CSV that the cells below read.
csv_file = "data/nyc_trip_data_1k.csv"

In [6]:
# Eager load of the taxi trips. With try_parse_dates enabled the preview shows
# `pickup` and `dropoff` typed as datetime[μs] rather than strings.
df = pl.read_csv(source=csv_file, try_parse_dates=True)
df.head(n=2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


## Filtering by datetime
Compare a datetime column against Python's built-in `datetime.datetime` or `datetime.date` objects, or against a Polars literal created with `pl.lit`.

In [7]:
# Keep trips whose pickup is strictly earlier than 01:01:00 on 1 January 2022,
# comparing the column directly against a Python datetime.
pickup_cutoff = datetime(2022, 1, 1, hour=1, minute=1, second=0)
df.filter(pl.col("pickup") < pickup_cutoff)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2022-01-01 00:55:13,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


In [10]:
# A plain Python date also works as the right-hand side of a comparison with
# the datetime column; only trips picked up before 2 January 2022 remain.
first_excluded_day = date(2022, 1, 2)
df.filter(pl.col("pickup") < first_excluded_day).head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2022-01-01 00:55:13,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


In [12]:
# Build the cut-off inside Polars: wrap the string in a literal expression and
# parse it to a Date with an explicit format before comparing.
cutoff_date = pl.lit("2022-01-02").str.strptime(pl.Date, format="%Y-%m-%d")
df.filter(pl.col("pickup") < cutoff_date).tail()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id5""",2022-01-01 22:19:54,2022-01-01 22:25:53,1.0,1.18,6.5,0.0
"""id5""",2022-01-01 22:21:22,2022-01-01 22:29:58,1.0,1.68,8.0,0.0
"""id8""",2022-01-01 22:25:08,2022-01-01 22:40:38,1.0,3.49,13.5,0.0
"""id6""",2022-01-01 23:42:30,2022-01-02 00:10:53,2.0,6.7,24.5,2.0
"""id4""",2022-01-01 23:59:02,2022-01-02 00:24:05,3.0,8.28,26.5,9.21


## Filtering on a datetime range

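Use the `is_between` expression to keep rows that fall inside a range. Both bounds are included by default; pass the `closed` argument to change that.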

In [14]:
# Range filter on the pickup timestamp. No `closed` argument is passed, so
# Polars' default boundary handling applies.
range_start = datetime(2021, 12, 31)
range_end = datetime(2022, 1, 2)
df.filter(pl.col("pickup").is_between(range_start, range_end)).head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0
"""id0""",2022-01-01 00:55:13,2022-01-01 01:25:49,1.0,12.61,37.5,12.39
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0


## Filtering datetime in lazy mode

In [15]:
# Build the query lazily, then print its plan. In the printed plan the date
# literal appears as a datetime in the SELECTION attached to the CSV scan.
lazy_query = pl.scan_csv(csv_file, try_parse_dates=True).filter(
    pl.col("pickup") < date(2022, 1, 2)
)
print(lazy_query.explain())

Csv SCAN [data/nyc_trip_data_1k.csv]
PROJECT */7 COLUMNS
SELECTION: [(col("pickup")) < (2022-01-02 00:00:00)]
ESTIMATED ROWS: 984


## Filtering on a duration

In [16]:
# Subtracting two datetime columns gives a duration column (duration[μs] in the
# output); the keyword name becomes the new column's name.
(
    df.select("pickup", "dropoff")
    .with_columns(duration=pl.col("dropoff") - pl.col("pickup"))
    .head()
)

pickup,dropoff,duration
datetime[μs],datetime[μs],duration[μs]
2022-01-01 00:04:14,2022-01-01 00:26:12,21m 58s
2022-01-01 00:32:17,2022-01-01 00:49:23,17m 6s
2022-01-01 00:40:58,2022-01-01 01:00:59,20m 1s
2022-01-01 00:55:13,2022-01-01 01:25:49,30m 36s
2022-01-01 00:55:24,2022-01-01 01:00:45,5m 21s


To filter on a duration we use the function `pl.duration`. Note that this function (lower-case) is different from the dtype `pl.Duration`.

In [17]:
# Compute the trip duration, then keep trips shorter than ten minutes by
# comparing against a duration built with pl.duration.
trip_duration = (pl.col("dropoff") - pl.col("pickup")).alias("duration")
(
    df.select("pickup", "dropoff")
    .with_columns(trip_duration)
    .filter(pl.col("duration") < pl.duration(minutes=10))
    .head(3)
)

pickup,dropoff,duration
datetime[μs],datetime[μs],duration[μs]
2022-01-01 00:55:24,2022-01-01 01:00:45,5m 21s
2022-01-01 01:09:34,2022-01-01 01:13:46,4m 12s
2022-01-01 01:40:03,2022-01-01 01:48:18,8m 15s


Alternatively, use `dt.total_minutes` to compare against a plain number. This method is only supported for `Duration` data.

In [18]:
# Same filter expressed numerically: convert the duration to a minute count
# and compare it with an integer.
duration_minutes = pl.col("duration").dt.total_minutes()
(
    df.select("pickup", "dropoff")
    .with_columns(duration=pl.col("dropoff") - pl.col("pickup"))
    .filter(duration_minutes < 10)
    .head(3)
)

pickup,dropoff,duration
datetime[μs],datetime[μs],duration[μs]
2022-01-01 00:55:24,2022-01-01 01:00:45,5m 21s
2022-01-01 01:09:34,2022-01-01 01:13:46,4m 12s
2022-01-01 01:40:03,2022-01-01 01:48:18,8m 15s


## Exercises

### Exercise 1
Create a `DataFrame` with a daily interval that starts on 1st January 2020 and ends on 31st January 2020

In [19]:
# Daily calendar for January 2020, held in a single `date` column.
# NOTE(review): this rebinds `df`, so the taxi DataFrame loaded earlier is no
# longer reachable under that name; the next exercise cells filter this frame.
start, stop = date(2020, 1, 1), date(2020, 1, 31)

daily_dates = pl.date_range(start, stop, interval="1d", eager=True)
df = pl.DataFrame({"date": daily_dates})

df

date
date
2020-01-01
2020-01-02
2020-01-03
2020-01-04
2020-01-05
…
2020-01-27
2020-01-28
2020-01-29
2020-01-30


Find all dates on or after 15th January

In [20]:
# `>=` keeps the 15th itself as well as every later date.
mid_january = pl.date(2020, 1, 15)
df.filter(pl.col("date") >= mid_january)

date
date
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19
…
2020-01-27
2020-01-28
2020-01-29
2020-01-30


Find all dates between 15th and 20th January including the start date but excluding the end date. 

In [22]:
# closed="left" includes the start bound and excludes the end bound, so the
# result runs from the 15th through the 19th.
window_start = pl.date(2020, 1, 15)
window_end = pl.date(2020, 1, 20)
df.filter(pl.col("date").is_between(window_start, window_end, closed="left"))

date
date
2020-01-15
2020-01-16
2020-01-17
2020-01-18
2020-01-19


### Exercise 2
Read the NYC taxi dataset with automatic date parsing

In [23]:
# Re-read the taxi CSV with automatic date parsing and preview three rows.
pl.read_csv(source=csv_file, try_parse_dates=True).head(n=3)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0


Filter to get all the records with a pickup after 10 PM.

Expand the following collapsed cell if you want a hint.

In [26]:
# Keep trips whose pickup time-of-day is strictly later than 22:00:00.
# `dt.time()` extracts the time component of the datetime directly, which
# states the intent more clearly than casting the column to the Time dtype.
pl.read_csv(
    csv_file,
    try_parse_dates=True
).filter(
    pl.col("pickup").dt.time() > pl.time(22, 0, 0)
).head()

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id6""",2022-01-01 22:18:41,2022-01-01 22:39:40,1.0,6.86,22.5,7.89
"""id5""",2022-01-01 22:19:54,2022-01-01 22:25:53,1.0,1.18,6.5,0.0
"""id5""",2022-01-01 22:21:22,2022-01-01 22:29:58,1.0,1.68,8.0,0.0
"""id8""",2022-01-01 22:25:08,2022-01-01 22:40:38,1.0,3.49,13.5,0.0
"""id6""",2022-01-01 23:42:30,2022-01-02 00:10:53,2.0,6.7,24.5,2.0


Add a column called `pickup_delta` that holds the difference in pickup time between each row and the previous row.

In [27]:
# diff() subtracts the previous row's pickup from the current one; the first
# row has no predecessor, so its pickup_delta is empty in the output.
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .with_columns(pickup_delta=pl.col("pickup").diff())
    .head(3)
)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount,pickup_delta
str,datetime[μs],datetime[μs],f64,f64,f64,f64,duration[μs]
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0,
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66,28m 3s
"""id8""",2022-01-01 00:40:58,2022-01-01 01:00:59,4.0,8.44,25.5,0.0,8m 41s


Filter to find all records that started less than 3 minutes after the previous pickup

In [None]:
# Gap between each pickup and the one on the previous row.
gap_to_previous = pl.col("pickup").diff().alias("pickup_delta")
(
    pl.read_csv(csv_file, try_parse_dates=True)
    .with_columns(gap_to_previous)
    # diff() on a datetime column yields a duration, so compare with a duration.
    .filter(pl.col("pickup_delta") < pl.duration(minutes=3))
)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount,pickup_delta
str,datetime[μs],datetime[μs],f64,f64,f64,f64,duration[μs]
"""id1""",2022-01-01 00:55:24,2022-01-01 01:00:45,1.0,1.49,6.5,0.0,11s
"""id6""",2022-01-01 01:40:03,2022-01-01 01:48:18,1.0,1.32,7.5,2.26,1m 1s
"""id1""",2022-01-01 03:04:44,2022-01-01 03:20:43,1.0,5.01,16.0,3.96,1m 38s
"""id7""",2022-01-01 11:43:03,2022-01-01 11:53:28,3.0,1.74,8.5,2.95,2m 30s
"""id1""",2022-01-01 11:44:23,2022-01-01 11:52:10,1.0,1.49,7.5,2.7,1m 20s
…,…,…,…,…,…,…,…
"""id7""",2022-01-14 14:44:45,2022-01-14 15:04:08,1.0,2.9,13.5,3.36,2m 6s
"""id6""",2022-01-14 17:15:23,2022-01-14 17:19:34,2.0,1.1,5.5,2.45,44s
"""id1""",2022-01-14 17:16:43,2022-01-14 17:24:05,5.0,1.62,7.5,2.36,1m 20s
"""id0""",2022-01-14 18:20:51,2022-01-14 18:34:09,1.0,1.07,9.0,0.0,1m
