In [1]:
import polars as pl

In [2]:
measurements = pl.read_parquet("data/measurements.parquet")
measurements

name,timestamp,blood_pressure,heart_rate,temperature,blood_glucose
str,datetime[μs],i16,i16,f32,i16
"""Arctic Archie""",2020-04-20 00:09:05.915752,126,66,35.860001,155
"""Chilly Willy""",2020-04-20 00:03:09.240004,72,62,35.959999,115
"""Blizzard Bob""",2020-04-20 00:00:21.265568,126,56,36.07,192
"""Arctic Archie""",2020-04-20 01:03:58.174774,83,74,35.939999,154
"""Chilly Willy""",2020-04-20 01:05:57.794002,71,72,35.66,112
…,…,…,…,…,…
"""Peter Panda""",2024-04-18 23:02:54.446361,137,71,38.52,204
"""Arctic Archie""",2024-04-18 23:03:41.806258,108,64,36.290001,147
"""Chilly Willy""",2024-04-18 23:01:00.699120,93,66,36.360001,116
"""Cubby Coldpaws""",2024-04-18 23:01:57.459280,86,132,35.75,152


In [3]:
batch_measurements = pl.read_parquet("data/batch_measurements.parquet")
batch_measurements

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""ARCTIC ARCHIE""",5,25,602.579834,16006,2020-04-20 06:15:00,"""INJURED""","""SENIOR"""
"""Chilly Willy""",1,5,500.916779,9386,2020-04-20 06:15:00,"""HEALTHY""","""ADULT"""
"""Blizzard Bob""",4,35,858.577148,12950,2020-04-20 06:15:00,"""HEALTHY""","""SENIOR"""
"""ARCTIC ARCHIE""",5,25,606.220032,3162,2020-04-22 06:15:00,"""SICK""","""SENIOR"""
"""Chilly Willy""",4,5,507.499573,2709,2020-04-22 06:15:00,"""INJURED""","""ADULT"""
…,…,…,…,…,…,…,…
"""Peter Panda""",3,2,308.347321,3269,2024-04-17 06:15:00,"""SICK""","""JUV"""
"""ARCTIC ARCHIE""",5,29,605.241638,7530,2024-04-17 06:15:00,"""INJURED""","""SENIOR"""
"""Chilly Willy""",3,9,501.613251,11964,2024-04-17 06:15:00,"""SICK""","""ADULT"""
"""Cubby Coldpaws""",4,0,2.410815,8776,2024-04-17 06:15:00,"""INJURED""","""CUB"""


## Basic Transforms: Filter, project, Union

- Was there in 2022 an injured polar bear older than 15 (i.e. a senior polar bear)?

In [4]:
batch_measurements.filter(pl.col("timestamp").dt.year() == 2022, pl.col("age") > 15, pl.col("vet_health_check") == "INJURED")

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""BLIZZARD BOB""",5,37,849.126404,1706,2022-01-04 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",2,37,858.342957,13218,2022-01-08 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",3,37,867.172668,17313,2022-01-12 06:15:00,"""INJURED""","""SENIOR"""
"""ARCTIC ARCHIE""",5,27,602.737976,15833,2022-01-14 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",5,37,851.439575,10281,2022-01-14 06:15:00,"""INJURED""","""SENIOR"""
…,…,…,…,…,…,…,…
"""BLIZZARD BOB""",5,38,850.709473,13550,2022-12-24 06:15:00,"""INJURED""","""SENIOR"""
"""Arctic Archie""",3,28,615.534912,10807,2022-12-26 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",1,38,844.144226,14087,2022-12-26 06:15:00,"""INJURED""","""SENIOR"""
"""Arctic Archie""",2,28,586.479248,12090,2022-12-28 06:15:00,"""INJURED""","""SENIOR"""


- How many times was Blizzard Bob's name capitalized in the batch measurements?

In [5]:
batch_measurements.filter(name="BLIZZARD BOB").select(pl.len()).item(), batch_measurements.filter(name="Blizzard Bob").select(pl.len()).item()

(127, 603)

 - Was Cubby Coldpaw ever sick with a temperature above 40 degrees? (tip: union + downfill)

In [6]:
filter = pl.col("name").str.contains("(?i)Cubby Coldpaws")
pl.concat(
    [measurements.filter(filter), batch_measurements.filter(filter)],
    how="diagonal_relaxed",
).sort(by="timestamp").fill_null(strategy="forward").filter(
    pl.col("vet_health_check") == "SICK", pl.col("temperature") > 39
)

name,timestamp,blood_pressure,heart_rate,temperature,blood_glucose,vet,age,weight,daily_steps,vet_health_check,life_stage
str,datetime[μs],i16,i16,f32,i16,i8,i8,f32,i16,str,str
"""Cubby Coldpaws""",2024-03-28 12:08:44.258966,104,130,39.02,173,3,0,1.834584,14419,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-03-28 13:08:03.580084,122,136,39.209999,166,3,0,1.834584,14419,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-03-29 11:09:18.976513,116,145,39.040001,164,3,0,1.834584,14419,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-07 12:00:56.409269,128,132,39.27,169,3,0,2.103115,3277,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-07 14:01:18.140735,73,134,39.099998,177,3,0,2.103115,3277,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-08 12:07:49.954839,86,142,39.549999,175,3,0,2.103115,3277,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-10 10:07:59.513127,136,134,39.18,169,3,0,2.161351,9795,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-10 11:09:08.529234,119,138,39.279999,168,3,0,2.161351,9795,"""SICK""","""CUB"""


# Windowing and Aggregations

- For every year, figure out which polar bear was the heaviest

In [7]:
batch_measurements.select("name", "timestamp", "weight").filter(
    pl.col("weight") == pl.col("weight").max().over(pl.col("timestamp").dt.year())
)

name,timestamp,weight
str,datetime[μs],f32
"""Blizzard Bob""",2020-09-19 06:15:00,868.621704
"""Blizzard Bob""",2021-06-10 06:15:00,877.176575
"""Blizzard Bob""",2022-10-25 06:15:00,875.102356
"""Blizzard Bob""",2023-12-01 06:15:00,876.387939
"""Blizzard Bob""",2024-01-28 06:15:00,877.227234


- When was the first and last measurement of each bear taken?

In [8]:
measurements.group_by("name").agg(
    pl.col("timestamp").first().alias("first measurement"),
    pl.col("timestamp").last().alias("last measurement"),
)

name,first measurement,last measurement
str,datetime[μs],datetime[μs]
"""Arctic Archie""",2020-04-20 00:09:05.915752,2024-04-18 23:03:41.806258
"""Cubby Coldpaws""",2024-03-24 00:09:33.778516,2024-04-18 23:01:57.459280
"""Icy Ingrid""",2020-06-12 00:08:54.249726,2024-04-18 23:07:34.383085
"""Chilly Willy""",2020-04-20 00:03:09.240004,2024-04-18 23:01:00.699120
"""Peter Panda""",2022-03-10 00:00:10.943873,2024-04-18 23:02:54.446361
"""Blizzard Bob""",2020-04-20 00:00:21.265568,2024-04-18 23:06:15.533603


- For each lifestage group of polar bears and for each year, which polar bear was the most active (most amount of steps per day)?

In [9]:
batch_measurements.filter(
    pl.col("daily_steps")
    == pl.col("daily_steps").max().over(pl.col("timestamp").dt.year(), "life_stage")
)

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""Icy Ingrid""",1,0,1.611996,19806,2020-08-14 06:15:00,"""SICK""","""CUB"""
"""CHILLY WILLY""",5,5,511.081604,19757,2020-08-22 06:15:00,"""SICK""","""ADULT"""
"""Arctic Archie""",4,26,611.615601,19997,2020-10-07 06:15:00,"""SICK""","""SENIOR"""
"""Blizzard Bob""",1,36,862.02594,19990,2021-03-22 06:15:00,"""SICK""","""SENIOR"""
"""Icy Ingrid""",1,0,29.695402,19828,2021-04-03 06:15:00,"""HEALTHY""","""CUB"""
…,…,…,…,…,…,…,…
"""Peter Panda""",4,1,266.913116,19874,2023-10-04 06:15:00,"""SICK""","""JUV"""
"""Blizzard Bob""",3,39,861.203125,19731,2024-02-01 06:15:00,"""SICK""","""SENIOR"""
"""Icy Ingrid""",2,3,288.60144,19932,2024-03-26 06:15:00,"""HEALTHY""","""JUV"""
"""Chilly Willy""",2,9,499.945343,19967,2024-03-26 06:15:00,"""HEALTHY""","""ADULT"""


- Find out which bears were more/less anxious (higher/lower blood pressure) than average after New Year's Eve (fireworks)?

In [10]:
measurements.with_columns(
    (pl.col("blood_pressure") < pl.col("blood_pressure").mean()).alias(
        "lower than average"
    ),
    (pl.col("blood_pressure") > pl.col("blood_pressure").mean()).alias(
        "higher than average"
    ),
).filter(
    pl.col("timestamp").dt.month() == 1, pl.col("timestamp").dt.day() == 1
).group_by("name").agg(pl.col("lower than average", "higher than average").sum())

name,lower than average,higher than average
str,u32,u32
"""Chilly Willy""",53,43
"""Arctic Archie""",42,54
"""Icy Ingrid""",50,46
"""Blizzard Bob""",46,50
"""Peter Panda""",19,29


- Which polar bear has the highest risk of becoming a diabetic? (polar bears have a higher risk of becoming diabetic after going through a high blood sugar level episode. An episode is defined as a three-day or longer period of an average daily bgl of 200)

Approximate solution with group by:

In [11]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).with_columns(
    pl.col("blood_glucose") > 200
).rolling(index_column="timestamp", period="3d", group_by="name").agg(
    pl.col("blood_glucose").sum()
).select("name", "blood_glucose").group_by("name").sum().sort("blood_glucose", "name", descending=True)

name,blood_glucose
str,u32
"""Peter Panda""",978
"""Icy Ingrid""",0
"""Cubby Coldpaws""",0
"""Chilly Willy""",0
"""Blizzard Bob""",0
"""Arctic Archie""",0


Correct solution, with `rle`

In [12]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).with_columns(
    pl.col("blood_glucose") > 200
).group_by("name").agg(pl.col("blood_glucose").rle()).explode("blood_glucose").unnest(
    "blood_glucose"
).filter(pl.col("lengths") > 2, "values").group_by("name").sum().sort(
    "values", "name", descending=True
).rename({"lengths": "total days in episodes", "values": "number of episodes"})

name,total days in episodes,number of episodes
str,i32,u32
"""Peter Panda""",122,33


In [13]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).filter(
    pl.col("name").is_in(["Blizzard Bob", "Peter Panda"]), pl.col("blood_glucose") > 190
).plot.scatter(x="timestamp", y="blood_glucose", by="name")

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)
