In [1]:
import polars as pl

In [2]:
measurements = pl.read_parquet("data/measurements.parquet")
measurements

name,timestamp,blood_pressure,heart_rate,temperature,blood_glucose,sensor
str,datetime[μs],i16,i16,f32,i16,i8
"""Arctic Archie""",2020-04-20 00:04:50.480239,91,71,36.049999,189,2
"""Chilly Willy""",2020-04-20 00:06:00.077239,121,39,35.380001,101,2
"""Blizzard Bob""",2020-04-20 00:04:32.316505,131,70,36.040001,189,4
"""Arctic Archie""",2020-04-20 01:08:47.843849,81,64,36.169998,146,3
"""Chilly Willy""",2020-04-20 01:00:31.335780,74,68,36.040001,119,4
…,…,…,…,…,…,…
"""Peter Panda""",2024-04-18 23:07:42.837390,64,72,38.560001,201,4
"""Arctic Archie""",2024-04-18 23:02:55.436679,105,70,36.139999,159,3
"""Chilly Willy""",2024-04-18 23:00:09.368244,113,57,38.209999,202,2
"""Cubby Coldpaws""",2024-04-18 23:04:33.095812,85,130,36.189999,167,3


In [3]:
batch_measurements = pl.read_parquet("data/batch_measurements.parquet")
batch_measurements

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""Arctic Archie""",1,25,582.744507,8318,2020-04-20 06:15:00,"""HEALTHY""","""SENIOR"""
"""Chilly Willy""",2,5,496.005005,13675,2020-04-20 06:15:00,"""HEALTHY""","""ADULT"""
"""Blizzard Bob""",5,35,852.612427,6854,2020-04-20 06:15:00,"""INJURED""","""SENIOR"""
"""Arctic Archie""",2,25,592.455444,7713,2020-04-22 06:15:00,"""INJURED""","""SENIOR"""
"""Chilly Willy""",2,5,502.672821,10289,2020-04-22 06:15:00,"""HEALTHY""","""ADULT"""
…,…,…,…,…,…,…,…
"""Peter Panda""",5,2,319.055389,5215,2024-04-17 06:15:00,"""SICK""","""JUV"""
"""Arctic Archie""",2,29,596.792786,3877,2024-04-17 06:15:00,"""INJURED""","""SENIOR"""
"""Chilly Willy""",4,9,498.009399,13822,2024-04-17 06:15:00,"""HEALTHY""","""ADULT"""
"""Cubby Coldpaws""",5,0,2.410815,17398,2024-04-17 06:15:00,"""HEALTHY""","""CUB"""


## Basic Transforms: Filter, project, Union

- Was there in 2022 an injured polar bear older than 15 (i.e. a senior polar bear)?

In [4]:
batch_measurements.filter(
    pl.col("timestamp").dt.year() == 2022,
    pl.col("age") > 15,
    pl.col("vet_health_check") == "INJURED",
)

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""Arctic Archie""",2,27,600.978638,2103,2022-01-02 06:15:00,"""INJURED""","""SENIOR"""
"""Arctic Archie""",5,27,602.421082,17645,2022-01-08 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",2,37,840.560364,6459,2022-01-12 06:15:00,"""INJURED""","""SENIOR"""
"""ARCTIC ARCHIE""",4,27,598.606201,17821,2022-01-18 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",2,37,865.314453,13139,2022-01-24 06:15:00,"""INJURED""","""SENIOR"""
…,…,…,…,…,…,…,…
"""Blizzard Bob""",1,38,846.052246,9828,2022-12-20 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",5,38,842.704041,16676,2022-12-26 06:15:00,"""INJURED""","""SENIOR"""
"""Arctic Archie""",1,28,608.15741,10494,2022-12-28 06:15:00,"""INJURED""","""SENIOR"""
"""Blizzard Bob""",2,38,844.760376,7440,2022-12-28 06:15:00,"""INJURED""","""SENIOR"""


- How many times was Blizzard Bob's name capitalized in the batch measurements?

In [5]:
batch_measurements.filter(name="BLIZZARD BOB").select(
    pl.len()
).item(), batch_measurements.filter(name="Blizzard Bob").select(pl.len()).item()

(106, 624)

 - Was Cubby Coldpaw ever sick with a temperature above 40 degrees? (tip: union + downfill)

In [6]:
filter = pl.col("name").str.contains("(?i)Cubby Coldpaws")
pl.concat(
    [measurements.filter(filter), batch_measurements.filter(filter)],
    how="diagonal_relaxed",
).sort(by="timestamp").fill_null(strategy="forward").filter(
    pl.col("vet_health_check") == "SICK", pl.col("temperature") > 39
)

name,timestamp,blood_pressure,heart_rate,temperature,blood_glucose,sensor,vet,age,weight,daily_steps,vet_health_check,life_stage
str,datetime[μs],i16,i16,f32,i16,i8,i8,i8,f32,i16,str,str
"""Cubby Coldpaws""",2024-04-03 08:04:54.349079,128,130,39.509998,223,2,3,0,1.991295,15985,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-03 09:05:56.451039,85,136,39.060001,158,2,3,0,1.991295,15985,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-03 12:04:41.117695,89,134,39.009998,161,3,3,0,1.991295,15985,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-03 14:05:34.633770,60,134,39.23,194,2,3,0,1.991295,15985,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-05 12:03:01.018255,88,138,39.16,158,3,5,0,2.046444,13040,"""SICK""","""CUB"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cubby Coldpaws""",2024-04-09 12:05:23.896249,95,108,39.27,148,2,2,0,2.161351,13975,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-10 12:02:02.962703,101,120,39.150002,136,2,2,0,2.161351,13975,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-10 16:00:03.955140,93,122,39.98,180,2,2,0,2.161351,13975,"""SICK""","""CUB"""
"""Cubby Coldpaws""",2024-04-14 11:05:08.643333,133,129,39.669998,162,1,3,0,2.282689,18855,"""SICK""","""CUB"""


# Windowing and Aggregations

- For every year, figure out which polar bear was the heaviest

In [7]:
batch_measurements.select("name", "timestamp", "weight").filter(
    pl.col("weight") == pl.col("weight").max().over(pl.col("timestamp").dt.year())
)

name,timestamp,weight
str,datetime[μs],f32
"""Blizzard Bob""",2020-08-12 06:15:00,875.465149
"""Blizzard Bob""",2021-09-02 06:15:00,878.228271
"""Blizzard Bob""",2022-08-22 06:15:00,874.462158
"""Blizzard Bob""",2023-10-04 06:15:00,878.402161
"""Blizzard Bob""",2024-02-11 06:15:00,877.566895


- When was the first and last measurement of each bear taken?

In [8]:
measurements.group_by("name").agg(
    pl.col("timestamp").first().alias("first measurement"),
    pl.col("timestamp").last().alias("last measurement"),
)

name,first measurement,last measurement
str,datetime[μs],datetime[μs]
"""Peter Panda""",2022-03-10 00:06:22.204451,2024-04-18 23:07:42.837390
"""Icy Ingrid""",2020-06-12 00:06:39.577116,2024-04-18 23:00:15.044912
"""Blizzard Bob""",2020-04-20 00:04:32.316505,2024-04-18 23:07:14.255318
"""Chilly Willy""",2020-04-20 00:06:00.077239,2024-04-18 23:00:09.368244
"""Arctic Archie""",2020-04-20 00:04:50.480239,2024-04-18 23:02:55.436679
"""Cubby Coldpaws""",2024-03-24 00:02:26.626629,2024-04-18 23:04:33.095812


- For each lifestage group of polar bears and for each year, which polar bear was the most active (most amount of steps per day)?

In [9]:
batch_measurements.filter(
    pl.col("daily_steps")
    == pl.col("daily_steps").max().over(pl.col("timestamp").dt.year(), "life_stage")
)

name,vet,age,weight,daily_steps,timestamp,vet_health_check,life_stage
str,i8,i8,f32,i16,datetime[μs],str,str
"""Chilly Willy""",3,5,516.934692,19939,2020-07-17 06:15:00,"""INJURED""","""ADULT"""
"""Blizzard Bob""",2,35,847.644836,19899,2020-07-25 06:15:00,"""INJURED""","""SENIOR"""
"""Icy Ingrid""",1,0,4.280061,19191,2020-10-25 06:15:00,"""SICK""","""CUB"""
"""Arctic Archie""",5,26,603.595886,19910,2021-02-04 06:15:00,"""INJURED""","""SENIOR"""
"""Icy Ingrid""",4,0,31.844097,19982,2021-04-13 06:15:00,"""HEALTHY""","""CUB"""
…,…,…,…,…,…,…,…
"""Chilly Willy""",4,8,496.30188,19991,2023-06-30 06:15:00,"""HEALTHY""","""ADULT"""
"""Icy Ingrid""",1,3,271.402832,19667,2024-01-26 06:15:00,"""SICK""","""JUV"""
"""Blizzard Bob""",2,39,876.487671,19834,2024-03-18 06:15:00,"""HEALTHY""","""SENIOR"""
"""Chilly Willy""",3,9,489.563416,19314,2024-03-24 06:15:00,"""INJURED""","""ADULT"""


- Find out which bears were more/less anxious (higher/lower blood pressure) than average after New Year's Eve (fireworks)?

In [10]:
measurements.with_columns(
    (pl.col("blood_pressure") < pl.col("blood_pressure").mean()).alias(
        "lower than average"
    ),
    (pl.col("blood_pressure") > pl.col("blood_pressure").mean()).alias(
        "higher than average"
    ),
).filter(
    pl.col("timestamp").dt.month() == 1, pl.col("timestamp").dt.day() == 1
).group_by(
    "name"
).agg(
    pl.col("lower than average", "higher than average").sum()
)

name,lower than average,higher than average
str,u32,u32
"""Blizzard Bob""",50,46
"""Icy Ingrid""",47,49
"""Peter Panda""",26,22
"""Chilly Willy""",46,50
"""Arctic Archie""",58,38


- Which polar bear has the highest risk of becoming a diabetic? (polar bears have a higher risk of becoming diabetic after going through a high blood sugar level episode. An episode is defined as a three-day or longer period of an average daily bgl of 200)

Approximate solution with group by:

In [11]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).with_columns(
    pl.col("blood_glucose") > 200
).rolling(
    index_column="timestamp", period="3d", group_by="name"
).agg(
    pl.col("blood_glucose").sum()
).select(
    "name", "blood_glucose"
).group_by(
    "name"
).sum().sort(
    "blood_glucose", "name", descending=True
)

name,blood_glucose
str,u32
"""Peter Panda""",992
"""Blizzard Bob""",36
"""Icy Ingrid""",0
"""Cubby Coldpaws""",0
"""Chilly Willy""",0
"""Arctic Archie""",0


Correct solution, with `rle`

In [12]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).with_columns(
    pl.col("blood_glucose") > 200
).group_by(
    "name"
).agg(
    pl.col("blood_glucose").rle()
).explode(
    "blood_glucose"
).unnest(
    "blood_glucose"
).filter(
    pl.col("lengths") > 2, "values"
).group_by(
    "name"
).sum().sort(
    "values", "name", descending=True
).rename(
    {"lengths": "total days in episodes", "values": "number of episodes"}
)

name,total days in episodes,number of episodes
str,i32,u32
"""Peter Panda""",129,38


In [13]:
measurements.select("name", "timestamp", "blood_glucose").group_by_dynamic(
    index_column="timestamp", every="1d", period="1d", group_by="name"
).agg(pl.col("blood_glucose").mean()).filter(
    pl.col("name").is_in(["Blizzard Bob", "Peter Panda"]), pl.col("blood_glucose") > 190
).plot.scatter(
    x="timestamp", y="blood_glucose", by="name"
)

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


# Joins

How many times has vet practice Digital Wildlife Care diagnosed a polar bear as sick?

In [14]:
dim_vet = pl.read_parquet("data/dim_vet.parquet").select(pl.all().shrink_dtype())
dim_vet

vet,name,practice
i8,str,str
1,"""Zooey Codewell""","""TechSavvy Vets"""
2,"""Script Safari""","""Digital Wildlife Care"""
3,"""Debug Dino""","""Digital Wildlife Care"""
4,"""Pixel Paws""","""CodePaws Veterinary Clinic"""
5,"""Byte Lyon""","""TechSavvy Vets"""


In [15]:
batch_measurements.join(
    other=dim_vet.filter(practice="Digital Wildlife Care"), on="vet", how="inner"
).filter(vet_health_check="SICK").select(pl.len())

len
u32
546


In [16]:
dim_vet.join(
    other=batch_measurements.filter(
        pl.col("weight") > pl.col("weight").quantile(0.999)
    ),
    on="vet",
    how="anti",
)

vet,name,practice
i8,str,str
2,"""Script Safari""","""Digital Wildlife Care"""
3,"""Debug Dino""","""Digital Wildlife Care"""
5,"""Byte Lyon""","""TechSavvy Vets"""


Which vet was the least consistent in name capitalization?

In [17]:
batch_measurements.select("name", "vet").with_columns(
    (pl.col("name") == pl.col("name").str.to_uppercase()).alias("is_misspelt")
).group_by("vet").agg(pl.col("is_misspelt").sum() / pl.len()).join(
    other=dim_vet, on="vet", how="left"
).drop("vet").sort("is_misspelt", descending=True)

is_misspelt,name,practice
f64,str,str
0.468892,"""Byte Lyon""","""TechSavvy Vets"""
0.230999,"""Pixel Paws""","""CodePaws Veterinary Clinic"""
0.117647,"""Script Safari""","""Digital Wildlife Care"""
0.0,"""Debug Dino""","""Digital Wildlife Care"""
0.0,"""Zooey Codewell""","""TechSavvy Vets"""


Which day had the lowest bear-to-visitor ratio?

In [18]:
visitors = pl.read_parquet("data/visitors.parquet").sort("timestamp")
visitors

timestamp,visitors
datetime[μs],i16
2020-04-20 18:00:00,2057
2020-04-21 18:00:00,2264
2020-04-22 18:00:00,2388
2020-04-23 18:00:00,2041
2020-04-24 18:00:00,2038
…,…
2024-04-13 18:00:00,2545
2024-04-14 18:00:00,2665
2024-04-15 18:00:00,2392
2024-04-16 18:00:00,2292


In [19]:
visible_bears_per_day = batch_measurements.filter(vet_health_check="HEALTHY").group_by("timestamp").len(
    name="bears"
).sort("timestamp")
visible_bears_per_day

timestamp,bears
datetime[μs],u32
2020-04-20 06:15:00,2
2020-04-22 06:15:00,1
2020-04-30 06:15:00,1
2020-05-06 06:15:00,1
2020-05-08 06:15:00,1
…,…
2024-04-07 06:15:00,1
2024-04-09 06:15:00,3
2024-04-11 06:15:00,3
2024-04-13 06:15:00,1


In [20]:
visitors.join_asof(other=visible_bears_per_day, on="timestamp").with_columns(
    (pl.col("bears") / pl.col("visitors")).alias("ratio")
).sort("ratio")

timestamp,visitors,bears,ratio
datetime[μs],i16,u32,f64
2023-07-02 18:00:00,2799,1,0.000357
2021-01-24 18:00:00,2798,1,0.000357
2021-07-17 18:00:00,2795,1,0.000358
2023-10-21 18:00:00,2790,1,0.000358
2024-04-06 18:00:00,2790,1,0.000358
…,…,…,…
2024-02-08 18:00:00,2079,4,0.001924
2023-02-02 18:00:00,2040,4,0.001961
2023-01-12 18:00:00,2039,4,0.001962
2024-01-01 18:00:00,2022,4,0.001978
