## Group (window) operations

It's called `window function` in SQL

In [1]:
import polars as pl
import polars.selectors as cs
pl.Config.set_tbl_rows(8)

polars.config.Config

In [3]:
df = pl.DataFrame(
    {
        "id":["a","b","a","b"],
        "value":[0,1,2,3]
    }
)

df

id,value
str,i64
"""a""",0
"""b""",1
"""a""",2
"""b""",3


Add a column with the maximum `value` in each group where the groups are defined by the `id` column

In [4]:
df.with_columns(
    group_max = pl.col("value").max().over("id")
)

id,value,group_max
str,i64,i64
"""a""",0,2
"""b""",1,3
"""a""",2,2
"""b""",3,3


In [5]:
(
    df
    .with_columns(
        sum = pl.col("value").sum().over("id"),
        cum_sum = pl.col("value").cum_sum().over("id"),
    )
)

id,value,sum,cum_sum
str,i64,i64,i64
"""a""",0,2,0
"""b""",1,4,1
"""a""",2,2,2
"""b""",3,4,4


### Creating a `pl.List` from grouped values

In [6]:
df.with_columns(
    value_list = pl.col("value").over("id", mapping_strategy="join")
)

id,value,value_list
str,i64,list[i64]
"""a""",0,"[0, 2]"
"""b""",1,"[1, 3]"
"""a""",2,"[0, 2]"
"""b""",3,"[1, 3]"


This can be memory intensive if repeating long lists on many rows

## Multiple columns

In [8]:
df_mult = pl.DataFrame(
    {
        "id1":["a","b","a","b"],
        "id2":["x","x","x","y"],
        "value":[0,1,2,3]
    }
)

df_mult

id1,id2,value
str,str,i64
"""a""","""x""",0
"""b""","""x""",1
"""a""","""x""",2
"""b""","""y""",3


In [9]:
df_mult.with_columns(
    pl.col("value").max().over("id1", "id2")
)

id1,id2,value
str,str,i64
"""a""","""x""",2
"""b""","""x""",1
"""a""","""x""",2
"""b""","""y""",3


### Filling missing values by group

In [10]:
df_missing = pl.DataFrame(
    {
        "id":["a","b","a","b"],
        "value":[0,1,None,3]
    }
)

df_missing

id,value
str,i64
"""a""",0.0
"""b""",1.0
"""a""",
"""b""",3.0


In [11]:
df_missing.with_columns(
    filling_value = pl.col("value").fill_null(strategy="forward").over("id")
)

id,value,filling_value
str,i64,i64
"""a""",0.0,0
"""b""",1.0,1
"""a""",,0
"""b""",3.0,3


### Filtering by group

In [12]:
df_missing.filter(
    pl.col("value") == pl.col("value").max().over("id")
)

id,value
str,i64
"""a""",0
"""b""",3


### Group operations in lazy mode

When doing the same `over` multiple times in lazy mode, Polars caches the result of the first `group by` for reuse.

However, this is advantage won't show in the optimized plan.

In [14]:
print(df_missing.lazy()\
.with_columns(
    group_max = pl.col("value").max().over("id"),
    group_min = pl.col("value").min().over("id"),
).explain())

 WITH_COLUMNS:
 [col("value").max().over([col("id")]).alias("group_max"), col("value").min().over([col("id")]).alias("group_min")] 
  DF ["id", "value"]; PROJECT */2 COLUMNS


## Exercises

### Exercise 1
We want to calculate the *z-score* of the `Age` column normalized by passenger class.

Add a new column `Age_mean` with the mean of the `Age` column for passengers by class

In [16]:
csv_file = "data/titanic.csv"

df = pl.read_csv(csv_file)

df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [18]:
df.with_columns(
    Age_mean = pl.col("Age").mean().over("Pclass")
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",25.14062
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",38.233441
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",25.14062


Continue by replacing the `null` values in the `Age` column with the `mean` age for passengers in that class

In [20]:
df.with_columns(
    Age_mean = pl.col("Age").mean().over("Pclass")
).with_columns(
    Age = pl.col("Age").fill_null(pl.col("Age").mean().over("Pclass"))
).select(
    "Pclass", cs.starts_with("Age")
).head(6)

Pclass,Age,Age_mean
i64,f64,f64
3,22.0,25.14062
1,38.0,38.233441
3,26.0,25.14062
1,35.0,38.233441
3,35.0,25.14062
3,25.14062,25.14062


Replace `Age_mean` with a new column called `Age_delta` that is the difference between the age and the average age of all passengers in the same class

In [22]:
df.with_columns(
    Age = pl.col("Age").fill_null(pl.col("Age").mean().over("Pclass"))
).with_columns(
    Age_delta = pl.col("Age") - pl.col("Age").mean().over("Pclass")
).select(
    "Pclass", cs.starts_with("Age")
).head(6)

Pclass,Age,Age_delta
i64,f64,f64
3,22.0,-3.14062
1,38.0,-0.233441
3,26.0,0.85938
1,35.0,-3.233441
3,35.0,9.85938
3,25.14062,0.0


Continue by adding another column called `Age_z` that has the z-score for the `Age`.

z-score is the (age - average age of the passengers in that class) divided by the standard deviation of the age column for passengers in that class

In [23]:
df.with_columns(
    Age = pl.col("Age").fill_null(pl.col("Age").mean().over("Pclass"))
).with_columns(
    Age_delta = pl.col("Age") - pl.col("Age").mean().over("Pclass")
).with_columns(
    Age_z = pl.col("Age_delta") - pl.col("Age").std().over("Pclass")
).select(
    "Pclass", cs.starts_with("Age")
).head(6)

Pclass,Age,Age_delta,Age_z
i64,f64,f64,f64
3,22.0,-3.14062,-13.761333
1,38.0,-0.233441,-13.964756
3,26.0,0.85938,-9.761333
1,35.0,-3.233441,-16.964756
3,35.0,9.85938,-0.761333
3,25.14062,0.0,-10.620713


### Exercise 2

Count the number of passengers in each group of passenger class and survival. 

Name the column of counts `counts`

In [24]:
df.group_by(["Pclass", "Survived"]).agg(
    pl.col("Name").count().alias("counts")
)

Pclass,Survived,counts
i64,i64,u32
1,0,80
2,0,97
2,1,87
3,1,119
3,0,372
1,1,136


Continue by calculating the percentage breakdown of passenger survival within each passenger class group. 

Call this column `percent`.

Sort the output by passenger class and survival

In [25]:
df.group_by(["Pclass", "Survived"]).agg(
    pl.col("Name").count().alias("counts")
)\
.with_columns(
    ((pl.col("counts") / pl.col("counts").sum().over("Pclass")) * 100).round(3).alias("percent")
).sort("Pclass", "Survived")

Pclass,Survived,counts,percent
i64,i64,u32,f64
1,0,80,37.037
1,1,136,62.963
2,0,97,52.717
2,1,87,47.283
3,0,372,75.764
3,1,119,24.236


Filter the `DataFrame` to find the passengers with the longest name in each class and sort by class

In [26]:
df.filter(
    pl.col("Name").str.len_chars() == pl.col("Name").str.len_chars().max().over("Pclass")
).sort("Pclass")

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
308,1,1,"""Penasco y Castellana, Mrs. Vic…","""female""",17.0,1,0,"""PC 17758""",108.9,"""C65""","""C"""
428,1,2,"""Phillips, Miss. Kate Florence …","""female""",19.0,0,0,"""250655""",26.0,,"""S"""
26,1,3,"""Asplund, Mrs. Carl Oscar (Selm…","""female""",38.0,1,5,"""347077""",31.3875,,"""S"""
611,0,3,"""Andersson, Mrs. Anders Johan (…","""female""",39.0,1,5,"""347082""",31.275,,"""S"""


Calculate the median age of passengers by `Pclass` and `Sex` using `group_by.agg`

In [27]:
df.group_by("Pclass", "Sex", maintain_order=True).agg(
    pl.col("Age").median()
)

Pclass,Sex,Age
i64,str,f64
3,"""male""",25.0
1,"""female""",35.0
3,"""female""",21.5
1,"""male""",40.0
2,"""female""",28.0
2,"""male""",30.0


Fill `nulls` in the `Age` column by the median by `Pclass` and `Sex` in a new column called `Age_filled`. 

Filter by `nulls` in the `Age` column to confirm that the filled values correspond to the `group_by` above

In [31]:
df.with_columns(
    Age_filled = pl.col("Age").fill_null(
        pl.col("Age").median().over("Pclass", "Sex")
    )
).filter(
    pl.col("Age").is_null()
).select(
    "Pclass", "Sex", "Age", "Age_filled"
)

Pclass,Sex,Age,Age_filled
i64,str,f64,f64
3,"""male""",,25.0
2,"""male""",,30.0
3,"""female""",,21.5
3,"""male""",,25.0
…,…,…,…
3,"""female""",,21.5
3,"""male""",,25.0
3,"""male""",,25.0
3,"""female""",,21.5
