# Group iteration and aggregations

In [1]:
import polars as pl
import polars.selectors as cs

In [2]:
# Relative path to the Titanic passenger data used throughout the notebook
csv_file = "data/titanic.csv"

In [3]:
# Load the passenger data eagerly and preview the first three rows
df = pl.read_csv(source=csv_file)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Iterating over groups

The group key is a `tuple` even when we are only grouping by one column.

In [4]:
# Iterating over a GroupBy yields (key_tuple, sub-DataFrame) pairs. The
# sub-DataFrame holds every column of `df`, including the group key itself.
for group_key, group_df in df.group_by("Pclass"):
    (pclass,) = group_key  # one grouping column -> one-element tuple
    print(f"Pclass: {pclass}")
    print(group_df.mean())

Pclass: 1
shape: (1, 12)
┌─────────────┬──────────┬────────┬──────┬───┬────────┬───────────┬───────┬──────────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare      ┆ Cabin ┆ Embarked │
│ ---         ┆ ---      ┆ ---    ┆ ---  ┆   ┆ ---    ┆ ---       ┆ ---   ┆ ---      │
│ f64         ┆ f64      ┆ f64    ┆ str  ┆   ┆ str    ┆ f64       ┆ str   ┆ str      │
╞═════════════╪══════════╪════════╪══════╪═══╪════════╪═══════════╪═══════╪══════════╡
│ 461.597222  ┆ 0.62963  ┆ 1.0    ┆ null ┆ … ┆ null   ┆ 84.154688 ┆ null  ┆ null     │
└─────────────┴──────────┴────────┴──────┴───┴────────┴───────────┴───────┴──────────┘
Pclass: 2
shape: (1, 12)
┌─────────────┬──────────┬────────┬──────┬───┬────────┬───────────┬───────┬──────────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare      ┆ Cabin ┆ Embarked │
│ ---         ┆ ---      ┆ ---    ┆ ---  ┆   ┆ ---    ┆ ---       ┆ ---   ┆ ---      │
│ f64         ┆ f64      ┆ f64    ┆ str  ┆   ┆ str    ┆ f64       ┆ str   ┆ str 

When grouping by multiple columns, the group key `tuple` naturally extends to hold one element per grouping column.

In [5]:
# With several grouping columns the key tuple has one entry per column,
# in the order the columns were passed to group_by.
for group_key, group_df in df.group_by("Pclass", "Survived"):
    pclass, survived = group_key
    print(f"Pclass: {pclass}, Survived: {survived}")
    print(group_df.mean())

Pclass: 1, Survived: 0
shape: (1, 12)
┌─────────────┬──────────┬────────┬──────┬───┬────────┬───────────┬───────┬──────────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare      ┆ Cabin ┆ Embarked │
│ ---         ┆ ---      ┆ ---    ┆ ---  ┆   ┆ ---    ┆ ---       ┆ ---   ┆ ---      │
│ f64         ┆ f64      ┆ f64    ┆ str  ┆   ┆ str    ┆ f64       ┆ str   ┆ str      │
╞═════════════╪══════════╪════════╪══════╪═══╪════════╪═══════════╪═══════╪══════════╡
│ 410.3       ┆ 0.0      ┆ 1.0    ┆ null ┆ … ┆ null   ┆ 64.684008 ┆ null  ┆ null     │
└─────────────┴──────────┴────────┴──────┴───┴────────┴───────────┴───────┴──────────┘
Pclass: 2, Survived: 1
shape: (1, 12)
┌─────────────┬──────────┬────────┬──────┬───┬────────┬─────────┬───────┬──────────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name ┆ … ┆ Ticket ┆ Fare    ┆ Cabin ┆ Embarked │
│ ---         ┆ ---      ┆ ---    ┆ ---  ┆   ┆ ---    ┆ ---     ┆ ---   ┆ ---      │
│ f64         ┆ f64      ┆ f64    ┆ str  ┆   ┆ str    ┆ f64 

## Group values
We use `head` to get the first rows in each group.

> **Note** The number of rows returned depends on how many groups we have.

In [6]:
# First two rows of every group; the group key column comes first in the result
(
    df
    .group_by("Pclass")
    .head(n=2)
)

Pclass,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,10,1,"""Nasser, Mrs. Nicholas (Adele A…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""
2,16,1,"""Hewlett, Mrs. (Mary D Kingcome…","""female""",55.0,0,0,"""248706""",16.0,,"""S"""
1,2,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
1,4,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
3,1,0,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,3,1,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Calling aggregations directly on `group_by`

In [7]:
# Number of rows (passengers) in each group
(
    df
    .group_by("Pclass")
    .len()
)

Pclass,len
i64,u32
1,216
3,491
2,184


The methods we can call on `GroupBy` include:
 - `first` get the first element of each group
 - `last` get the last element of each group
 - `n_unique` get the number of unique elements in each group
 - `len` get the number of rows in each group (`count` is the older, deprecated name for this method)
 - `sum` sum the elements in each group
 - `min` get the smallest element in each group
 - `max` get the largest element in each group
 - `mean` get the average of elements in each group
 - `median` get the median in each group
 - `quantile` calculate quantiles in each group

We can also call aggregations on a lazy group (the result of `group_by` on a `LazyFrame`), though not all of the above are supported.
 

## Multiple aggregations on the same columns

In [16]:
group_column = "Pclass"

# Per-group minimum and maximum of every Float64 column. The suffixes keep
# the two aggregations of the same column apart; the final step orders the
# aggregated columns alphabetically behind the group key.
(
    df
    .group_by(group_column)
    .agg(
        pl.col(pl.Float64).min().name.suffix("_min"),
        pl.col(pl.Float64).max().name.suffix("_max"),
    )
    .pipe(
        lambda agg_df: agg_df.select(
            [group_column, *sorted(agg_df.columns[1:])]
        )
    )
)

Pclass,Age_max,Age_min,Fare_max,Fare_min
i64,f64,f64,f64,f64
3,74.0,0.42,69.55,0.0
1,80.0,0.92,512.3292,0.0
2,70.0,0.67,73.5,0.0


The same aggregation using the selectors API (`polars.selectors`, imported as `cs`)

In [18]:
group_column = "Pclass"

# Same min/max aggregation, with the float columns picked by `cs.float()`
# rather than by an explicit dtype.
(
    df
    .group_by(group_column)
    .agg(
        cs.float().min().name.suffix("_min"),
        cs.float().max().name.suffix("_max"),
    )
    .pipe(
        lambda agg_df: agg_df.select(
            [group_column, *sorted(agg_df.columns[1:])]
        )
    )
)

Pclass,Age_max,Age_min,Fare_max,Fare_min
i64,f64,f64,f64,f64
1,80.0,0.92,512.3292,0.0
2,70.0,0.67,73.5,0.0
3,74.0,0.42,69.55,0.0


## User-defined functions on groups
We can define user-defined functions on groups with `map_groups`.

In [20]:
# The function receives each group's sub-DataFrame. `max` is evaluated column
# by column, so one result row combines values from different passengers.
(
    df
    .group_by("Pclass")
    .map_groups(lambda class_df: class_df.max())
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
887,1,2,"""del Carlo, Mr. Sebastiano""","""male""",70.0,3,3,"""W/C 14208""",73.5,"""F4""","""S"""
891,1,3,"""van Melkebeke, Mr. Philemon""","""male""",74.0,8,6,"""W./C. 6609""",69.55,"""G6""","""S"""
890,1,1,"""Young, Miss. Marie Grice""","""male""",80.0,3,4,"""WE/P 5735""",512.3292,"""T""","""S"""


In [23]:
# Keep only the Float64 columns and the first two rows of each group.
# The group key is not among the selected columns, so it is absent from the result.
(
    df
    .group_by("Pclass")
    .map_groups(
        lambda class_df: class_df.select(pl.col(pl.Float64)).head(2)
    )
)

Age,Fare
f64,f64
38.0,71.2833
35.0,53.1
22.0,7.25
26.0,7.925
14.0,30.0708
55.0,16.0


> A UDF in Polars prevents the query optimizer from optimizing the column selection, so prefer the built-in Polars functions whenever possible.

## Exercises

### Exercise 1
Group by the `Pclass` column. 

Count the number of passengers in each group without using `agg`

In [28]:
# Passenger count per class, straight from the GroupBy object (no `agg`)
(
    df
    .group_by("Pclass")
    .len()
)

Pclass,len
i64,u32
1,216
3,491
2,184


Add a column called `percent` with the percentage of the total passengers in each group

In [33]:
# Each class's share of all passengers, in percent:
# group size divided by the sum of all group sizes.
(
    df
    .group_by("Pclass")
    .len()
    .with_columns(
        percent=pl.col("len") / pl.col("len").sum() * 100
    )
)

Pclass,len,percent
i64,u32,f64
2,184,20.650954
3,491,55.106622
1,216,24.242424


Create a bar chart of the `percent` column with the title `"% per class"`

In [39]:
# Bar chart of each class's share of passengers.
# Pclass is cast to a string first, presumably so the chart treats it as a
# discrete category rather than a number (confirm against the rendered plot).
(
    df
    .group_by("Pclass")
    .len()
    .with_columns(
        pl.col("Pclass").cast(pl.Utf8),
        percent=pl.col("len") / pl.col("len").sum() * 100,
    )
    .plot
    .bar(x="Pclass", y="percent", color="Pclass")
    .properties(title="% per class", width=500)
)

### Exercise 2
Create a `DataFrame` from the Spotify data

In [40]:
# Display settings for the wide Spotify table
pl.Config.set_tbl_rows(10)
pl.Config.set_fmt_str_lengths(100)

spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"
# try_parse_dates lets Polars read the `date` column as a date rather than a string
spotify_df = pl.read_csv(source=spotify_csv, try_parse_dates=True)
spotify_df.head(n=3)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,date,str,str,str,str,str,i64
"""Starboy""",1,2017-01-01,"""The Weeknd, Daft Punk""","""https://open.spotify.com/track/5aAx2yezTd8zXrkmtKl66Z""","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,2017-01-01,"""The Chainsmokers, Halsey""","""https://open.spotify.com/track/7BKLCZ1jbUBVqRi2FVlTVw""","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,2017-01-01,"""DJ Snake, Justin Bieber""","""https://open.spotify.com/track/4pdPtRcBmOSQDlJ3Fk945m""","""Global""","""top200""","""MOVE_UP""",2545384


We want to inspect some data for the top-streaming artists by printing it out:
- filter `spotify_df` to include only rows that had more than 10 million streams
- `group_by` the `artist` column
- ensure the order of the output is the same each time
- print the `artist` key
- print the sub-`DataFrame`

In [41]:
# Rows with more than 10 million streams, grouped by artist.
# maintain_order=True keeps the group order identical between runs.
streams_over_10m = spotify_df.filter(pl.col("streams") > 10_000_000)
for group_key, artist_df in streams_over_10m.group_by("artist", maintain_order=True):
    (artist,) = group_key
    print(artist)
    print(artist_df)

XXXTENTACION
shape: (1, 9)
┌───────┬──────┬────────────┬──────────────┬───┬────────┬────────┬─────────┬──────────┐
│ title ┆ rank ┆ date       ┆ artist       ┆ … ┆ region ┆ chart  ┆ trend   ┆ streams  │
│ ---   ┆ ---  ┆ ---        ┆ ---          ┆   ┆ ---    ┆ ---    ┆ ---     ┆ ---      │
│ str   ┆ i64  ┆ date       ┆ str          ┆   ┆ str    ┆ str    ┆ str     ┆ i64      │
╞═══════╪══════╪════════════╪══════════════╪═══╪════════╪════════╪═════════╪══════════╡
│ SAD!  ┆ 1    ┆ 2018-06-19 ┆ XXXTENTACION ┆ … ┆ Global ┆ top200 ┆ MOVE_UP ┆ 10415088 │
└───────┴──────┴────────────┴──────────────┴───┴────────┴────────┴─────────┴──────────┘
Ariana Grande
shape: (2, 9)
┌──────────────────┬──────┬────────────┬────────┬───┬────────┬────────┬─────────────────┬──────────┐
│ title            ┆ rank ┆ date       ┆ artist ┆ … ┆ region ┆ chart  ┆ trend           ┆ streams  │
│ ---              ┆ ---  ┆ ---        ┆ ---    ┆   ┆ ---    ┆ ---    ┆ ---             ┆ ---      │
│ str              ┆ i64  

Repeat this exercise, but this time group by the `artist` and `title` columns and print the artist and title for each group

In [42]:
# Same filter as before, now with one group per (artist, title) pair.
# maintain_order=True keeps the group order identical between runs.
streams_over_10m = spotify_df.filter(pl.col("streams") > 10_000_000)
for group_key, artist_df in streams_over_10m.group_by(
    "artist", "title", maintain_order=True
):
    artist, title = group_key
    print(artist, title)
    print(artist_df)

XXXTENTACION SAD!
shape: (1, 9)
┌───────┬──────┬────────────┬──────────────┬───┬────────┬────────┬─────────┬──────────┐
│ title ┆ rank ┆ date       ┆ artist       ┆ … ┆ region ┆ chart  ┆ trend   ┆ streams  │
│ ---   ┆ ---  ┆ ---        ┆ ---          ┆   ┆ ---    ┆ ---    ┆ ---     ┆ ---      │
│ str   ┆ i64  ┆ date       ┆ str          ┆   ┆ str    ┆ str    ┆ str     ┆ i64      │
╞═══════╪══════╪════════════╪══════════════╪═══╪════════╪════════╪═════════╪══════════╡
│ SAD!  ┆ 1    ┆ 2018-06-19 ┆ XXXTENTACION ┆ … ┆ Global ┆ top200 ┆ MOVE_UP ┆ 10415088 │
└───────┴──────┴────────────┴──────────────┴───┴────────┴────────┴─────────┴──────────┘
Ariana Grande 7 rings
shape: (1, 9)
┌─────────┬──────┬────────────┬───────────────┬───┬────────┬────────┬───────────────┬──────────┐
│ title   ┆ rank ┆ date       ┆ artist        ┆ … ┆ region ┆ chart  ┆ trend         ┆ streams  │
│ ---     ┆ ---  ┆ ---        ┆ ---           ┆   ┆ ---    ┆ ---    ┆ ---           ┆ ---      │
│ str     ┆ i64  ┆ date  

Find the total number of streams by artist for tracks that are number 1 in the charts. 

Divide the number of streams by 1 million to make it easier to read and sort from high to low

In [50]:
# Total streams (in millions) collected while at rank 1, one row per
# (artist, title) pair, largest total first.
(
    spotify_df
    .filter(pl.col("rank") == 1)
    .group_by("artist", "title")
    .agg(streams=pl.col("streams").sum() / 1_000_000)
    .sort("streams", descending=True)
)

artist,title,streams
str,str,f64
"""Tones And I""","""Dance Monkey""",774.484772
"""Shawn Mendes, Camila Cabello""","""Señorita""",742.596967
"""Ed Sheeran""","""Shape of You""",702.18693
"""The Kid LAROI""","""STAY (with Justin Bieber)""",677.106038
"""Post Malone, 21 Savage""","""rockstar""",621.760044
…,…,…
"""Lil Wayne""","""Mona Lisa (feat. Kendrick Lamar)""",5.808198
"""Eminem""","""The Ringer""",5.18659
"""Billie Eilish""","""my future""",5.183389
"""Post Malone""","""Wow.""",4.570136


Using one of the methods we can call directly on `group_by` find out how many distinct tracks each artist has. 

Sort the values from high to low

In [53]:
# n_unique counts the distinct values of every non-key column per artist;
# keep only the distinct-title count and sort it from high to low.
(
    spotify_df
    .group_by("artist")
    .n_unique()
    .select("artist", "title")
    .sort("title", descending=True)
)

artist,title
str,u32
"""Taylor Swift""",128
"""Drake""",95
"""BTS""",76
"""Juice WRLD""",72
"""Eminem""",63
…,…
"""YTB Trench, Young Stoner Life, Young Thug, Gunna""",1
"""Chance the Rapper, TisaKorean, Murda Beatz""",1
"""Lil Kleine, Boef""",1
"""Drake, Static Major, Ty Dolla $ign""",1
