# Group by and aggregations

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV that the cells below read with Polars
csv_file = "data/titanic.csv"

In [3]:
# Read the whole CSV eagerly, then preview the first three rows
df = pl.read_csv(source=csv_file)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Group-by and aggregation
In Polars we can group by a column and aggregate the data in other columns with the `group_by.agg` combination.

In [None]:
# Mean fare within each passenger class
(
    df
    .group_by("Pclass")
    .agg(pl.col("Fare").mean())
)

Pclass,Fare
i64,f64
2,20.662183
3,13.67555
1,84.154687


What if we pass an expression that does not reduce each group to a single value, such as `head`?

In [5]:
# `head(2)` keeps the first two fares of every group instead of reducing them to one value
df.group_by("Pclass").agg(pl.col("Fare").head(2))

Pclass,Fare
i64,list[f64]
2,"[30.0708, 16.0]"
1,"[71.2833, 53.1]"
3,"[7.25, 7.925]"


The output column then has a `pl.List` dtype, with one list of values per group.

## Grouping by multiple columns

In [6]:
# Passing a list of column names groups by every combination of their values
(
    df
    .group_by(["Pclass", "Survived"])
    .agg(pl.col("Fare").mean())
)

Pclass,Survived,Fare
i64,i64,f64
3,0,13.669364
3,1,13.694887
1,0,64.684007
1,1,95.608029
2,1,22.0557
2,0,19.412328


Use expression inside `group_by`

In [7]:
# The grouping key is computed on the fly: Age cast to a 64-bit integer
(
    df
    .group_by(pl.col("Age").cast(pl.Int64))
    .agg(pl.col("Fare").mean())
    .head()
)

Age,Fare
i64,f64
24,41.907119
51,28.752386
8,28.3
66,10.5
28,20.327156


## Ordering of the output

Force the order of the output with the `maintain_order` argument

In [8]:
# maintain_order=True returns the groups in the order they first appear in `df`
(
    df
    .group_by("Pclass", maintain_order=True)
    .agg(pl.col("Fare").mean())
)

Pclass,Fare
i64,f64
3,13.67555
1,84.154687
2,20.662183


The first row is group `3` because the first row of `df` has `Pclass` equal to `3`, and so on.

Setting `maintain_order=True` will affect performance to some extent. 

We can't use the streaming engine for large datasets when `maintain_order=True` either.

## Group by in lazy mode

In [9]:
# Build the query lazily and print its query plan rather than executing it
print(
    pl.scan_csv(csv_file).group_by("Pclass").agg(pl.col("Fare").mean()).explain()
)

AGGREGATE[maintain_order: false]
  [col("Fare").mean()] BY [col("Pclass")]
  FROM
  Csv SCAN [data/titanic.csv]
  PROJECT 2/12 COLUMNS
  ESTIMATED ROWS: 971


### Streaming group_by on large datasets
If `maintain_order=True` then `group_by` cannot be run for large datasets in streaming mode.


In [13]:
# Same lazy query with maintain_order=True; ask for the plan under the streaming engine
print(
    pl.scan_csv(csv_file)
    .group_by("Pclass", maintain_order=True)
    .agg(pl.col("Fare").mean())
    .explain(engine="streaming")
)

AGGREGATE[maintain_order: true]
  [col("Fare").mean()] BY [col("Pclass")]
  FROM
  Csv SCAN [data/titanic.csv]
  PROJECT 2/12 COLUMNS
  ESTIMATED ROWS: 971


## Group_by on a sorted column

Polars can use a fast-track algorithm for `group_by`, and so run faster, when it knows the column is sorted.

## Group by on a list
We can `group_by` on a list column just as for non-list columns.

In [16]:
# Three rows of string lists (the first and last are identical) plus a row index column
df_lists = pl.DataFrame(
    {"lists": [["a", "b"], ["a", "c"], ["a", "b"]]}
).with_row_index()
df_lists

index,lists
u32,list[str]
0,"[""a"", ""b""]"
1,"[""a"", ""c""]"
2,"[""a"", ""b""]"


`group_by` and count the number of occurrences of each list

In [17]:
# Number of rows that share each distinct list value
(
    df_lists
    .group_by("lists")
    .len()
)

lists,len
list[str],u32
"[""a"", ""c""]",1
"[""a"", ""b""]",2


## Exercises

### Exercise 1
Group by the `Pclass` and `Survived` columns and count the number of passengers in each group. 

Ensure the order is the same as the input order

In [24]:
# Re-read the Titanic CSV into `df` for the exercises
df = pl.read_csv(source=csv_file)

In [25]:
# Count the passengers in every (Pclass, Survived) group.
# `.len()` counts rows directly, so the count does not hinge on an arbitrary
# column such as "Age"; the result column is still named "len".
# maintain_order=True keeps the groups in the order they first appear in `df`.
(
    df
    .group_by("Pclass", "Survived", maintain_order=True)
    .len()
)

Pclass,Survived,len
i64,i64,u32
3,0,372
1,1,136
…,…,…
2,1,87
2,0,97


Did people with longer names pay more for their ticket?

Group by the number of characters in the `Name` column and get the average `Fare` for each name length

In [26]:
# Group on the character count of each name; the key column keeps the name "Name"
(
    df
    .group_by(pl.col("Name").str.len_chars())
    .agg(pl.col("Fare").mean())
)

Name,Fare
u32,f64
57,31.33125
24,34.256681
…,…
13,56.4958
49,64.40082


Make a scatter plot of the output with `plot.scatter`

In [34]:
# Scatter of mean fare against name length, coloured by name length
(
    df
    .group_by(pl.col("Name").str.len_chars())
    .agg(pl.col("Fare").mean())
    .plot
    .scatter(x="Name", y="Fare", color="Name")
    .properties(width=500)
)

### Exercise 2
We create a `DataFrame` from the Spotify data

In [18]:
# Location of the gzipped Spotify charts CSV
spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"

# Display settings: show up to 100 characters per string and up to 10 table rows
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(10)

# try_parse_dates lets Polars parse the `date` column as a date
spotify_df = pl.read_csv(source=spotify_csv, try_parse_dates=True)
spotify_df.head(n=3)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,date,str,str,str,str,str,i64
"""Starboy""",1,2017-01-01,"""The Weeknd, Daft Punk""","""https://open.spotify.com/track/5aAx2yezTd8zXrkmtKl66Z""","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,2017-01-01,"""The Chainsmokers, Halsey""","""https://open.spotify.com/track/7BKLCZ1jbUBVqRi2FVlTVw""","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,2017-01-01,"""DJ Snake, Justin Bieber""","""https://open.spotify.com/track/4pdPtRcBmOSQDlJ3Fk945m""","""Global""","""top200""","""MOVE_UP""",2545384


Format the floating point values so that large floating point numbers are separated by a comma (or your preferred thousand separator).

In [35]:
# Use a comma as the thousands separator when numbers are displayed
pl.Config.set_thousands_separator(separator=",")

polars.config.Config

Group by the `artist` and `title` columns and get the maximum of the other columns. 

Sort the output with the largest values of streams first

In [42]:
# Maximum of every non-key column per (artist, title), with the largest stream counts first
(
    spotify_df
    .group_by("artist", "title")
    .agg(pl.all().max())
    .sort("streams", descending=True)
)

artist,title,rank,date,url,region,chart,trend,streams
str,str,i64,date,str,str,str,str,i64
"""Adele""","""Easy On Me""",140,2021-12-20,"""https://open.spotify.com/track/46IZ0fSY2mpAiktS3KOqds""","""Global""","""top200""","""SAME_POSITION""",19749704
"""Mariah Carey""","""All I Want for Christmas Is You""",198,2021-12-20,"""https://open.spotify.com/track/0bYg9bo50gSsH3LtXe2SQn""","""Global""","""top200""","""SAME_POSITION""",17223237
…,…,…,…,…,…,…,…,…
"""Maroon 5""","""Sugar""",196,2017-01-02,"""https://open.spotify.com/track/494OU6M7NOf4ICYb4zWCf5""","""Global""","""top200""","""NEW_ENTRY""",335115
"""JAY-Z, Kanye West""","""Ni**as In Paris""",200,2017-01-01,"""https://open.spotify.com/track/2KpCpk6HjXXLb7nnXoXA5O""","""Global""","""top200""","""NEW_ENTRY""",325951


Group by the number of artists listed in `artist` column and then take the mean of the streams column. 

Sort by the number of artists

In [44]:
# The keyword argument names the key column: the number of comma-separated entries in `artist`
(
    spotify_df
    .group_by(number_of_artists=pl.col("artist").str.split(",").list.len())
    .agg(pl.col("streams").mean())
    .sort("number_of_artists")
)

number_of_artists,streams
u32,f64
1,1222099.790673
2,1203421.091801
…,…
10,808568.0
18,750212.0


Make a bar chart of the output

In [45]:
# Bar chart of mean streams for each number of listed artists
(
    spotify_df
    .group_by(number_of_artists=pl.col("artist").str.split(",").list.len())
    .agg(pl.col("streams").mean())
    .sort("number_of_artists")
    .plot
    .bar(x="number_of_artists", y="streams")
    .properties(width=500)
)

### Exercise 3

In [47]:
import numpy as np

# Show at most 4 rows when a DataFrame is displayed
pl.Config.set_tbl_rows(4)

# Seed NumPy's global generator so the generated data is reproducible
np.random.seed(0)
N = 10_000_000
cardinality = 10
# Create a sorted array of id integers in [0, cardinality)
sorted_array = np.sort(np.random.randint(0,cardinality,N))
# Pass the NumPy array straight to Polars: building a 10M-element Python list
# of NumPy scalars first only costs time and memory.
df = (
    pl.DataFrame(
        {
            "id": sorted_array,
            "values": np.random.standard_normal(N)
        }
    )
)
df.head(3)

id,values
i32,f64
0,1.451103
0,0.830491
0,0.82897


Time how long it takes to group by the `id` column and take the mean of the `values` column without any fast-track algorithm

In [48]:
%%timeit -n1 -r3

# Baseline: `df` carries no sortedness flag on `id`
(
    df
    .group_by("id")
    .agg(pl.col("values").mean())
)

51.7 ms ± 31.3 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


Create a new `DataFrame` called `df_sorts` where we tell Polars the `id` column is sorted

In [49]:
# Flag `id` as sorted; the ids were generated with np.sort, so the flag is truthful
df_sorts = df.with_columns(pl.col("id").set_sorted())

# Inspect the sortedness flags now attached to the column
df_sorts.get_column("id").flags

{'SORTED_ASC': True, 'SORTED_DESC': False}

Time how long it takes to group by the `id` column and take the mean of the `values` column **with** a fast-track algorithm

In [51]:
%%timeit -n1 -r3

# Same aggregation, now on the frame whose `id` column is flagged as sorted
(
    df_sorts
    .group_by("id")
    .agg(pl.col("values").mean())
)

14.5 ms ± 4.46 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


> **Note:** This confirms that Polars can run faster when it knows the column is sorted!