# Select Rows - with Filter and Expression API

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV; reused by both the eager and the lazy readers below.
csv_file = "data/titanic.csv"

In [3]:
# Read the whole CSV eagerly into a DataFrame and preview its first three rows.
df = pl.read_csv(source=csv_file)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Selecting rows with a condition using `filter`

In [None]:
# Keep only the rows where Pclass is 1, then show two columns of the first matches.
(
    df
    .filter(pl.col("Pclass") == 1)  # the row condition goes inside `filter`
    .select("PassengerId", "Pclass")
    .head(3)
)

PassengerId,Pclass
i64,i64
2,1
4,1
7,1


### Mathematical Operator

In [6]:
# A comparison operator applied to an expression produces the row condition.
(
    df
    .filter(pl.col("Parch") > 1)
    .select("PassengerId", "Parch", "SibSp")
    .head(3)
)

PassengerId,Parch,SibSp
i64,i64,i64
9,2,0
14,5,1
26,5,1


### Text Operator

In [9]:
# Comparisons are also available as named methods: `gt` stands for "greater than".
# Here one column is compared against another column rather than a constant.
(
    df
    .filter(pl.col("Parch").gt(pl.col("SibSp")))
    .select("PassengerId", "Parch", "SibSp")
    .head(3)
)

PassengerId,Parch,SibSp
i64,i64,i64
9,2,0
14,5,1
26,5,1


### Keyword Filter

In [12]:
# Keyword form: the argument name is the column, the value is what it must equal.
(
    df
    .filter(Parch=3)
    .select("PassengerId", "Parch", "SibSp")
    .head(3)
)

PassengerId,Parch,SibSp
i64,i64,i64
87,3,1
438,3,2
737,3,1


## Filtering on a Boolean List

In [17]:
# Build a plain Python list of booleans, one entry per row of `df`.
# `fc == 1` already evaluates to True/False, so no conditional expression is needed.
is_first_class = [fc == 1 for fc in df["Pclass"]]
is_first_class[:3]  # peek at the first three flags

[False, True, False]

In [18]:
# The list of booleans is passed straight to `filter` as a row mask.
df.filter(is_first_class).head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
12,1,1,"""Bonnell, Miss. Elizabeth""","""female""",58.0,0,0,"""113783""",26.55,"""C103""","""S"""
24,1,1,"""Sloper, Mr. William Thompson""","""male""",28.0,0,0,"""113788""",35.5,"""A6""","""S"""


## Filtering on a Boolean Column

In [19]:
# Store the condition as a Boolean column first, then filter on that column directly.
(
    df
    .with_columns(less_than_30_boolean=pl.col("Age") < 30)
    .filter(pl.col("less_than_30_boolean"))
    .select("Pclass", "Name", "Age")
    .head(4)
)

Pclass,Name,Age
i64,str,f64
3,"""Braund, Mr. Owen Harris""",22.0
3,"""Heikkinen, Miss. Laina""",26.0
3,"""Palsson, Master. Gosta Leonard""",2.0
3,"""Johnson, Mrs. Oscar W (Elisabe…",27.0


## Negation of a Condition

In [None]:
# Same Boolean column as before, but inverted so the filter keeps the other rows.
(
    df
    .with_columns(less_than_30_boolean=pl.col("Age") < 30)
    .filter(~pl.col("less_than_30_boolean"))  # `~` is the logical NOT operator
    .select("Pclass", "Name", "Age")
    .head(4)
)

Pclass,Name,Age
i64,str,f64
1,"""Cumings, Mrs. John Bradley (Fl…",38.0
1,"""Futrelle, Mrs. Jacques Heath (…",35.0
3,"""Allen, Mr. William Henry""",35.0
1,"""McCarthy, Mr. Timothy J""",54.0


## Partitioning a `DataFrame`

In [None]:
# Split `df` into one sub-DataFrame per distinct `Pclass` value.
# With `as_dict=True` the result is a dict, and each key is a tuple.
df_pclass_dict = df.partition_by(by="Pclass", as_dict=True)

In [27]:
# Inspect the partition keys: one tuple per distinct value of the partition column.
df_pclass_dict.keys()

dict_keys([(3,), (1,), (2,)])

In [28]:
# Look up the partition for Pclass == 1; the key is the one-element tuple `(1,)`, not the bare integer.
df_pclass_dict[(1,)].head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


## Filter in Lazy Mode

In [None]:
# Build the filter/select chain on top of `scan_csv` rather than an already-loaded DataFrame.
(
    pl.scan_csv(csv_file)
    .filter(pl.col("Age") > 30)
    .select("Pclass", "Name", "Age")
)

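`.explain()` shows the optimized query plan. Note that the `filter` is applied as part of the CSV scan itself (the `SELECTION` line) rather than as a separate step after the file has been read.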

In [31]:
# `explain()` returns the plan as text; `print` renders its line breaks properly.
print(
    pl.scan_csv(csv_file)
    .filter(pl.col("Age") > 30)
    .select("Pclass", "Name", "Age")
    .explain()
)

Csv SCAN [data/titanic.csv]
PROJECT 3/12 COLUMNS
SELECTION: [(col("Age")) > (30.0)]
ESTIMATED ROWS: 971


In [33]:
# Same query, but ask for the plan as produced for the streaming engine.
print(
    pl.scan_csv(csv_file)
    .filter(pl.col("Age") > 30)
    .select("Pclass", "Name", "Age")
    .explain(engine="streaming")
)

Csv SCAN [data/titanic.csv]
PROJECT 3/12 COLUMNS
SELECTION: [(col("Age")) > (30.0)]
ESTIMATED ROWS: 971


# Exercises

### Exercise 1 
Select all rows where `Age` is greater than 30

In [37]:
# Use an expression (`pl.col`) rather than the eager Series mask `df["Age"] > 30`:
# it matches the rest of the notebook and does not refer back to `df` by name,
# so the same condition can be reused on another DataFrame or in a lazy query.
(
    df
    .filter(pl.col("Age") > 30)
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


Select all rows where `Embarked` is equal to "C" - using the keyword approach

In [39]:
# Keyword approach: column name on the left, required value on the right.
(
    df
    .filter(Embarked="C")
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
10,1,2,"""Nasser, Mrs. Nicholas (Adele A…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""
20,1,3,"""Masselmani, Mrs. Fatima""","""female""",,0,0,"""2649""",7.225,,"""C"""


Select all rows where `Embarked` is equal to "C" - use `pl.col` with the text operator rather than the mathematical operator this time

In [40]:
# Same condition written with the named method `eq` instead of `==`.
(
    df
    .filter(pl.col("Embarked").eq("C"))
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
10,1,2,"""Nasser, Mrs. Nicholas (Adele A…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""
20,1,3,"""Masselmani, Mrs. Fatima""","""female""",,0,0,"""2649""",7.225,,"""C"""


Select all rows where `Embarked` is **not** equal to "C" 

In [41]:
# Negate the equality test with `~` to keep the rows where Embarked is not "C".
(
    df
    .filter(~pl.col("Embarked").eq("C"))
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


### Exercise 2 

First add a row number column

In [43]:
# Add a row-number column called `row_nr` as the first column.
df.with_row_index(name="row_nr")

row_nr,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
2,3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
3,4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
4,5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…,…
886,887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
887,888,1,1,"""Graham, Miss. Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S"""
888,889,0,3,"""Johnston, Miss. Catherine Hele…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S"""
889,890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""


Continue by selecting the first 5 rows using `filter` on the row number column

In [48]:
# The row numbers start at 0, so `row_nr < 5` keeps exactly the first five rows.
(
    df
    .with_row_index("row_nr")
    .filter(pl.col("row_nr") < 5)
)

row_nr,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
2,3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
3,4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
4,5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


### Exercise 3
Partition the `DataFrame` by the `Survived` and `Pclass` columns as a `dict`

[API docs](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.partition_by.html#polars.DataFrame.partition_by)

In [51]:
# Partition on two columns at once; each dict key is a (Survived, Pclass) tuple.
survived_pclass_dict = df.partition_by(by=["Survived", "Pclass"], as_dict=True)

# Show which (Survived, Pclass) combinations occur in the data.
survived_pclass_dict.keys()

dict_keys([(0, 3), (1, 1), (1, 3), (0, 1), (1, 2), (0, 2)])

Return the sub-`DataFrame` with the passengers who did not survive from the third class

In [52]:
# Key order follows the partition columns: (Survived, Pclass), so (0, 3) is Survived == 0 and Pclass == 3.
survived_pclass_dict[(0, 3)].head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


### Exercise 4
In this exercise we load data from the Spotify charts

In [35]:
# Load the Spotify charts; the `.csv.gz` path is handed to `read_csv` directly.
spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"
spotify_df = pl.read_csv(source=spotify_csv)

# Preview the first rows to see the available columns.
spotify_df.head()

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""Starboy""",1,"""2017-01-01""","""The Weeknd, Daft Punk""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,"""2017-01-01""","""The Chainsmokers, Halsey""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,"""2017-01-01""","""DJ Snake, Justin Bieber""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",2545384
"""Rockabye (feat. Sean Paul & An…",4,"""2017-01-01""","""Clean Bandit""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_DOWN""",2356604
"""One Dance""",5,"""2017-01-01""","""Drake, WizKid, Kyla""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",2259887


Filter the `DataFrame` to find all rows with artist Post Malone

In [54]:
# Keyword filter: keeps the rows whose `artist` value equals "Post Malone".
spotify_df.filter(artist="Post Malone")

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""White Iverson""",196,"""2017-01-01""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",332756
"""White Iverson""",188,"""2017-01-02""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",343936
"""Psycho (feat. Ty Dolla $ign)""",2,"""2018-03-01""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",5999224
"""I Fall Apart""",22,"""2018-03-01""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",2003396
"""Candy Paint""",64,"""2018-03-01""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",1065141
…,…,…,…,…,…,…,…,…
"""White Iverson""",184,"""2018-01-30""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",590653
"""I Fall Apart""",18,"""2018-01-31""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",2034143
"""Candy Paint""",55,"""2018-01-31""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",1247324
"""Go Flex""",140,"""2018-01-31""","""Post Malone""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",704766
