# `Filter` in Lazy Mode

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV that the Titanic examples and exercises read.
csv_file = "data/titanic.csv"

## Lazy Mode

In [5]:
# scan_csv sets up a lazy query over the file instead of loading it eagerly;
# the bare `df` on the last line displays that lazy frame.
df = pl.scan_csv(csv_file)
df

## Filter in Lazy Mode

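When a `filter` is applied in lazy mode, a `FILTER` line is added to the naive query plan. In the optimized plan printed below, the predicate is instead pushed down into the CSV scan, where it appears as a `SELECTION`.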

In [8]:
# Bind the filtered query to its own name instead of overwriting `df`:
# re-running this cell then never stacks the filter on top of itself, and the
# unfiltered lazy scan stays available under `df`.
df_age_over_30 = df.filter(pl.col("Age") > 30)

# explain() returns the query plan as a string; print() keeps its line breaks.
print(df_age_over_30.explain(optimized=True))

Csv SCAN [data/titanic.csv]
PROJECT */12 COLUMNS
SELECTION: [(col("Age")) > (30.0)]
ESTIMATED ROWS: 971


## Multiple Conditions

### Apply `AND` Conditions

Chaining multiple calls to `filter`.

In [None]:
# Two separate filter() calls on a lazy scan. The optimized plan printed
# below shows both predicates combined into a single SELECTION.
is_first_class = pl.col("Pclass") == 1
is_over_70 = pl.col("Age") > 70

df = (
    pl.scan_csv(csv_file)
    .filter(is_first_class)
    .filter(is_over_70)
)

print(df.explain(optimized=True))

Csv SCAN [data/titanic.csv]
PROJECT */12 COLUMNS
SELECTION: [([(col("Age")) > (70.0)]) & ([(col("Pclass")) == (1)])]
ESTIMATED ROWS: 971


In eager mode, chaining `filter` calls is inefficient because each call is executed immediately as its own pass over the data.

It is better to combine everything into a single condition.

In [12]:
# One combined predicate: `&` ANDs the two comparisons, and each comparison
# needs its own parentheses because `&` binds tighter than `>` and `==`.
over_70_in_first_class = (pl.col("Age") > 70) & (pl.col("Pclass") == 1)

pl.read_csv(csv_file).filter(over_70_in_first_class).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
97,0,1,"""Goldschmidt, Mr. George B""","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""


The less verbose way.

In [14]:
# Passing several predicates to one filter() call ANDs them together,
# so no explicit `&` is needed.
(
    pl.read_csv(csv_file)
    .filter(
        pl.col("Age") > 70,
        pl.col("Pclass") == 1,
    )
    .head(2)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
97,0,1,"""Goldschmidt, Mr. George B""","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""


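Keyword way: each `column=value` keyword argument is an equality constraint (here `Age == 70` and `Pclass == 1`).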

In [15]:
# Keyword arguments are shorthand for equality checks on the named columns:
# this keeps rows where Age equals 70 and Pclass equals 1.
(
    pl.read_csv(csv_file)
    .filter(Age=70, Pclass=1)
    .head(2)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
746,0,1,"""Crosby, Capt. Edward Gifford""","""male""",70.0,1,1,"""WE/P 5735""",71.0,"""B22""","""S"""


### Apply `pl.all_horizontal`

When we want to apply an `AND` condition on many columns.

In [17]:
# pl.all() selects every column; all_horizontal ANDs the per-column
# "is not null" flags across each row, keeping only rows with no nulls.
row_has_no_nulls = pl.all_horizontal(pl.all().is_not_null())

pl.read_csv(csv_file).filter(row_has_no_nulls)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
11,1,3,"""Sandstrom, Miss. Marguerite Ru…","""female""",4.0,1,1,"""PP 9549""",16.7,"""G6""","""S"""
12,1,1,"""Bonnell, Miss. Elizabeth""","""female""",58.0,0,0,"""113783""",26.55,"""C103""","""S"""
…,…,…,…,…,…,…,…,…,…,…,…
872,1,1,"""Beckwith, Mrs. Richard Leonard…","""female""",47.0,1,1,"""11751""",52.5542,"""D35""","""S"""
873,0,1,"""Carlsson, Mr. Frans Olof""","""male""",33.0,0,0,"""695""",5.0,"""B51 B53 B55""","""S"""
880,1,1,"""Potter, Mrs. Thomas Jr (Lily A…","""female""",56.0,0,1,"""11767""",83.1583,"""C50""","""C"""
888,1,1,"""Graham, Miss. Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S"""


### Apply `AND` condition on a range

Use `is_between(start, end, closed)`

`closed` defines which bounds of the interval are inclusive; as the output below shows, both bounds are included by default.

In [21]:
# Range check on Age with the default `closed` setting.
age_10_to_13 = pl.col("Age").is_between(10, 13)

pl.read_csv(csv_file).filter(age_10_to_13)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
60,0,3,"""Goodwin, Master. William Frede…","""male""",11.0,5,2,"""CA 2144""",46.9,,"""S"""
126,1,3,"""Nicola-Yarred, Master. Elias""","""male""",12.0,1,0,"""2651""",11.2417,,"""C"""
420,0,3,"""Van Impe, Miss. Catharina""","""female""",10.0,0,2,"""345773""",24.15,,"""S"""
447,1,2,"""Mellinger, Miss. Madeleine Vio…","""female""",13.0,0,1,"""250644""",19.5,,"""S"""
543,0,3,"""Andersson, Miss. Sigrid Elisab…","""female""",11.0,4,2,"""347082""",31.275,,"""S"""
732,0,3,"""Hassan, Mr. Houssein G N""","""male""",11.0,0,0,"""2699""",18.7875,,"""C"""
781,1,3,"""Ayoub, Miss. Banoura""","""female""",13.0,0,0,"""2687""",7.2292,,"""C"""
803,1,1,"""Carter, Master. William Thornt…","""male""",11.0,1,2,"""113760""",120.0,"""B96 B98""","""S"""
820,0,3,"""Skoog, Master. Karl Thorsten""","""male""",10.0,3,2,"""347088""",27.9,,"""S"""


### Apply `OR` Conditions

Pipe operator `|`

In [22]:
# `|` ORs the two comparisons: a row is kept if either one holds.
over_70_or_first_class = (pl.col("Age") > 70) | (pl.col("Pclass") == 1)

pl.read_csv(csv_file).filter(over_70_or_first_class).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""


`is_in(list)` method

In [None]:
# Eagerly load the Titanic data and preview three rows before the `is_in`
# example. Note: this rebinds `df`, which earlier cells used for lazy scans.
df = pl.read_csv(csv_file)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


In [23]:
# is_in is a compact OR: keep rows whose Pclass is any of the listed values.
second_or_third_class = pl.col("Pclass").is_in([2, 3])

pl.read_csv(csv_file).filter(second_or_third_class)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
885,0,3,"""Sutehall, Mr. Henry Jr""","""male""",25.0,0,0,"""SOTON/OQ 392076""",7.05,,"""S"""
886,0,3,"""Rice, Mrs. William (Margaret N…","""female""",39.0,0,5,"""382652""",29.125,,"""Q"""
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
889,0,3,"""Johnston, Miss. Catherine Hele…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S"""


## Exercises

### Exercise 1 
Filter the `DataFrame` to find rows where `Age` is between 30 and 50 (including the lower bound) and the passenger is in 2nd class. 

Do this in eager mode in a single pass through the `DataFrame`

In [29]:
# closed="left" includes the lower bound (30) and excludes the upper bound (50).
# The range check and the class check are combined with `&` in one filter().
age_30_to_50 = pl.col("Age").is_between(30, 50, closed="left")
in_second_class = pl.col("Pclass") == 2

pl.read_csv(csv_file).filter(age_30_to_50 & in_second_class)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
21,0,2,"""Fynney, Mr. Joseph J""","""male""",35.0,0,0,"""239865""",26.0,,"""S"""
22,1,2,"""Beesley, Mr. Lawrence""","""male""",34.0,0,0,"""248698""",13.0,"""D56""","""S"""
71,0,2,"""Jenkin, Mr. Stephen Curnow""","""male""",32.0,0,0,"""C.A. 33111""",10.5,,"""S"""
99,1,2,"""Doling, Mrs. John T (Ada Julia…","""female""",34.0,0,1,"""231919""",23.0,,"""S"""
100,0,2,"""Kantor, Mr. Sinai""","""male""",34.0,1,0,"""244367""",26.0,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
809,0,2,"""Meyer, Mr. August""","""male""",39.0,0,0,"""248723""",13.0,,"""S"""
813,0,2,"""Slemen, Mr. Richard James""","""male""",35.0,0,0,"""28206""",10.5,,"""S"""
818,0,2,"""Mallet, Mr. Albert""","""male""",31.0,1,1,"""S.C./PARIS 2079""",37.0042,,"""C"""
855,0,2,"""Carter, Mrs. Ernest Courtenay …","""female""",44.0,1,0,"""244252""",26.0,,"""S"""


Do this again combining the range condition with the keyword approach for the 2nd class condition - does the order you pass the conditions matter?

> **Note:** keyword arguments must come after all positional arguments in the call.

In [35]:
# Positional predicate first, keyword equality constraint last: Python does
# not allow a positional argument after a keyword argument.
(
    pl.read_csv(csv_file)
    .filter(
        pl.col("Age").is_between(30, 50, closed="left"),
        Pclass=2,
    )
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
21,0,2,"""Fynney, Mr. Joseph J""","""male""",35.0,0,0,"""239865""",26.0,,"""S"""
22,1,2,"""Beesley, Mr. Lawrence""","""male""",34.0,0,0,"""248698""",13.0,"""D56""","""S"""
71,0,2,"""Jenkin, Mr. Stephen Curnow""","""male""",32.0,0,0,"""C.A. 33111""",10.5,,"""S"""
99,1,2,"""Doling, Mrs. John T (Ada Julia…","""female""",34.0,0,1,"""231919""",23.0,,"""S"""
100,0,2,"""Kantor, Mr. Sinai""","""male""",34.0,1,0,"""244367""",26.0,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
809,0,2,"""Meyer, Mr. August""","""male""",39.0,0,0,"""248723""",13.0,,"""S"""
813,0,2,"""Slemen, Mr. Richard James""","""male""",35.0,0,0,"""28206""",10.5,,"""S"""
818,0,2,"""Mallet, Mr. Albert""","""male""",31.0,1,1,"""S.C./PARIS 2079""",37.0042,,"""C"""
855,0,2,"""Carter, Mrs. Ernest Courtenay …","""female""",44.0,1,0,"""244252""",26.0,,"""S"""


### Exercise 2
Return all the rows of the `DataFrame` where at least one column on the row is `null` (excluding the `Cabin` column with many `null` values)

In [36]:
# Drop Cabin first so its many nulls do not count, then keep every row where
# at least one remaining column is null (any_horizontal ORs across the row).
(
    pl.read_csv(csv_file)
    .drop("Cabin")
    .filter(pl.any_horizontal(pl.all().is_null()))
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,"""S"""
20,1,3,"""Masselmani, Mrs. Fatima""","""female""",,0,0,"""2649""",7.225,"""C"""
27,0,3,"""Emir, Mr. Farred Chehab""","""male""",,0,0,"""2631""",7.225,"""C"""
29,1,3,"""O'Dwyer, Miss. Ellen ""Nellie""""","""female""",,0,0,"""330959""",7.8792,"""Q"""
…,…,…,…,…,…,…,…,…,…,…
860,0,3,"""Razi, Mr. Raihed""","""male""",,0,0,"""2629""",7.2292,"""C"""
864,0,3,"""Sage, Miss. Dorothy Edith ""Dol…","""female""",,8,2,"""CA. 2343""",69.55,"""S"""
869,0,3,"""van Melkebeke, Mr. Philemon""","""male""",,0,0,"""345777""",9.5,"""S"""
879,0,3,"""Laleff, Mr. Kristo""","""male""",,0,0,"""349217""",7.8958,"""S"""


### Exercise 3
Create a `DataFrame` where the passengers got on in Cherbourg ("C") or Southampton ("S") using the pipe operator

In [38]:
# Embarked is "C" OR "S", spelled out with the pipe operator.
embarked_c = pl.col("Embarked") == "C"
embarked_s = pl.col("Embarked") == "S"

pl.read_csv(csv_file).filter(embarked_c | embarked_s).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


Do this again using the `is_in` approach

In [40]:
# Same selection as the pipe version, expressed as list membership.
(
    pl.read_csv(csv_file)
    .filter(pl.col("Embarked").is_in(["C", "S"]))
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


### Exercise 4
Load the Spotify CSV data into a `DataFrame`

In [41]:
# Relative path to the Spotify charts data (a `.csv.gz` file).
spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"
# Load it eagerly; the remaining cells of this exercise filter `spotify_df`.
spotify_df = pl.read_csv(spotify_csv)
spotify_df.head()

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""Starboy""",1,"""2017-01-01""","""The Weeknd, Daft Punk""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,"""2017-01-01""","""The Chainsmokers, Halsey""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,"""2017-01-01""","""DJ Snake, Justin Bieber""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",2545384
"""Rockabye (feat. Sean Paul & An…",4,"""2017-01-01""","""Clean Bandit""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_DOWN""",2356604
"""One Dance""",5,"""2017-01-01""","""Drake, WizKid, Kyla""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",2259887


Find all rows where the number of streams is greater than 10 million and the trend is "NEW_ENTRY"  

In [42]:
# Both predicates must hold: more than ten million streams AND a "NEW_ENTRY" trend.
has_over_10m_streams = pl.col("streams") > 10_000_000
is_new_entry = pl.col("trend") == "NEW_ENTRY"

spotify_df.filter(has_over_10m_streams, is_new_entry)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""I Don't Care (with Justin Bieb…",1,"""2019-05-10""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",10977389
"""Butter""",2,"""2021-05-21""","""BTS""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",11042335
"""Girls Want Girls (with Lil Bab…",1,"""2021-09-03""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",12384750
"""Champagne Poetry""",2,"""2021-09-03""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",11696783
"""Fair Trade (with Travis Scott)""",3,"""2021-09-03""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",11642541
"""Papi’s Home""",5,"""2021-09-03""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""NEW_ENTRY""",10029292


Find the rows where the artist is either Drake or Ed Sheeran and the rank is less than (better than) 5

In [43]:
# Artist is one of the two names (OR via is_in) AND the chart rank is below 5.
by_drake_or_sheeran = pl.col("artist").is_in(["Drake", "Ed Sheeran"])
ranked_above_fifth = pl.col("rank") < 5

spotify_df.filter(by_drake_or_sheeran, ranked_above_fifth)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""God's Plan""",1,"""2018-03-01""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7572795
"""Shape of You""",1,"""2017-02-01""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7549041
"""God's Plan""",1,"""2018-03-02""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7817732
"""Shape of You""",1,"""2017-02-02""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7602214
"""Shape of You""",1,"""2017-02-03""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7881255
…,…,…,…,…,…,…,…,…
"""Shivers""",4,"""2021-10-30""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",4995236
"""Shivers""",4,"""2021-10-31""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",4195481
"""God's Plan""",1,"""2018-01-29""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7272380
"""God's Plan""",1,"""2018-01-30""","""Drake""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",7468471
