# Using expressions on List columns

In [1]:
import polars as pl

In [2]:
df = (
    pl.DataFrame(
        {
            'values':[ 
                [0,1], 
                [2,3,4],
                [4,5,6,7,8]
            ],
        }
    )
)
df

values
list[i64]
"[0, 1]"
"[2, 3, 4]"
"[4, 5, … 8]"


## The list expression namespace
Polars has a `.list` [namespace](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/list.html) for expressions that work on `pl.List` columns


## Selecting data in lists

In [3]:
df\
.with_columns(
    [
        pl.col("values").list.first().alias("first"),
        pl.col("values").list.last().alias("last"),
        pl.col("values").list.head(2).alias("head"),
        pl.col("values").list.tail(2).alias("tail"),
        pl.col("values").list.slice(1,2).alias("slice"),
    ]
)

values,first,last,head,tail,slice
list[i64],i64,i64,list[i64],list[i64],list[i64]
"[0, 1]",0,1,"[0, 1]","[0, 1]",[1]
"[2, 3, 4]",2,4,"[2, 3]","[3, 4]","[3, 4]"
"[4, 5, … 8]",4,8,"[4, 5]","[7, 8]","[5, 6]"


To get a specific row we use `list.get` to select a value by a position `index` in each list

In [4]:
df.with_columns(
    [
        pl.col("values").list.get(0).alias("first"),
        pl.col("values").list.get(1).alias("second"),
        pl.col("values").list.get(-1).alias("last"),
    ]
)

values,first,second,last
list[i64],i64,i64,i64
"[0, 1]",0,1,1
"[2, 3, 4]",2,3,4
"[4, 5, … 8]",4,5,8


Check whether each list contains a value with `list.contains`

In [6]:
df\
.with_columns(
    [
        pl.col("values").list.contains(i).alias(str(i)) for i in range(3)
    ]
)

values,0,1,2
list[i64],bool,bool,bool
"[0, 1]",True,True,False
"[2, 3, 4]",False,False,True
"[4, 5, … 8]",False,False,False


In [7]:
df\
.with_columns(
    four = pl.lit(4)
)

values,four
list[i64],i32
"[0, 1]",4
"[2, 3, 4]",4
"[4, 5, … 8]",4


In [8]:
df\
.with_columns(
    four = pl.lit(4)
)\
.with_columns(
    pl.col("values").list.contains(pl.col("four")).alias("has_four")
)

values,four,has_four
list[i64],i32,bool
"[0, 1]",4,False
"[2, 3, 4]",4,True
"[4, 5, … 8]",4,True


In [9]:
df2 = (
    pl.DataFrame(
        {
            'values':[ 
                [0,1,0], 
                [2,3],
                [4,5,6,7,8]
            ],
            'values_2':[ 
                [0], 
                [2,3,4],
                [4,5,9]
            ],

        }
    )
)
df2

values,values_2
list[i64],list[i64]
"[0, 1, 0]",[0]
"[2, 3]","[2, 3, 4]"
"[4, 5, … 8]","[4, 5, 9]"


Find all unique values in an list with `list.unique`

In [10]:
df2\
.select(
    "values",
    pl.col("values").list.unique().alias("unique")
)

values,unique
list[i64],list[i64]
"[0, 1, 0]","[0, 1]"
"[2, 3]","[2, 3]"
"[4, 5, … 8]","[4, 5, … 8]"


- `set_intersection` gets the common values between lists
- `set_difference` gets values that are in the left list but not in the right list
- `set_symmetric_difference` gets values that are not in both lists
- `set_union` gets the unique values from both lists

In [11]:
df2

values,values_2
list[i64],list[i64]
"[0, 1, 0]",[0]
"[2, 3]","[2, 3, 4]"
"[4, 5, … 8]","[4, 5, 9]"


In [13]:
df2\
.with_columns(
    pl.col("values").list.set_intersection(pl.col("values_2")).alias("intersection"),
    pl.col("values").list.set_difference(pl.col("values_2")).alias("difference"),
    pl.col("values").list.set_symmetric_difference(pl.col("values_2")).alias("symmetric_difference"),
    pl.col("values").list.set_union(pl.col("values_2")).alias("union"),
)

values,values_2,intersection,difference,symmetric_difference,union
list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
"[0, 1, 0]",[0],[0],[1],[1],"[0, 1]"
"[2, 3]","[2, 3, 4]","[2, 3]",[],[4],"[2, 3, 4]"
"[4, 5, … 8]","[4, 5, 9]","[4, 5]","[8, 7, 6]","[6, 7, … 9]","[4, 5, … 9]"


### Re-ordering values in each list

- `reverse` reverses the order of the list
- `sort` sorts each list
- `shift` moves values in each list so the first values are `null`

In [14]:
df

values
list[i64]
"[0, 1]"
"[2, 3, 4]"
"[4, 5, … 8]"


In [17]:
df\
.with_columns(
    pl.col("values").list.reverse().alias("reverse"),
    pl.col("values").list.sort().alias("sort"),
    pl.col("values").list.shift(1).alias("shift"),
)

values,reverse,sort,shift
list[i64],list[i64],list[i64],list[i64]
"[0, 1]","[1, 0]","[0, 1]","[null, 0]"
"[2, 3, 4]","[4, 3, 2]","[2, 3, 4]","[null, 2, 3]"
"[4, 5, … 8]","[8, 7, … 4]","[4, 5, … 8]","[null, 4, … 7]"


### List aggregations

In [18]:
df\
.with_columns(
    pl.col("values").list.len().alias("lengths"),
    pl.col("values").list.min().alias("min"),
    pl.col("values").list.mean().alias("mean"),
    pl.col("values").list.max().alias("max"),
)

values,lengths,min,mean,max
list[i64],u32,i64,f64,i64
"[0, 1]",2,0,0.5,1
"[2, 3, 4]",3,2,3.0,4
"[4, 5, … 8]",5,4,6.0,8


## Calling expressions on each list
Each row in a `pl.List` column is a `Series`.

In [None]:
pl.DataFrame(
    {
        "values": [[0, 1], [4, 3, 2]],
    }
)\
.with_columns(
    pl.col("values").list.eval( # eval run any polar's expression against the list's elements
        pl.element().rank(method="ordinal")
    ).alias("eval")
)

values,eval
list[i64],list[u32]
"[0, 1]","[1, 2]"
"[4, 3, 2]","[3, 2, 1]"


Call `element()` without any functions following.

In [None]:
pl.DataFrame(
    {
        "values": [[0, 1], [4, 3, 2]],
    }
)\
.with_columns(
    pl.col("values").list.eval(
        pl.element()
    ).alias("eval")
)

values,eval
list[i64],list[i64]
"[0, 1]","[0, 1]"
"[4, 3, 2]","[4, 3, 2]"


`element()` with `filter()`

In [21]:
pl.DataFrame(
    {
        "values": [[0,None,1], [2,3,None]],
    }
)\
.with_columns(
    pl.col("values").list.eval(
        pl.element().filter(
            pl.element().is_not_null()
        )
    ).alias("eval")
)

values,eval
list[i64],list[i64]
"[0, null, 1]","[0, 1]"
"[2, 3, null]","[2, 3]"


## Exercises

### Exercise 1
We need to parse the following address strings to get columns with the:
- number
- street
- city
- state
- zipcode

In [24]:
pl.Config.set_fmt_str_lengths(150)
addresses = [
    '93 NORTH 9TH STREET, BROOKLYN NY 11211',
    '380 WESTMINSTER ST, PROVIDENCE RI 02903',
    '177 MAIN STREET, LITTLETON NH 03561'
]

df = (
    pl.DataFrame(
        {"address":addresses}
    )
)
df

address
str
"""93 NORTH 9TH STREET, BROOKLYN NY 11211"""
"""380 WESTMINSTER ST, PROVIDENCE RI 02903"""
"""177 MAIN STREET, LITTLETON NH 03561"""


Add a column called `split` with the string split by whitespace (using `str.split`) into a list column

In [27]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)

address,split
str,list[str]
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]"
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]"
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]"


In an additional `with_column` statement add a 32-bit integer column called `number` using the `first` element of each list

In [28]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number")
)

address,split,number
str,list[str],i32
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177


The street component of the address runs from the second element of the list to the element of the list that contains a comma.

Add a list column called `contains_comma` where we check if each element in the lists in `split` contain a comma. 

Use `eval` to run the `str.contains` expression on each element in the list

In [29]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number"),
    pl.col("split").list.eval(
        pl.element().str.contains(",")
    ).alias("contains_comma")
)

address,split,number,contains_comma
str,list[str],i32,list[bool]
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93,"[false, false, … false]"
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380,"[false, false, … false]"
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177,"[false, false, … false]"


With a new call to `with_column` slice each list in `split` from the second element to the index of the element that contains a comma.

Hint 1: `list.arg_max` expression that finds the index of the largest value in an list.

In [32]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number"),
    pl.col("split").list.eval(
        pl.element().str.contains(",")
    ).alias("contains_comma")
)\
.with_columns(
    pl.col("split").list.slice(1, pl.col("contains_comma").list.arg_max()).alias("street")
)

address,split,number,contains_comma,street
str,list[str],i32,list[bool],list[str]
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93,"[false, false, … false]","[""NORTH"", ""9TH"", ""STREET,""]"
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380,"[false, false, … false]","[""WESTMINSTER"", ""ST,""]"
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177,"[false, false, … false]","[""MAIN"", ""STREET,""]"


Polars has a similar method called `list.join`.

Join the string lists in `street` using `list.join` (with a " " separating the strings)

In [33]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number"),
    pl.col("split").list.eval(
        pl.element().str.contains(",")
    ).alias("contains_comma")
)\
.with_columns(
    pl.col("split").list.slice(1, pl.col("contains_comma").list.arg_max()).list.join(separator=" ").alias("street")
)

address,split,number,contains_comma,street
str,list[str],i32,list[bool],str
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93,"[false, false, … false]","""NORTH 9TH STREET,"""
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380,"[false, false, … false]","""WESTMINSTER ST,"""
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177,"[false, false, … false]","""MAIN STREET,"""


Extract the `city` from `split` by slicing. 

The slice should start from the `arg_max` value in `contains_command` and have a length of 1

In [36]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number"),
    pl.col("split").list.eval(
        pl.element().str.contains(",")
    ).alias("contains_comma")
)\
.with_columns(
    pl.col("split").list.slice(1, pl.col("contains_comma").list.arg_max()).list.join(separator=" ").alias("street")
)\
.with_columns(
    pl.col("split").list.slice(pl.col("contains_comma").list.arg_max() + 1, 1).alias("city")
)

address,split,number,contains_comma,street,city
str,list[str],i32,list[bool],str,list[str]
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93,"[false, false, … false]","""NORTH 9TH STREET,""","[""BROOKLYN""]"
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380,"[false, false, … false]","""WESTMINSTER ST,""","[""PROVIDENCE""]"
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177,"[false, false, … false]","""MAIN STREET,""","[""LITTLETON""]"


Get the `zipcode` as the last element in `split`

In [37]:
df\
.with_columns(
    pl.col("address").str.split(" ").alias("split")
)\
.with_columns(
    pl.col("split").list.first().cast(pl.Int32).alias("number"),
    pl.col("split").list.eval(
        pl.element().str.contains(",")
    ).alias("contains_comma")
)\
.with_columns(
    pl.col("split").list.slice(1, pl.col("contains_comma").list.arg_max()).list.join(separator=" ").alias("street")
)\
.with_columns(
    pl.col("split").list.slice(pl.col("contains_comma").list.arg_max() + 1, 1).alias("city"),
    pl.col("split").list.last().cast(pl.Int32).alias("zipcode")
)

address,split,number,contains_comma,street,city,zipcode
str,list[str],i32,list[bool],str,list[str],i32
"""93 NORTH 9TH STREET, BROOKLYN NY 11211""","[""93"", ""NORTH"", … ""11211""]",93,"[false, false, … false]","""NORTH 9TH STREET,""","[""BROOKLYN""]",11211
"""380 WESTMINSTER ST, PROVIDENCE RI 02903""","[""380"", ""WESTMINSTER"", … ""02903""]",380,"[false, false, … false]","""WESTMINSTER ST,""","[""PROVIDENCE""]",2903
"""177 MAIN STREET, LITTLETON NH 03561""","[""177"", ""MAIN"", … ""03561""]",177,"[false, false, … false]","""MAIN STREET,""","[""LITTLETON""]",3561


### Exercise 2

In [25]:
pl.Config.set_fmt_str_lengths(100)

pl.Config.set_tbl_rows(10)

spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"
spotify_df = pl.read_csv(spotify_csv,try_parse_dates=True)
spotify_df.head(3)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,date,str,str,str,str,str,i64
"""Starboy""",1,2017-01-01,"""The Weeknd, Daft Punk""","""https://open.spotify.com/track/5aAx2yezTd8zXrkmtKl66Z""","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,2017-01-01,"""The Chainsmokers, Halsey""","""https://open.spotify.com/track/7BKLCZ1jbUBVqRi2FVlTVw""","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,2017-01-01,"""DJ Snake, Justin Bieber""","""https://open.spotify.com/track/4pdPtRcBmOSQDlJ3Fk945m""","""Global""","""top200""","""MOVE_UP""",2545384


- Keep one row for each unique track (with uniqueness defined by the title and artist columns)
- Create a list column called `artists` by splitting the `artist` column

In [43]:
spotify_df\
.unique(["title", "artist"])\
.with_columns(
    pl.col("artist").str.split(" ").alias("artists")
)\
.select(
    "title","rank","date","artist","artists","streams"
).head(3)

title,rank,date,artist,artists,streams
str,i64,date,str,list[str],i64
"""Ich darf das""",150,2021-05-14,"""Shirin David""","[""Shirin"", ""David""]",888942
"""Another One Bites The Dust - Remastered 2011""",197,2020-05-02,"""Queen""","[""Queen""]",635158
"""Finesse""",37,2018-07-01,"""Drake""","[""Drake""]",1952491


Continue by finding the 10 tracks with the most number of artists

In [44]:
spotify_df\
.unique(["title", "artist"])\
.with_columns(
    pl.col("artist").str.split(" ").alias("artists")
)\
.select(
    "title","rank","date","artist","artists","streams"
)\
.top_k(
    k=10,
    by=pl.col("artists").list.len()
)

title,rank,date,artist,artists,streams
str,i64,date,str,list[str],i64
"""Susamam""",139,2019-09-06,"""Şanışer, Kamufle, Mert Şenel, Mirac, Aga B, Defkhan, Aspova, Yeis Sensura, Sehabe, Deniz Tekin, Ozbi…","[""Şanışer,"", ""Kamufle,"", … ""Fuat""]",825680
"""Pa' La Cultura""",151,2020-08-07,"""David Guetta, HUMAN(X), Sofía Reyes, Abraham Mateo, De La Ghetto, Manuel Turizo, Zion & Lennox, Lalo…","[""David"", ""Guetta,"", … ""Maejor""]",808568
"""Instagram""",99,2019-07-05,"""Dimitri Vegas & Like Mike, David Guetta, Daddy Yankee, Afro Bros, Natti Natasha, Dimitri Vegas, Like…","[""Dimitri"", ""Vegas"", … ""Mike""]",1031732
"""Quizás""",75,2019-10-01,"""Rich Music LTD, Dalex, Sech, Justin Quiles, Wisin, Zion, Dímelo Flow, Lenny Tavárez, Feid""","[""Rich"", ""Music"", … ""Feid""]",1042822
"""Enséñame""",192,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", … ""Herrera""]",737990
"""Aún Hay Algo""",195,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", … ""Herrera""]",735311
"""Poblado - Remix""",37,2021-07-01,"""J Balvin, KAROL G, Nicky Jam, Crissin, Totoy El Frio, Natan & Shander""","[""J"", ""Balvin,"", … ""Shander""]",1817017
"""Rebelde""",130,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", … ""Herrera""]",882187
"""Solo Quédate En Silencio""",124,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", … ""Herrera""]",906122
"""Nuestro Amor""",138,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", … ""Herrera""]",845060


Apply a `pl.Config` setting to ensure we can read all of the list elements and then display the results again

In [47]:
pl.Config.set_fmt_table_cell_list_len(50)

polars.config.Config

In [48]:
spotify_df\
.unique(["title", "artist"])\
.with_columns(
    pl.col("artist").str.split(" ").alias("artists")
)\
.select(
    "title","rank","date","artist","artists","streams"
)\
.top_k(
    k=10,
    by=pl.col("artists").list.len()
)

title,rank,date,artist,artists,streams
str,i64,date,str,list[str],i64
"""Susamam""",139,2019-09-06,"""Şanışer, Kamufle, Mert Şenel, Mirac, Aga B, Defkhan, Aspova, Yeis Sensura, Sehabe, Deniz Tekin, Ozbi…","[""Şanışer,"", ""Kamufle,"", ""Mert"", ""Şenel,"", ""Mirac,"", ""Aga"", ""B,"", ""Defkhan,"", ""Aspova,"", ""Yeis"", ""Sensura,"", ""Sehabe,"", ""Deniz"", ""Tekin,"", ""Ozbi,"", ""Sokrat"", ""St,"", ""Tahribad-ı"", ""İsyan,"", ""Beta,"", ""Server"", ""Uraz,"", ""Hayki,"", ""Ados,"", ""Fuat""]",825680
"""Pa' La Cultura""",151,2020-08-07,"""David Guetta, HUMAN(X), Sofía Reyes, Abraham Mateo, De La Ghetto, Manuel Turizo, Zion & Lennox, Lalo…","[""David"", ""Guetta,"", ""HUMAN(X),"", ""Sofía"", ""Reyes,"", ""Abraham"", ""Mateo,"", ""De"", ""La"", ""Ghetto,"", ""Manuel"", ""Turizo,"", ""Zion"", ""&"", ""Lennox,"", ""Lalo"", ""Ebratt,"", ""Thalia,"", ""Maejor""]",808568
"""Instagram""",99,2019-07-05,"""Dimitri Vegas & Like Mike, David Guetta, Daddy Yankee, Afro Bros, Natti Natasha, Dimitri Vegas, Like…","[""Dimitri"", ""Vegas"", ""&"", ""Like"", ""Mike,"", ""David"", ""Guetta,"", ""Daddy"", ""Yankee,"", ""Afro"", ""Bros,"", ""Natti"", ""Natasha,"", ""Dimitri"", ""Vegas,"", ""Like"", ""Mike""]",1031732
"""Quizás""",75,2019-10-01,"""Rich Music LTD, Dalex, Sech, Justin Quiles, Wisin, Zion, Dímelo Flow, Lenny Tavárez, Feid""","[""Rich"", ""Music"", ""LTD,"", ""Dalex,"", ""Sech,"", ""Justin"", ""Quiles,"", ""Wisin,"", ""Zion,"", ""Dímelo"", ""Flow,"", ""Lenny"", ""Tavárez,"", ""Feid""]",1042822
"""Nuestro Amor""",138,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", ""Dulce"", ""María,"", ""Maite"", ""Perroni,"", ""Christian"", ""Chávez,"", ""Christopher"", ""von"", ""Uckermann,"", ""Alfonso"", ""Herrera""]",845060
"""Sálvame""",110,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", ""Dulce"", ""María,"", ""Maite"", ""Perroni,"", ""Christian"", ""Chávez,"", ""Christopher"", ""von"", ""Uckermann,"", ""Alfonso"", ""Herrera""]",973935
"""Travesuras - Remix""",85,2021-04-01,"""Nio Garcia, Casper Magico, Ozuna, Wisin & Yandel, Myke Towers, Flow La Movie""","[""Nio"", ""Garcia,"", ""Casper"", ""Magico,"", ""Ozuna,"", ""Wisin"", ""&"", ""Yandel,"", ""Myke"", ""Towers,"", ""Flow"", ""La"", ""Movie""]",1057549
"""Enséñame""",192,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", ""Dulce"", ""María,"", ""Maite"", ""Perroni,"", ""Christian"", ""Chávez,"", ""Christopher"", ""von"", ""Uckermann,"", ""Alfonso"", ""Herrera""]",737990
"""Un Poco De Tu Amor""",185,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", ""Dulce"", ""María,"", ""Maite"", ""Perroni,"", ""Christian"", ""Chávez,"", ""Christopher"", ""von"", ""Uckermann,"", ""Alfonso"", ""Herrera""]",749396
"""Solo Quédate En Silencio""",124,2020-09-04,"""RBD, Anahí, Dulce María, Maite Perroni, Christian Chávez, Christopher von Uckermann, Alfonso Herrera""","[""RBD,"", ""Anahí,"", ""Dulce"", ""María,"", ""Maite"", ""Perroni,"", ""Christian"", ""Chávez,"", ""Christopher"", ""von"", ""Uckermann,"", ""Alfonso"", ""Herrera""]",906122


Create a new column called `lead_artist` with the first listed artist from each list. 

Return only the `title`,`artist` and `lead_artist` columns

In [49]:
spotify_df.select(
    "title", "artist", pl.col("artist").str.split(" ").list.first().alias("lead_artist")
)

title,artist,lead_artist
str,str,str
"""Starboy""","""The Weeknd, Daft Punk""","""The"""
"""Closer""","""The Chainsmokers, Halsey""","""The"""
"""Let Me Love You""","""DJ Snake, Justin Bieber""","""DJ"""
"""Rockabye (feat. Sean Paul & Anne-Marie)""","""Clean Bandit""","""Clean"""
"""One Dance""","""Drake, WizKid, Kyla""","""Drake,"""
…,…,…
"""Slow Hands""","""Niall Horan""","""Niall"""
"""New Freezer (feat. Kendrick Lamar)""","""Rich The Kid""","""Rich"""
"""Explícale (feat. Bad Bunny)""","""Yandel""","""Yandel"""
"""The Scientist""","""Coldplay""","""Coldplay"""


Get the top 10 artists ranked by their maximum number of streams for a track
- Explode the `artists` list column so each artist is on their own row
- Group by the exploded artists
- Aggregate to get the maximum of the `streams`

In [50]:
spotify_df\
.with_columns(
    artists = pl.col("artist").str.split(",")
)\
.select("artists", "streams")\
.explode("artists")\
.group_by("artists")\
.agg(
    pl.col("streams").max()
)\
.top_k(
    k=10,
    by="streams"
)

artists,streams
str,i64
"""Adele""",19749704
"""Mariah Carey""",17223237
"""Wham!""",15813799
"""Olivia Rodrigo""",13714177
"""Drake""",12384750
"""Ariana Grande""",12229331
"""Michael Bublé""",11975033
"""Bobby Helms""",11924353
"""Brenda Lee""",11801426
"""Luis Fonsi""",11381520
