# Categoricals and the string cache

In [1]:
import polars as pl

In [2]:
# Small demo frame: `cats` is a categorical copy of the `strings` column.
df = pl.DataFrame(
    {"strings": ["c", "b", "a", "c"], "values": [1, 2, 3, 4]}
).with_columns(cats=pl.col("strings").cast(pl.Categorical))
df

strings,values,cats
str,i64,cat
"""c""",1,"""c"""
"""b""",2,"""b"""
"""a""",3,"""a"""
"""c""",4,"""c"""


## Filtering a categorical column
We filter a categorical column in the same way as a normal string column.

In [4]:
# Keep only the rows whose categorical value equals the string "b".
df.filter(pl.col("cats") == "b")

strings,values,cats
str,i64,cat
"""b""",2,"""b"""


We can also filter a categorical column with `is_in`

In [5]:
# Membership test against a list of strings.
df.filter(pl.col("cats").is_in(["b"]))

strings,values,cats
str,i64,cat
"""b""",2,"""b"""


## Categorical from different `DataFrames`

When we combine `DataFrames` that have categorical columns, Polars needs to ensure that the same mapping from strings to integers is used in both `DataFrames`.

In [6]:
# Second demo frame, built separately, with its own categorical `cats` column.
df_right = pl.DataFrame(
    {"strings": ["a", "b"], "values": [10, 20]}
).with_columns(cats=pl.col("strings").cast(pl.Categorical))
df_right

strings,values,cats
str,i64,cat
"""a""",10,"""a"""
"""b""",20,"""b"""


In [None]:
# Left join on the categorical column; coalesce=True merges the join columns into one.
df.join(df_right, on="cats", how="left", coalesce=True)

strings,values,cats,strings_right,values_right
str,i64,cat,str,i64
"""c""",1,"""c""",,
"""b""",2,"""b""","""b""",20.0
"""a""",3,"""a""","""a""",10.0
"""c""",4,"""c""",,


## Combining categoricals with the `StringCache`
We can instead use a `StringCache` to ensure that different `DataFrames` have the same categorical mapping.

The `StringCache` object:
- stores the categorical mapping
- ensures that all `DataFrames` use the same mapping. 

We can use the `StringCache`:
- inside a context manager or
- by enabling it globally.

### Using the `StringCache` inside a context-manager

A context manager is a Python construct that ensures certain set-up and clean-up actions happen around a block of code.

Everything inside the code block beginning with `with` is in the same context.

In [10]:
# Both frames are created and joined inside the same StringCache context.
with pl.StringCache():
    # Left-hand frame
    df = pl.DataFrame(
        {"strings": ["c", "b", "a", "c"], "values": [1, 2, 3, 4]}
    ).with_columns(cats=pl.col("strings").cast(pl.Categorical))

    # Right-hand frame
    df_right = pl.DataFrame(
        {"strings": ["a", "b"], "values": [10, 20]}
    ).with_columns(cats=pl.col("strings").cast(pl.Categorical))

    # Left join on the categorical column, merging the join columns
    df_joined = df.join(df_right, on="cats", how="left", coalesce=True)

df_joined

strings,values,cats,strings_right,values_right
str,i64,cat,str,i64
"""c""",1,"""c""",,
"""b""",2,"""b""","""b""",20.0
"""a""",3,"""a""","""a""",10.0
"""c""",4,"""c""",,


### Enabling the `StringCache`
We can enable the `StringCache` for the whole session - be aware that this can have effects beyond this script/notebook.

In [11]:
# NOTE(review): the call below is commented out, so running this cell changes no global state.
# Uncomment it to enable the string cache:
# pl.enable_string_cache()

When we use `pl.enable_string_cache()` Polars enables a `StringCache` that is used by all categorical columns until:
- the end of the session or
- we call `pl.disable_string_cache()`

You can see whether a string cache is enabled with `pl.using_string_cache()`:

In [12]:
# Check whether a string cache is currently enabled.
pl.using_string_cache()

True

## Exercises

### Exercise 1
Create a `DataFrame` from the Titanic dataset and cast the `Pclass` column to categorical.

In [37]:
csv_file = "data/titanic.csv"

# Read the CSV, then cast Pclass to string and from string to categorical.
titanic_raw = pl.read_csv(csv_file)
df = titanic_raw.with_columns(
    pl.col("Pclass").cast(pl.Utf8).cast(pl.Categorical)
)

df.head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,cat,str,str,f64,i64,i64,str,f64,str,str
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,"""1""","""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,"""1""","""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,"""3""","""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


Continue by casting the `Embarked` column to categorical (change `with_column` to `with_columns`). 

In [38]:
# Replace Embarked with its categorical version; the other columns are unchanged.
embarked_as_cat = pl.col("Embarked").cast(pl.Categorical)
df = df.with_columns(embarked_as_cat)

df.head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,cat,str,str,f64,i64,i64,str,f64,str,cat
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,"""1""","""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,"""1""","""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,"""3""","""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


Filter the `Pclass` column for third class passengers

In [39]:
# Third-class passengers: compare the categorical column with the string "3".
df.filter(pl.col("Pclass") == "3").head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,cat,str,str,f64,i64,i64,str,f64,str,cat
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
5,0,"""3""","""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,"""3""","""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
8,0,"""3""","""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""


Add a filter on the `Embarked` column for passengers who embarked in either Southampton (`S`) or Queenstown (`Q`)

In [41]:
# Predicates passed as separate arguments to `filter` are combined with AND.
df.filter(
    pl.col("Embarked").is_in(["S", "Q"]),
    pl.col("Pclass") == "3",
).head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,cat,str,str,f64,i64,i64,str,f64,str,cat
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
5,0,"""3""","""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,"""3""","""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
8,0,"""3""","""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""


### Exercise 2
Filter the Spotify `DataFrame` to find all tracks by either Taylor Swift or Ed Sheeran.

In [29]:
# Relative path to the gzip-compressed Spotify charts CSV read below.
spotify_csv = "data/spotify-charts-2017-2021-global-top200.csv.gz"

Enable the string cache

In [28]:
# Turn on the global string cache; it stays enabled until pl.disable_string_cache() is called
# or the session ends.
pl.enable_string_cache()

In [33]:
# Load the Spotify charts file and preview the first rows.
spotify_df = pl.read_csv(source=spotify_csv)
spotify_df.head(5)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,str,str,str,str,str,i64
"""Starboy""",1,"""2017-01-01""","""The Weeknd, Daft Punk""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3135625
"""Closer""",2,"""2017-01-01""","""The Chainsmokers, Halsey""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",3015525
"""Let Me Love You""",3,"""2017-01-01""","""DJ Snake, Justin Bieber""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",2545384
"""Rockabye (feat. Sean Paul & An…",4,"""2017-01-01""","""Clean Bandit""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_DOWN""",2356604
"""One Dance""",5,"""2017-01-01""","""Drake, WizKid, Kyla""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",2259887


In [35]:
# Cast artist to categorical, then keep rows whose artist is one of the two names.
(
    spotify_df
    .with_columns(pl.col("artist").cast(pl.Categorical))
    .filter(pl.col("artist").is_in(["Taylor Swift", "Ed Sheeran"]))
    .head(6)
)

title,rank,date,artist,url,region,chart,trend,streams
str,i64,str,cat,str,str,str,str,i64
"""Photograph""",123,"""2017-01-01""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",472142
"""Thinking out Loud""",141,"""2017-01-01""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",436476
"""Photograph""",91,"""2017-01-02""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",595989
"""Thinking out Loud""",109,"""2017-01-02""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""MOVE_UP""",536343
"""Perfect""",27,"""2018-03-01""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",1835597
"""Shape of You""",34,"""2018-03-01""","""Ed Sheeran""","""https://open.spotify.com/track…","""Global""","""top200""","""SAME_POSITION""",1616893


Then disable the string cache

In [36]:
# Switch the global string cache back off now that the exercise is finished.
pl.disable_string_cache()