# Join on enum and categorical columns

In [1]:
import polars as pl
import numpy as np
np.random.seed(0)

In [2]:
integer_array = np.array([3,3,1,2])
integer_array

array([3, 3, 1, 2])

In [3]:
df_left = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in integer_array],
            "values":integer_array
        }
    )
)
df_left

id,values
str,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [4]:
df_right = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in range(1,4)],
            "metadata":[i for i in range(1,4)]
        }
    )
)
df_right

id,metadata
str,i64
"""id1""",1
"""id2""",2
"""id3""",3


The join key has the same data type.

In [5]:
df_left.join(
    df_right,
    on="id"
)

id,values,metadata
str,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


> Polars doesn't support the fast-track algorithm for joining string column, but works on integer.

## Joins on categorical dtype

In [6]:
df_left_cat = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in integer_array],
            "values":integer_array
        }
    )
    .with_columns(
        pl.col("id").cast(pl.Categorical)
    )
)
df_left_cat

id,values
cat,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [7]:
df_right_cat = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in range(1,4)],
            "metadata":[i for i in range(1,4)]
        }
    )
    .with_columns(
        pl.col("id").cast(pl.Categorical)
    )
)
df_right_cat

id,metadata
cat,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [9]:
df_left_cat.join(
    df_right_cat,
    on="id"
)

id,values,metadata
cat,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


If you meet the warning from Polars when joining the categorical, try to use `StringCache` to avoid it.

In [10]:
with pl.StringCache():
    df_left_cat = (
        pl.DataFrame(
            {
                "id":[f"id{i}" for i in integer_array],
                "values":integer_array
            }
        )
        .with_columns(
            pl.col("id").cast(pl.Categorical)
        )
    )

    df_right_cat = (
        pl.DataFrame(
            {
                "id":[f"id{i}" for i in range(1,4)],
                "metadata":[i for i in range(1,4)]
            }
        )
        .with_columns(
            pl.col("id").cast(pl.Categorical)
        )
    )

In [11]:
df_left_cat.join(
    df_right_cat,
    on="id"
)

id,values,metadata
cat,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


## Joining `pl.Enum` columns

We can join `pl.Enum` columns without `StringCache`.

In [12]:
enum_dtype = pl.Enum(["id1","id2","id3"])

In [13]:
df_left_enum = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in integer_array],
            "values":integer_array
        }
    )
    .with_columns(
        pl.col("id").cast(enum_dtype)
    )
)
df_left_enum

id,values
enum,i64
"""id3""",3
"""id3""",3
"""id1""",1
"""id2""",2


In [14]:
df_right_enum = (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in range(1,4)],
            "metadata":[i for i in range(1,4)]
        }
    )
    .with_columns(
        pl.col("id").cast(enum_dtype)
    )
)
df_right_enum

id,metadata
enum,i64
"""id1""",1
"""id2""",2
"""id3""",3


In [15]:
df_left_cat.join(
    df_right_cat,
    on="id"
)

id,values,metadata
cat,i64,i64
"""id3""",3,3
"""id3""",3,3
"""id1""",1,1
"""id2""",2,2


## Exercises

## Exercise 1

In [16]:
cites_csv_file = "data/cites_extract.csv"
iso_csv_file = "data/countries_extract.csv"

We want to join the ISO data for importers and exporters.

- create `DataFrames` from the CITES trade data and ISO country data in the following CSVs
- cast the join columns to categorical

In [23]:
df_CITES = pl.read_csv(cites_csv_file).with_columns(
    pl.col("Importer").cast(pl.Categorical()),
    pl.col("Exporter").cast(pl.Categorical()),
)

df_CITES.head()

Year,Importer,Exporter,Taxon,Quantity
i64,cat,cat,str,f64
2021,"""KR""","""DE""","""Python reticulatus""",12.0
2021,"""TR""","""DE""","""Python reticulatus""",2.0
2021,"""NZ""","""DE""","""Python bivittatus""",2.0
2021,"""TH""","""BJ""","""Python regius""",200.0
2021,"""KR""","""CZ""","""Python bivittatus""",28.0


In [24]:
df_ISO = pl.read_csv(iso_csv_file).with_columns(
    pl.col("alpha-2").cast(pl.Categorical())
)

df_ISO.head()

alpha-2,name,region
cat,str,str
"""BJ""","""Benin""","""Africa"""
"""CZ""","""Czechia""","""Europe"""
"""KR""","""Korea, Republic of""","""Asia"""
"""NZ""","""New Zealand""","""Oceania"""
"""TW""","""Taiwan, Province of China""","""Asia"""


Join the ISO data for both importers and exporters

In [30]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="left",
).rename({
    "name": "name_importer",
    "region": "region_importer"
}).join(
    df_ISO,
    left_on="Exporter",
    right_on="alpha-2",
    how="left",
).rename({
    "name": "name_exporter",
    "region": "region_exporter"
})

Year,Importer,Exporter,Taxon,Quantity,name_importer,region_importer,name_exporter,region_exporter
i64,cat,cat,str,f64,str,str,str,str
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia""",,
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia""",,
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania""",,
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia""","""Benin""","""Africa"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia""","""Czechia""","""Europe"""
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia""",,
2021,"""UA""","""DE""","""Python reticulatus""",4.0,,,,


We now do this join using a `pl.Enum`.

In [31]:
df_CITES = pl.read_csv(cites_csv_file)
df_ISO = pl.read_csv(iso_csv_file)

First create a `pl.Enum` that holds all of the 2-digit country codes - this requires data from both `DataFrames`

In [32]:
countries_enum = pl.Enum(
    list(
        set(
            df_ISO["alpha-2"].unique().to_list() + df_CITES["Importer"].unique().to_list())
    )
)
countries_enum

Enum(categories=['BJ', 'CZ', 'KR', 'TH', 'TR', 'UA', 'TW', 'NZ'])

Do the first part of the left-join above of `df_ISO` to `df_CITES` with the join columns as `pl.Enum` dtypes

In [36]:
df_CITES.with_columns(
    pl.col("Importer").cast(countries_enum)
).sort("Importer").join(
    df_ISO.with_columns(
        pl.col("alpha-2").cast(countries_enum)
    ),
    left_on="Importer",
    right_on="alpha-2",
    how="left",
    coalesce=True
).rename({
    "name": "name_importer",
    "region": "region_importer"
}).sort("Exporter")

Year,Importer,Exporter,Taxon,Quantity,name_importer,region_importer
i64,enum,str,str,f64,str,str
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia"""
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia"""
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia"""
2021,"""UA""","""DE""","""Python reticulatus""",4.0,,
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia"""
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania"""


## Exercise 2

In [37]:
N = 1_000_000
# cardinality is number of unique values
cardinality = N // 2
def createLeftDataFrame(N:int,cardinality:int):
    """
    Create the left dataframe with columns:
    id - random strings of the form idX where X is between 0 and 0
    values - the integer X value
    physical - the physical integers underlying the categorical id column
    """
    # create the random integer array
    integer_array = np.random.randint(0,cardinality,N)
    return (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in integer_array],
            "values":integer_array
        }
    )
    .with_columns(
        pl.col("id").cast(pl.Categorical)
    )
    .with_columns(
        pl.col("id").to_physical().alias("physical")
    )
)
df_left = createLeftDataFrame(N = N,cardinality=cardinality)
df_left.head()

id,values,physical
cat,i32,u32
"""id461484""",461484,15
"""id305711""",305711,16
"""id435829""",435829,17
"""id117952""",117952,18
"""id439107""",439107,19


In [38]:
def createRightDataFrame(N:int,cardinality:int):
    """
    Create the right dataframe with columns:
    id - the string ids covering the same range as the left dataframe
    meta - a metadata column that has the integer number from the id
    physical - the physical integers underlying the categorical id column
    """
    return (
    pl.DataFrame(
        {
            "id":[f"id{i}" for i in range(cardinality)],
            "meta":[i for i in range(cardinality)]
        }
    )
    .with_columns(
        pl.col("id").cast(pl.Categorical)
    )
    .with_columns(
        pl.col("id").to_physical().alias("physical")
    )

)
df_right = createRightDataFrame(N = N,cardinality=cardinality)
df_right.head(3)

id,meta,physical
cat,i64,u32
"""id0""",0,217773
"""id1""",1,1
"""id2""",2,2


Create `df_left` and `df_right` inside a `StringCache`

In [39]:
N = 1_000_000
cardinality = N // 2

with pl.StringCache():
    df_left = createLeftDataFrame(N = N,cardinality=cardinality)
    df_right = createRightDataFrame(N = N,cardinality=cardinality)

Time how long it takes to join on unsorted categorical columns

In [40]:
%%timeit -n1 -r3

df_left.join(
    df_right,
    on="id"
)

66.9 ms ± 16.3 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


Sort the categorical columns in new `DataFrames`

In [41]:
df_left_sorted = df_left.sort("id")
df_right_sorted = df_right.sort("id")

Time how long it takes to join on sorted categorical columns

In [42]:
%%timeit -n1 -r3

df_left_sorted.join(
    df_right_sorted,
    on="id"
)

58 ms ± 8.87 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


Cast the categorical columns to strings in new `DataFrames`

In [44]:
df_left_string = df_left.with_columns(
    pl.col("id").cast(pl.Utf8)
)

df_right_string = df_right.with_columns(
    pl.col("id").cast(pl.Utf8)
)

Time how long it takes to join on string columns

In [45]:
%%timeit -n1 -r1

df_left_string.join(
    df_right_string,
    on="id"
)

123 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Now cast the `id` columns to a `pl.Enum` dtype

In [53]:
id_enum = pl.Enum(
    pl.concat(
        [
            df_left.select(pl.col("id")).cast(pl.String),
            df_right.select(pl.col("id")).cast(pl.String)
        ]
    )["id"].unique().to_list()
)

df_left_enum = df_left.with_columns(pl.col("id").cast(pl.String).cast(id_enum))
df_right_enum = df_right.with_columns(pl.col("id").cast(pl.String).cast(id_enum))

Time the join with the `pl.Enum` dtype

In [54]:
%%timeit -n1 -r3

df_left_enum.join(
    df_right_enum,
    on="id"
)

71.8 ms ± 40.6 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
