# Left, inner, outer, cross and fast-track joins

By default, Polars will not join `null`.

`Inner Join` is the default join method in Polars.

In [1]:
import polars as pl

In [2]:
df_left = pl.DataFrame({"id": ["A", "B", "C", None], "val": [0, 1, 2, 3]})
df_left

id,val
str,i64
"""A""",0
"""B""",1
"""C""",2
,3


In [3]:
df_right = pl.DataFrame({"id": ["A", "C", None, "D"], "val": [10, 11, 12, 13]})
df_right

id,val
str,i64
"""A""",10
"""C""",11
,12
"""D""",13


## Left join
In a left join we return all the rows from the left `DataFrame` and the matched rows from the right `DataFrame`.

Pass the same column name to `on` to match two `DataFrame`

In [4]:
df_left.join(
    other=df_right,
    on="id",
    how="left"
)

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,


`coalesce = False` shows two join column separately.

In [5]:
df_left.join(
    other=df_right,
    on="id",
    how="left",
    coalesce=False
)

id,val,id_right,val_right
str,i64,str,i64
"""A""",0,"""A""",10.0
"""B""",1,,
"""C""",2,"""C""",11.0
,3,,


`nulls_equal=True` to join `null` values

In [6]:
df_left.join(
    other=df_right,
    on="id",
    how="left",
    nulls_equal=True
)

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,12.0


`suffix` to rename the column name

In [7]:
df_left.join(
    other=df_right,
    on="id",
    how="left",
    nulls_equal=True,
    suffix="_r"
)

id,val,val_r
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,12.0


## Inner joins
Only retain the rows in both `DataFrames` where there is a matching join key

In [8]:
df_left.join(
    df_right,
    on="id"
)

id,val,val_right
str,i64,i64
"""A""",0,10
"""C""",2,11


In [9]:
df_left.join(
    df_right,
    on="id",
    nulls_equal=True
)

id,val,val_right
str,i64,i64
"""A""",0,10
"""C""",2,11
,3,12


## Cross join
With a cross join we get the Cartesian product of both tables - so we end up with each row of the left `DataFrame` matched with each row of the right `DataFrame` and there is `no join key`.

It's rarely used.

In [11]:
df_left.join(
    df_right,
    how="cross"
)

id,val,id_right,val_right
str,i64,str,i64
"""A""",0,"""A""",10
"""A""",0,"""C""",11
"""A""",0,,12
"""A""",0,"""D""",13
"""B""",1,"""A""",10
…,…,…,…
"""C""",2,"""D""",13
,3,"""A""",10
,3,"""C""",11
,3,,12


## Full outer join
Returns all rows when there is a match in either left or right `DataFrame`

In [13]:
df_left.join(
    df_right,
    on="id",
    how="full"
)

id,val,id_right,val_right
str,i64,str,i64
"""A""",0.0,"""A""",10.0
"""C""",2.0,"""C""",11.0
,,,12.0
,,"""D""",13.0
"""B""",1.0,,
,3.0,,


In [14]:
df_left.join(
    df_right,
    on="id",
    how="full",
    nulls_equal=True
)

id,val,id_right,val_right
str,i64,str,i64
"""A""",0.0,"""A""",10.0
"""C""",2.0,"""C""",11.0
,3.0,,12.0
,,"""D""",13.0
"""B""",1.0,,


In [15]:
df_left.join(
    df_right,
    on="id",
    how="full",
    coalesce=True
)

id,val,val_right
str,i64,i64
"""A""",0.0,10.0
"""C""",2.0,11.0
,,12.0
"""D""",,13.0
,3.0,
"""B""",1.0,


## Validating joins
Polars allows to validate joins to ensure there is no data lost.

In [16]:
df_left_valid = pl.DataFrame({"id": ["A", "B", "C", None], "val": [0, 1, 2, 3]})
df_left_valid

id,val
str,i64
"""A""",0
"""B""",1
"""C""",2
,3


In [17]:
df_right_valid = pl.DataFrame({"id": ["A", "C", None, "D"], "val": [10, 11, 12, 13]})
df_right_valid

id,val
str,i64
"""A""",10
"""C""",11
,12
"""D""",13


### 1:1 validation
With a 1:1 validation we want each row to match to a unique row in the other `DataFrames`.

In [18]:
df_left_valid.join(
    df_right_valid,
    on="id",
    how="left",
    validate="1:1"
)

id,val,val_right
str,i64,i64
"""A""",0,10.0
"""B""",1,
"""C""",2,11.0
,3,


### m:1 validation
Multiple rows with the same join key in the left `DataFrame` mapping to the same row in the right `DataFrame`

In [24]:
df_left_m = pl.DataFrame({"id": ["A", "A"], "val": [0, 1]})
df_right_m = pl.DataFrame({"id": ["A", "B"], "val": [10, 11]})

In [23]:
df_left_m.join(
    df_right_m,
    on="id",
    how="left",
    validate="m:1"
)

id,val,val_right
str,i64,i64
"""A""",0,
"""A""",1,


## Joining on multiple keys and expressions

In [25]:
df_left_multiple = (
    pl.DataFrame(
        {
            "id": ["A", "B", "A", "B"], 
            "year": [2020, 2020, 2021, 2021], 
            "val": [0, 1, 2, 3]
        }
    )
)
df_left_multiple

id,year,val
str,i64,i64
"""A""",2020,0
"""B""",2020,1
"""A""",2021,2
"""B""",2021,3


In [27]:
df_right_multiple = (
    pl.DataFrame(
        {
            "id": ["a", "b", "a", "b"],
            "year": [2020, 2020, 2021, 2021],
            "val": [10, 11, 12, 13],
        }
    )
)

df_right_multiple

id,year,val
str,i64,i64
"""a""",2020,10
"""b""",2020,11
"""a""",2021,12
"""b""",2021,13


In [28]:
df_left_multiple.join(
    df_right_multiple,
    on=[pl.col("id").str.to_uppercase(), "year"],
    how="inner"
)

id,year,val,id_right,year_right,val_right
str,i64,i64,str,i64,i64
"""A""",2020,0,"""a""",2020,10
"""B""",2020,1,"""b""",2020,11
"""A""",2021,2,"""a""",2021,12
"""B""",2021,3,"""b""",2021,13


## Joins in lazy mode
Do joins in lazy mode by joining on `LazyFrames` instead of `DataFrames`

In [32]:
df_left.lazy().join(
    df_right.lazy(),
    on="id",
    how="inner"
).select(
    "id", "val_right"
).collect()

id,val_right
str,i64
"""A""",10
"""C""",11


## Exercises

### CITES Dataset

In [33]:
csv_file = "data/cites_extract.csv"

In [34]:
df_CITES = pl.read_csv(csv_file)
df_CITES

Year,Importer,Exporter,Taxon,Quantity
i64,str,str,str,f64
2021,"""KR""","""DE""","""Python reticulatus""",12.0
2021,"""TR""","""DE""","""Python reticulatus""",2.0
2021,"""NZ""","""DE""","""Python bivittatus""",2.0
2021,"""TH""","""BJ""","""Python regius""",200.0
2021,"""KR""","""CZ""","""Python bivittatus""",28.0
2021,"""TW""","""DE""","""Python reticulatus""",1.0
2021,"""UA""","""DE""","""Python reticulatus""",4.0


The `DataFrame` shows:
- the `Year` in which the trade occured
- the `Importer` and `Exporter` country in 2-digit ISO country codes
- the scientific name for the `Taxon` and
- the `Quantity` of items in the trade

In [35]:
iso_csv_file = "data/countries_extract.csv"

In [36]:
df_ISO = pl.read_csv(iso_csv_file)
df_ISO

alpha-2,name,region
str,str,str
"""BJ""","""Benin""","""Africa"""
"""CZ""","""Czechia""","""Europe"""
"""KR""","""Korea, Republic of""","""Asia"""
"""NZ""","""New Zealand""","""Oceania"""
"""TW""","""Taiwan, Province of China""","""Asia"""
"""TH""","""Thailand""","""Asia"""
"""TR""","""Turkey""","""Asia"""


This `DataFrame` has:
- `alpha-2`: the 2-letter country code
- `name`: the full name of the country
- `region`: the region of the country

### Exercise 1
For each trade record in `df_CITES` add:
- the full country name of the importer
- the region of the importer

In [37]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="left"
)

Year,Importer,Exporter,Taxon,Quantity,name,region
i64,str,str,str,f64,str,str
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia"""
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia"""
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania"""
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia"""
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia"""
2021,"""UA""","""DE""","""Python reticulatus""",4.0,,


Add to the trade records:
- the full country name of the importer
- the region of the importer

keeping only rows where we can join these values

In [38]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="inner"
)

Year,Importer,Exporter,Taxon,Quantity,name,region
i64,str,str,str,f64,str,str
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia"""
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania"""
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia"""
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia"""
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia"""


Add:
- the full country name of the importer
- the region of the importer

keeping all rows from both `DataFrames`

In [41]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="full"
)

Year,Importer,Exporter,Taxon,Quantity,alpha-2,name,region
i64,str,str,str,f64,str,str,str
,,,,,"""BJ""","""Benin""","""Africa"""
,,,,,"""CZ""","""Czechia""","""Europe"""
2021.0,"""KR""","""DE""","""Python reticulatus""",12.0,"""KR""","""Korea, Republic of""","""Asia"""
2021.0,"""KR""","""CZ""","""Python bivittatus""",28.0,"""KR""","""Korea, Republic of""","""Asia"""
2021.0,"""NZ""","""DE""","""Python bivittatus""",2.0,"""NZ""","""New Zealand""","""Oceania"""
2021.0,"""TW""","""DE""","""Python reticulatus""",1.0,"""TW""","""Taiwan, Province of China""","""Asia"""
2021.0,"""TH""","""BJ""","""Python regius""",200.0,"""TH""","""Thailand""","""Asia"""
2021.0,"""TR""","""DE""","""Python reticulatus""",2.0,"""TR""","""Turkey""","""Asia"""
2021.0,"""UA""","""DE""","""Python reticulatus""",4.0,,,


Create a `DataFrame` that has all combinations of the `Taxon`,`name` and `region` columns with non-duplicated rows

In [43]:
df_CITES.select(
    "Taxon"
).join(
    df_ISO.select("name", "region"),
    how="cross"
).unique()

Taxon,name,region
str,str,str
"""Python reticulatus""","""Thailand""","""Asia"""
"""Python bivittatus""","""Benin""","""Africa"""
"""Python reticulatus""","""Taiwan, Province of China""","""Asia"""
"""Python regius""","""Turkey""","""Asia"""
"""Python bivittatus""","""Taiwan, Province of China""","""Asia"""
…,…,…
"""Python regius""","""Benin""","""Africa"""
"""Python regius""","""Czechia""","""Europe"""
"""Python regius""","""Korea, Republic of""","""Asia"""
"""Python bivittatus""","""Turkey""","""Asia"""


Returning to the `inner` join above validate that the trade records map to unique ISO metadata

In [44]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="left",
    validate="m:1"
)

Year,Importer,Exporter,Taxon,Quantity,name,region
i64,str,str,str,f64,str,str
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia"""
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia"""
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania"""
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia"""
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia"""
2021,"""UA""","""DE""","""Python reticulatus""",4.0,,


Do a left join of the ISO data based on the importer (as earlier) and the same for the exporter. 

Ensure the `name` and `region` columns for importer and exporter are clearly distinguished in the output

In [45]:
df_CITES.join(
    df_ISO,
    left_on="Importer",
    right_on="alpha-2",
    how="left",
    suffix="_importer"
).join(
    df_ISO,
    left_on="Exporter",
    right_on="alpha-2",
    how="left",
    suffix="_exporter"
)

Year,Importer,Exporter,Taxon,Quantity,name,region,name_exporter,region_exporter
i64,str,str,str,f64,str,str,str,str
2021,"""KR""","""DE""","""Python reticulatus""",12.0,"""Korea, Republic of""","""Asia""",,
2021,"""TR""","""DE""","""Python reticulatus""",2.0,"""Turkey""","""Asia""",,
2021,"""NZ""","""DE""","""Python bivittatus""",2.0,"""New Zealand""","""Oceania""",,
2021,"""TH""","""BJ""","""Python regius""",200.0,"""Thailand""","""Asia""","""Benin""","""Africa"""
2021,"""KR""","""CZ""","""Python bivittatus""",28.0,"""Korea, Republic of""","""Asia""","""Czechia""","""Europe"""
2021,"""TW""","""DE""","""Python reticulatus""",1.0,"""Taiwan, Province of China""","""Asia""",,
2021,"""UA""","""DE""","""Python reticulatus""",4.0,,,,


### Exercise 2

In [47]:
import numpy as np

np.random.seed(0)

N = 100_000
cardinality = N // 2

We create a left-hand `DataFrame` with:
- a sorted `id` column and
- a random `values` column

We create a right-hand `DataFrame` with
- a sorted `id` column
- a metadata column (equal to the `id` column in this case)

In [48]:
def createDataFrames(N: int, cardinality: int):
    # Create a random array with values up to cardinality and then sort it to be the `id` column
    sortedArray = np.sort(np.random.randint(0, cardinality, N,dtype=np.int64))
    df_left = pl.DataFrame({"id": sortedArray, "values": np.random.standard_normal(N)})
    # We create the right-hand `DataFrame` with the `id` column and arbitrary metadata
    df_right = pl.DataFrame(
        {"id": [i for i in range(cardinality)], "meta": [i for i in range(cardinality)]}
    )
    return df_left, df_right


df_left, df_right = createDataFrames(N=N, cardinality=cardinality)
df_left.head()

id,values
i64,f64
0,-0.572911
0,1.291253
0,-0.693999
1,-0.083364
2,-1.685631


In [49]:
df_right.head()

id,meta
i64,i64
0,0
1,1
2,2
3,3
4,4


Check the flags if Polars knows the `id` column is sorted on the left and right `DataFrames`

In [51]:
print(df_left["id"].flags)
print(df_right["id"].flags)

{'SORTED_ASC': False, 'SORTED_DESC': False}
{'SORTED_ASC': False, 'SORTED_DESC': False}


Time the performance for an unsorted join

In [54]:
%%timeit -n1 -r3

df_left.join(
    df_right,
    on="id"
)

7.55 ms ± 797 μs per loop (mean ± std. dev. of 3 runs, 1 loop each)


Create new `DataFrames` and tell Polars that the `id` columns are sorted

In [55]:
df_left_sorted = df_left.with_columns(pl.col("id").set_sorted())

df_right_sorted = df_right.with_columns(pl.col("id").set_sorted())

Check the flags to see if Polars knows the `id` column is sorted on these new `DataFrames`

In [56]:
print(df_left_sorted["id"].flags)
print(df_right_sorted["id"].flags)

{'SORTED_ASC': True, 'SORTED_DESC': False}
{'SORTED_ASC': True, 'SORTED_DESC': False}


Time the sorted join performance

In [59]:
%%timeit -n1 -r3

df_left_sorted.join(
    df_right_sorted,
    on="id"
)

3.05 ms ± 621 μs per loop (mean ± std. dev. of 3 runs, 1 loop each)


Compare performance if only the left `DataFrame` is sorted. Hint: use `df_left_sorted` and `df_right`

In [60]:
%%timeit -n1 -r3

df_left_sorted.join(
    df_right,
    on="id"
)

9.2 ms ± 4.42 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
