# Inequality joins

In [1]:
import polars as pl

## Nearest neighbor joins with `join_asof`

In [2]:
# Five people and the year each was born, listed in ascending Birth_Year
# order (the asof joins on this frame need the key to be sorted).
people_df = pl.DataFrame(
    {
        "Name": ["Alice", "Bob", "Charlotte", "David", "Eve"],
        "Birth_Year": [1950, 1965, 1985, 1996, 2015],
    }
)
people_df

Name,Birth_Year
str,i64
"""Alice""",1950
"""Bob""",1965
"""Charlotte""",1985
"""David""",1996
"""Eve""",2015


In [3]:
# Generation labels with the first birth year of each generation, listed in
# ascending Start_Year order.
generations_df = pl.DataFrame(
    {
        "Generation": [
            "Baby Boomer",
            "Gen X",
            "Millennial",
            "Gen Z",
            "Gen Alpha",
        ],
        "Start_Year": [1946, 1965, 1981, 1997, 2013],
    }
)
generations_df

Generation,Start_Year
str,i64
"""Baby Boomer""",1946
"""Gen X""",1965
"""Millennial""",1981
"""Gen Z""",1997
"""Gen Alpha""",2013


In [4]:
# No strategy is given, so the default applies: each person gets the
# generation with the latest Start_Year that is not after their Birth_Year.
(
    people_df
    .join_asof(generations_df, left_on="Birth_Year", right_on="Start_Year")
)

Name,Birth_Year,Generation,Start_Year
str,i64,str,i64
"""Alice""",1950,"""Baby Boomer""",1946
"""Bob""",1965,"""Gen X""",1965
"""Charlotte""",1985,"""Millennial""",1981
"""David""",1996,"""Millennial""",1981
"""Eve""",2015,"""Gen Alpha""",2013


The default strategy is `backward` (match the last row whose key is less than or equal to the left key); we can also choose `forward` or `nearest`.

In [5]:
# Same join, but match the generation whose Start_Year is closest to the
# Birth_Year in either direction.
(
    people_df
    .join_asof(
        generations_df,
        strategy="nearest",
        left_on="Birth_Year",
        right_on="Start_Year",
    )
)

Name,Birth_Year,Generation,Start_Year
str,i64,str,i64
"""Alice""",1950,"""Baby Boomer""",1946
"""Bob""",1965,"""Gen X""",1965
"""Charlotte""",1985,"""Millennial""",1981
"""David""",1996,"""Gen Z""",1997
"""Eve""",2015,"""Gen Alpha""",2013


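Limit the maximum allowed difference between the join keys with `tolerance`; rows without a match inside the tolerance get nulls in the joined columns.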

In [6]:
# Default-strategy join, but a match is only accepted when Birth_Year and
# Start_Year are within 5 of each other; otherwise the joined columns are null.
(
    people_df
    .join_asof(
        generations_df,
        tolerance=5,
        left_on="Birth_Year",
        right_on="Start_Year",
    )
)

Name,Birth_Year,Generation,Start_Year
str,i64,str,i64
"""Alice""",1950,"""Baby Boomer""",1946.0
"""Bob""",1965,"""Gen X""",1965.0
"""Charlotte""",1985,"""Millennial""",1981.0
"""David""",1996,,
"""Eve""",2015,"""Gen Alpha""",2013.0


## Inequality joins
With `join_where` we can join two `DataFrame`s on one or more inequality conditions.

In [8]:
# One row per customer: the budget they can spend and the minimum bed length
# they require.
customers_df = pl.DataFrame(
    {
        "customer": ["John", "Anna", "Ravi", "Fatima"],
        "budget": [120, 60, 180, 210],
        "min_bed_length": [200, 180, 190, 180],
    }
)

customers_df

customer,budget,min_bed_length
str,i64,i64
"""John""",120,200
"""Anna""",60,180
"""Ravi""",180,190
"""Fatima""",210,180


In [9]:
# One row per hotel: its price and the length of its beds.
hotels_df = pl.DataFrame(
    {
        "hotel": [
            "Budget Inn",
            "Comfort Suites",
            "Luxury Stay",
            "Presidential Palace",
        ],
        "price": [50, 100, 150, 200],
        "bed_length": [210, 185, 185, 185],
    }
)

hotels_df

hotel,price,bed_length
str,i64,i64
"""Budget Inn""",50,210
"""Comfort Suites""",100,185
"""Luxury Stay""",150,185
"""Presidential Palace""",200,185


In [11]:
# Every (customer, hotel) pairing where the hotel's price fits within the
# customer's budget, reduced to the columns involved in that comparison.
(
    customers_df
    .join_where(hotels_df, pl.col("budget") >= pl.col("price"))
    .select(["customer", "budget", "hotel", "price"])
)

customer,budget,hotel,price
str,i64,str,i64
"""Fatima""",210,"""Presidential Palace""",200
"""Fatima""",210,"""Luxury Stay""",150
"""Fatima""",210,"""Comfort Suites""",100
"""Fatima""",210,"""Budget Inn""",50
"""Ravi""",180,"""Luxury Stay""",150
"""Ravi""",180,"""Comfort Suites""",100
"""Ravi""",180,"""Budget Inn""",50
"""John""",120,"""Comfort Suites""",100
"""John""",120,"""Budget Inn""",50
"""Anna""",60,"""Budget Inn""",50


In [12]:
# Both predicates must hold for a pairing to be kept: the hotel is affordable
# and its bed is strictly longer than the customer's minimum.
(
    customers_df
    .join_where(
        hotels_df,
        pl.col("budget") >= pl.col("price"),
        pl.col("bed_length") > pl.col("min_bed_length"),
    )
)

customer,budget,min_bed_length,hotel,price,bed_length
str,i64,i64,str,i64,i64
"""Anna""",60,180,"""Budget Inn""",50,210
"""Fatima""",210,180,"""Comfort Suites""",100,185
"""Fatima""",210,180,"""Luxury Stay""",150,185
"""Fatima""",210,180,"""Presidential Palace""",200,185
"""Fatima""",210,180,"""Budget Inn""",50,210
"""Ravi""",180,190,"""Budget Inn""",50,210
"""John""",120,200,"""Budget Inn""",50,210


## Exercises

In [13]:
# Load the taxi trips and order them by pickup time, so the frame is sorted
# on the key used for the asof join with the weather data.
taxi_df = pl.read_parquet("data/nyc_trip_data_600k.parquet").sort("pickup")
taxi_df.head()

VendorID,pickup,dropoff,passenger_count,trip_distance
i32,datetime[μs],datetime[μs],f32,f32
2,2022-01-01 00:00:08,2022-01-01 00:14:14,1.0,7.94
2,2022-01-01 00:00:11,2022-01-01 00:14:29,1.0,5.55
2,2022-01-01 00:00:17,2022-01-01 00:31:04,1.0,3.86
2,2022-01-01 00:00:18,2022-01-01 00:13:27,1.0,3.37
2,2022-01-01 00:00:20,2022-01-01 00:20:51,1.0,4.89


In [15]:
# Weather observations (one row per hour in the preview below).
# join_asof needs both frames ordered on their join keys. taxi_df is sorted on
# "pickup", so sort this frame on "time" as well instead of relying on the row
# order stored in the parquet file.
weather_df = (
    pl.read_parquet("data/nyc_weather.parquet")
    .sort("time")
)
weather_df.head()

time,temperature,precipitation
datetime[μs],f64,f64
2022-01-01 00:00:00,8.7,0.0
2022-01-01 01:00:00,8.7,0.0
2022-01-01 02:00:00,8.6,0.0
2022-01-01 03:00:00,8.5,0.0
2022-01-01 04:00:00,8.2,0.0


Join the weather data to the taxi data using an appropriate strategy

In [16]:
# Match each trip to the weather observation closest in time to its pickup
# (earlier or later) and preview the first rows.
(
    taxi_df
    .join_asof(weather_df, left_on="pickup", right_on="time", strategy="nearest")
    .head()
)

VendorID,pickup,dropoff,passenger_count,trip_distance,time,temperature,precipitation
i32,datetime[μs],datetime[μs],f32,f32,datetime[μs],f64,f64
2,2022-01-01 00:00:08,2022-01-01 00:14:14,1.0,7.94,2022-01-01 00:00:00,8.7,0.0
2,2022-01-01 00:00:11,2022-01-01 00:14:29,1.0,5.55,2022-01-01 00:00:00,8.7,0.0
2,2022-01-01 00:00:17,2022-01-01 00:31:04,1.0,3.86,2022-01-01 00:00:00,8.7,0.0
2,2022-01-01 00:00:18,2022-01-01 00:13:27,1.0,3.37,2022-01-01 00:00:00,8.7,0.0
2,2022-01-01 00:00:20,2022-01-01 00:20:51,1.0,4.89,2022-01-01 00:00:00,8.7,0.0


Make a bar chart of the relationship between `precipitation` and `trip_distance`:
- first group by precipitation (rounded off to one decimal place)
- take the mean of trip distance
- make a bar chart with precipitation on the x-axis and trip distance on the y-axis
- add an appropriate title to the chart

In [20]:
# Pair each trip with the weather reading closest in time to its pickup, then
# average the trip distance per precipitation level (rounded to one decimal).
(
    taxi_df.join_asof(
        weather_df,
        left_on="pickup",
        right_on="time",
        strategy="nearest",
    )
    # Keep precipitation numeric: casting it to a string would make the chart
    # order the bars lexicographically ("10.0" before "2.0").
    .group_by(pl.col("precipitation").round(1))
    .agg(pl.col("trip_distance").mean())
    # group_by does not guarantee row order; sort for a deterministic frame.
    .sort("precipitation")
    .plot.bar(
        # ":O" / ":N" are Altair type shorthands: they request one discrete
        # bar (and one colour) per rounded value, ordered by numeric value.
        x="precipitation:O",
        y="trip_distance",
        color="precipitation:N",
    )
    .properties(width=700, title="Mean trip distance by precipitation level in mm")
)