Data Engineering - Skilling - Data Analysis and SQL - Replace case clause with pattern tables - Spark
=====================================================================================================

# References
* [Data Engineering helpers - Skilling - Data Analysis with SQL - Redame](https://github.com/data-engineering-helpers/data-engineering-skilling/blob/main/Data-Analysis-SQL.md#replace-case-clause-with-pattern-tables)
  * [Data Engineering helpers - Skilling - Data Analysis with SQL - Jupyter notebook adapted to PySpark (this notebook)](https://github.com/data-engineering-helpers/data-engineering-skilling/blob/main/notebooks/data-analysis-sql-001-replace-case-with-pattern-table-spark.ipynb)
  * [Data Engineering helpers - Skilling - Data Analysis with SQL - Original Jupyter notebook with DuckDB](https://github.com/data-engineering-helpers/data-engineering-skilling/blob/main/notebooks/data-analysis-sql-001-replace-case-with-pattern-table-duckdb.ipynb)
* Date: Aug. 2025
* Author: Matt Martin
  ([Matt Martin on LinkedIn](https://www.linkedin.com/in/mattmartin14/),
   [Matt Martin's Substack blog](https://substack.com/@performancede))
* Article on Substack: https://performancede.substack.com/p/handling-fuzzy-matching-of-transactions
* [Original Git repository with the code (Jupyter notebook)](https://github.com/mattmartin14/dream_machine/blob/main/substack/articles/2025.08.15-fuzzy_like_matching/scratchpad.ipynb)


# Setup

In [1]:
import sys, platform
print(sys.version)
print(platform.python_version())

3.12.11 (main, Jun 16 2025, 19:21:30) [Clang 17.0.0 (clang-1700.0.13.5)]
3.12.11


In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
print(spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/15 17:01:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


3.5.2


# Table/temporary view creations

## Grocery categories: pattern matching table

In [8]:
ddl_gc = """
create or replace temporary view grocery_categories as (
  select '%banana%' as item_pattern, 'fruit' as item_category
  union
  select '%apple%' as item_pattern, 'fruit' as item_category
  union
  select '%orange%' as item_pattern, 'fruit' as item_category
  union
  select '%grape%' as item_pattern, 'fruit' as item_category
  union
  select '%strawberry%' as item_pattern, 'fruit' as item_category
  union
  select '%lettuce%' as item_pattern, 'vegetable' as item_category
  union
  select '%spinach%' as item_pattern, 'vegetable' as item_category
  union
  select '%carrot%' as item_pattern, 'vegetable' as item_category
  union
  select '%broccoli%' as item_pattern, 'vegetable' as item_category
  union
  select '%cucumber%' as item_pattern, 'vegetable' as item_category
  union
  select '%chicken%' as item_pattern, 'meat' as item_category
  union
  select '%beef%' as item_pattern, 'meat' as item_category
  union
  select '%pork%' as item_pattern, 'meat' as item_category
  union
  select '%salmon%' as item_pattern, 'seafood' as item_category
  union
  select '%shrimp%' as item_pattern, 'seafood' as item_category
  union
  select '%milk%' as item_pattern, 'dairy' as item_category
  union
  select '%cheese%' as item_pattern, 'dairy' as item_category
  union
  select '%yogurt%' as item_pattern, 'dairy' as item_category
  union
  select '%bread%' as item_pattern, 'bakery' as item_category
  union
  select '%bagel%' as item_pattern, 'bakery' as item_category
);
"""
spark.sql(ddl_gc).toPandas()

In [9]:
select_gc = "select * from grocery_categories"
spark.sql(select_gc).toPandas()

Unnamed: 0,item_pattern,item_category
0,%banana%,fruit
1,%apple%,fruit
2,%orange%,fruit
3,%grape%,fruit
4,%strawberry%,fruit
5,%lettuce%,vegetable
6,%spinach%,vegetable
7,%carrot%,vegetable
8,%broccoli%,vegetable
9,%cucumber%,vegetable


## Transaction/main table

In [10]:
ddl_txns = """
create or replace temporary view txns as (
  select '2024-05-01' as pstd_dt, '4 Pack Banana' as item_desc, 2.99 as charge_amt
union
  select '2024-05-03' as pstd_dt, 'Shrimp 3 lbs' as item_desc, 15.50 as charge_amt
union
  select '2024-05-05' as pstd_dt, 'organic Apple' as item_desc, 1.25 as charge_amt
union
  select '2024-05-07' as pstd_dt, 'Whole milk' as item_desc, 3.49 as charge_amt
union
  select '2024-05-09' as pstd_dt, 'Bagel Dozen' as item_desc, 5.99 as charge_amt
union
  select '2024-05-11' as pstd_dt, 'CHEDDAR cheese' as item_desc, 4.25 as charge_amt
union
  select '2024-05-13' as pstd_dt, 'Baby Spinach' as item_desc, 2.50 as charge_amt
union
  select '2024-05-15' as pstd_dt, 'Carrot bunch' as item_desc, 1.99 as charge_amt
union
  select '2024-05-17' as pstd_dt, 'Beef Steak' as item_desc, 12.99 as charge_amt
union
  select '2024-05-19' as pstd_dt, 'Salmon Fillet' as item_desc, 13.75 as charge_amt
union
  select '2024-05-21' as pstd_dt, 'Grape Bunch' as item_desc, 3.10 as charge_amt
union
  select '2024-05-23' as pstd_dt, 'Lettuce Head' as item_desc, 1.75 as charge_amt
union
  select '2024-05-25' as pstd_dt, 'Pork Chops' as item_desc, 9.50 as charge_amt
union
  select '2024-05-27' as pstd_dt, 'YOGURT cup' as item_desc, 1.10 as charge_amt
union
  select '2024-05-29' as pstd_dt, 'Cucumber' as item_desc, 1.00 as charge_amt
union
  select '2024-06-01' as pstd_dt, 'Chicken breast' as item_desc, 8.25 as charge_amt
union
  select '2024-06-03' as pstd_dt, 'Strawberry Box' as item_desc, 4.00 as charge_amt
union
  select '2024-06-05' as pstd_dt, 'Orange Juice' as item_desc, 3.99 as charge_amt
union
  select '2024-06-07' as pstd_dt, 'Whole Wheat bread' as item_desc, 2.50 as charge_amt
union
  select '2024-06-09' as pstd_dt, 'Broccoli Crown' as item_desc, 2.20 as charge_amt
union
  select '2024-06-11' as pstd_dt, 'Bagel SANDWICH' as item_desc, 6.50 as charge_amt
union
  select '2024-06-13' as pstd_dt, 'Spinach Salad' as item_desc, 5.00 as charge_amt
union
  select '2024-06-15' as pstd_dt, 'Cheese PLATTER' as item_desc, 10.00 as charge_amt
union
  select '2024-06-17' as pstd_dt, 'Milk Chocolate' as item_desc, 2.75 as charge_amt
union
  select '2024-06-19' as pstd_dt, 'Apple Pie' as item_desc, 7.00 as charge_amt
union
  select '2024-06-21' as pstd_dt, 'Shrimp COCKTAIL' as item_desc, 14.00 as charge_amt
union
  select '2024-06-23' as pstd_dt, 'Banana Bread' as item_desc, 4.50 as charge_amt
union
  select '2024-06-25' as pstd_dt, 'Beef BURGER' as item_desc, 11.00 as charge_amt
union
  select '2024-06-27' as pstd_dt, 'PORK ribs' as item_desc, 13.00 as charge_amt
union
  select '2024-06-29' as pstd_dt, 'Grape Jelly' as item_desc, 2.80 as charge_amt
);
"""
spark.sql(ddl_txns).toPandas()

In [11]:
select_txns = "select * from txns"
spark.sql(select_txns).toPandas()

Unnamed: 0,pstd_dt,item_desc,charge_amt
0,2024-05-01,4 Pack Banana,2.99
1,2024-05-03,Shrimp 3 lbs,15.5
2,2024-05-05,organic Apple,1.25
3,2024-05-07,Whole milk,3.49
4,2024-05-09,Bagel Dozen,5.99
5,2024-05-11,CHEDDAR cheese,4.25
6,2024-05-13,Baby Spinach,2.5
7,2024-05-15,Carrot bunch,1.99
8,2024-05-17,Beef Steak,12.99
9,2024-05-19,Salmon Fillet,13.75


# Category resolution with the join on pattern table (rather than case clause)

In [13]:
select_with_pattern = """
select month(t.pstd_dt) as pstd_month,
       gc.item_category as grocery_category,
       sum(t.charge_amt) as total_charges
from txns as t
left join grocery_categories as gc 
       on t.item_desc ilike gc.item_pattern
group by pstd_month, grocery_category
order by pstd_month asc, total_charges desc
"""
spark.sql(select_with_pattern).toPandas()

Unnamed: 0,pstd_month,grocery_category,total_charges
0,5,seafood,29.25
1,5,meat,22.49
2,5,dairy,8.84
3,5,fruit,7.34
4,5,vegetable,7.24
5,5,bakery,5.99
6,6,meat,32.25
7,6,fruit,22.29
8,6,seafood,14.0
9,6,bakery,13.5
