# Combine Dallas and Chicago Stage Tables

In [0]:
import os
import warnings

from pyspark.sql import functions as F

# Connection options passed to every Snowflake read and write in this notebook.
# The password is intentionally not stored here: it is read from the
# SNOWFLAKE_PASSWORD environment variable so the secret never ends up in
# source control or in exported copies of the notebook.
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed and should be rotated.
sfOptions = {
    "sfURL": "KOMAXUA-FHA53164.snowflakecomputing.com",
    "sfDatabase": "FOOD_INSPECTION_DB",
    "sfSchema": "RAW_STAGE_SCHEMA",
    "sfWarehouse": "DADABI_WH",
    "sfRole": "DEVELOPER",
    "sfUser": "DADABI_USER",
    "sfPassword": os.environ.get("SNOWFLAKE_PASSWORD", ""),
}

# Surface a missing secret here, in the config cell, instead of leaving the
# reader to decode an authentication error from a later read/write cell.
if not sfOptions["sfPassword"]:
    warnings.warn(
        "SNOWFLAKE_PASSWORD is not set; Snowflake reads and writes will fail to authenticate."
    )

In [0]:
def _read_stage_table(table_name):
    """Return a DataFrame over the Snowflake table `table_name`, read with `sfOptions`."""
    return (
        spark.read
        .format("snowflake")
        .options(**sfOptions)
        .option("dbtable", table_name)
        .load()
    )


# Dallas: the ID is only cast to string so both frames share a type for the union.
df_dallas_stg = _read_stage_table("STG_DALLAS").withColumn(
    "inspection_id",
    F.col("inspection_id").cast("string"),
)

# Chicago: the ID is cast to string and given a "_chicago" suffix.
df_chicago_stg = _read_stage_table("STG_CHICAGO").withColumn(
    "inspection_id",
    F.concat(F.col("inspection_id").cast("string"), F.lit("_chicago")),
)

# Preview the first five rows of each frame.
df_dallas_stg.show(5)
df_chicago_stg.show(5)

+-------------+----------------+-------------+--------------------+-------+------+-----+---------------+---------------+----+-------+--------------+---------------------+--------------------+
|inspection_id|   BUSINESS_NAME|BUSINESS_TYPE|             ADDRESS|ZIPCODE|  CITY|STATE|INSPECTION_TYPE|INSPECTION_DATE|RISK|RESULTS|VIOLATION_CODE|VIOLATION_DESCRIPTION|  VIOLATION_COMMENTS|
+-------------+----------------+-------------+--------------------+-------+------+-----+---------------+---------------+----+-------+--------------+---------------------+--------------------+
| 10000_dallas|7-ELEVEN #38461A|      Unknown|2450 GUS THOMASSO...|  75228|Dallas|   TX|        Routine|     2021-11-22| Low|   Pass|         -9999|              Unknown|             Unknown|
| 10001_dallas|         FRESHII|      Unknown|2414 VICTORY PARK LN|  75219|Dallas|   TX|        Routine|     2022-09-17| Low|   Pass|            21|   RFSM - Not On Site|Sec. 17-2.2(c)(1)...|
| 10001_dallas|         FRESHII|      Un

In [0]:
# Print the Chicago frame's column names, types and nullability so they can be
# compared with the Dallas schema before the two frames are unioned.
df_chicago_stg.printSchema()

root
 |-- inspection_id: string (nullable = true)
 |-- BUSINESS_NAME: string (nullable = true)
 |-- BUSINESS_TYPE: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- ZIPCODE: decimal(38,0) (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- INSPECTION_TYPE: string (nullable = true)
 |-- INSPECTION_DATE: date (nullable = true)
 |-- RISK: string (nullable = true)
 |-- RESULTS: string (nullable = true)
 |-- VIOLATION_CODE: decimal(38,0) (nullable = true)
 |-- VIOLATION_DESCRIPTION: string (nullable = true)
 |-- VIOLATION_COMMENTS: string (nullable = true)



In [0]:
# Print the Dallas frame's column names, types and nullability so they can be
# compared with the Chicago schema before the two frames are unioned.
df_dallas_stg.printSchema()

root
 |-- inspection_id: string (nullable = false)
 |-- BUSINESS_NAME: string (nullable = true)
 |-- BUSINESS_TYPE: string (nullable = false)
 |-- ADDRESS: string (nullable = true)
 |-- ZIPCODE: decimal(38,0) (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- INSPECTION_TYPE: string (nullable = true)
 |-- INSPECTION_DATE: date (nullable = true)
 |-- RISK: string (nullable = false)
 |-- RESULTS: string (nullable = false)
 |-- VIOLATION_CODE: decimal(38,0) (nullable = true)
 |-- VIOLATION_DESCRIPTION: string (nullable = true)
 |-- VIOLATION_COMMENTS: string (nullable = true)



In [0]:
# Tag every row with the city it came from ("dallas" / "chicago") by adding a
# literal `source` column to each stage frame.
df_dallas_stg, df_chicago_stg = (
    stage_df.withColumn("source", F.lit(city))
    for stage_df, city in ((df_dallas_stg, "dallas"), (df_chicago_stg, "chicago"))
)

# Stack the two frames, matching columns by name rather than by position.
df_combined = df_dallas_stg.unionByName(df_chicago_stg)

In [0]:
import uuid

# A fresh random UUID each time this cell runs; the same value is stamped on every row.
job_id = str(uuid.uuid4())

# Append the audit columns: the run's job_id and the load date
# (the current timestamp truncated to a date).
df_combined_stg = (
    df_combined
    .withColumn("job_id", F.lit(job_id))
    .withColumn("load_dt", F.current_timestamp().cast("date"))
)

In [0]:
# Preview the combined frame to check that the source, job_id and load_dt
# columns were appended as expected.
df_combined_stg.show()

+-------------+--------------------+-------------+--------------------+-------+------+-----+---------------+---------------+----+-------+--------------+---------------------+--------------------+------+--------------------+----------+
|inspection_id|       BUSINESS_NAME|BUSINESS_TYPE|             ADDRESS|ZIPCODE|  CITY|STATE|INSPECTION_TYPE|INSPECTION_DATE|RISK|RESULTS|VIOLATION_CODE|VIOLATION_DESCRIPTION|  VIOLATION_COMMENTS|source|              job_id|   load_dt|
+-------------+--------------------+-------------+--------------------+-------+------+-----+---------------+---------------+----+-------+--------------+---------------------+--------------------+------+--------------------+----------+
| 10000_dallas|    7-ELEVEN #38461A|      Unknown|2450 GUS THOMASSO...|  75228|Dallas|   TX|        Routine|     2021-11-22| Low|   Pass|         -9999|              Unknown|             Unknown|dallas|74596cf1-124b-44c...|2025-04-20|
| 10001_dallas|             FRESHII|      Unknown|2414 VICTO

### Load the final combined food inspection table into the Snowflake stage table

In [0]:
# Persist the combined frame to Snowflake as STG_FINAL_TABLE.
# "overwrite" mode creates the target table or replaces it if it already exists.
(
    df_combined_stg.write
    .format("snowflake")
    .options(**sfOptions)
    .option("dbtable", "STG_FINAL_TABLE")
    .mode("overwrite")
    .save()
)