# Chicago Food Inspection Data Transformation

### Read Chicago parquet file from silver container in azure data lake

In [0]:
# Define storage account info
storage_account_name = "foodinspection2025stg"
storage_account_key = "uI7JBnr/H6GB9Qhcglge+1gciYSvBpIm/G98cSsiJp6Cos+kqBfDQClmGCWWpZ+wXGtP3SsVeCbj+AStiA9Jvg=="
container_name = "silver"
mount_point = "/mnt/silver"

# Configure the mount with the storage credentials
configs = {
  f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key
}

try:
  dbutils.fs.mount(
    source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net",
    mount_point = mount_point,
    extra_configs = configs
  )
  print(f"Successfully mounted {container_name} to {mount_point}")
except Exception as e:
  print(f"Mount point already exists or error: {str(e)}")

df_chicago = spark.read.parquet(f"{mount_point}/chicago_parquet.parquet")

# Display sample data
df_chicago.show(5)

# Get schema information
df_chicago.printSchema()

# Count rows
row_count = df_chicago.count()
print(f"Total rows: {row_count}")

Mount point already exists or error: An error occurred while calling o399.mount.
: java.rmi.RemoteException: java.lang.IllegalArgumentException: requirement failed: Directory already mounted: /mnt/silver; nested exception is: 
	java.lang.IllegalArgumentException: requirement failed: Directory already mounted: /mnt/silver
	at com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:135)
	at com.databricks.backend.daemon.data.client.DbfsClient.sendIdempotent(DbfsClient.scala:69)
	at com.databricks.backend.daemon.dbutils.DBUtilsCore.createOrUpdateMount(DBUtilsCore.scala:1053)
	at com.databricks.backend.daemon.dbutils.DBUtilsCore.$anonfun$mount$1(DBUtilsCore.scala:1079)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:571)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:667)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:685)
	at com.dat

### Transformations

Lookup license_number to fill facility type, if exists.

Split violations into code, description, and comments

Rules followed for transformation:
1. if violation code is not present, and description is not present -> insert (9999, 'Unknown')
2. if violation code is not present, and description is present -> insert (-1, description)
3. if violation code is present, and description is present -> insert (code, desc)

In [0]:
from pyspark.sql import functions as F

df_chicago.filter(F.col("license_number") == 2967251).select("license_number", "facility_type", "business_name").show(truncate=False)

+--------------+-------------+-------------+
|license_number|facility_type|business_name|
+--------------+-------------+-------------+
|2967251       |Restaurant   |MANO A MANO  |
|2967251       |Restaurant   |MANO A MANO  |
|2967251       |Unknown      |MANO A MANO  |
+--------------+-------------+-------------+



In [0]:
# Create a mapping of license_number → known facility_type
facility_lookup = df_chicago \
    .filter(F.col("facility_type") != "Unknown") \
    .groupBy("license_number") \
    .agg(F.first("facility_type", ignorenulls=True).alias("facility_type"))


# Join the original data with the lookup table on license_number
df_enriched = df_chicago.alias("main") \
    .join(
        facility_lookup.alias("lookup"),
        on="license_number",
        how="left"
    )

# Replace 'Unknown' with matched value using coalesce
df_chicago_data = df_enriched.withColumn(
    "facility_type_filled",
    F.coalesce(
        F.when(F.col("main.facility_type") == "Unknown", F.col("lookup.facility_type")),
        F.col("main.facility_type")
    )
).drop("facility_type").withColumnRenamed("facility_type_filled", "facility_type")


In [0]:
df_chicago_data.filter(F.col('license_number')==2967251).select("license_number", "business_name","facility_type").show()

+--------------+-------------+-------------+
|license_number|business_name|facility_type|
+--------------+-------------+-------------+
|       2967251|  MANO A MANO|   Restaurant|
|       2967251|  MANO A MANO|   Restaurant|
|       2967251|  MANO A MANO|   Restaurant|
+--------------+-------------+-------------+



In [0]:

df_chicago_copy = df_chicago_data.select("*")

df_arrays = (
    df_chicago_copy
    # strip leading/trailing quotes (if any) then split & drop empty pieces
    .withColumn(
        "violations_arr",
        F.array_remove(
            F.split(
                F.regexp_replace(F.col("violations"), r'^"+|"+$', ""),
                r"\s*\|\s*"
            ),
            ""  
        )
    )
    .withColumn("violation_code",
        F.transform("violations_arr", lambda v: 
            F.when(F.lower(F.trim(v)) == "unknown", F.lit(-9999))
                .when(F.regexp_extract(v, r"^(\d+)\.", 1) != "",
                    F.regexp_extract(v, r"^(\d+)\.", 1).cast("int"))
                .when((F.regexp_extract(v, r"^(\d+)\.", 1) == "") &
                    (F.regexp_extract(v, r"^\d+\.\s*([^-]+)", 1) != ""), 
                    F.lit(-1))
                .otherwise(F.lit(-9999))
        )
    )
    # build an array of descriptions
    .withColumn("violation_description",
        F.transform("violations_arr", lambda v:
            F.when(F.lower(F.trim(v)) == "unknown", F.lit("Unknown"))
             .when(
                 F.regexp_extract(F.trim(v),
                                  r"^\d+\.\s*(.*?)\s*-\s*Comments:", 1) != "",
                 F.regexp_extract(F.trim(v),
                                  r"^\d+\.\s*(.*?)\s*-\s*Comments:", 1)
             )
             .otherwise(F.lit("Unknown"))
        )
    )
    # build an array of comments
    .withColumn("violation_comments",
        F.transform("violations_arr", lambda v:
            F.when(F.lower(F.trim(v)) == "unknown", F.lit("Unknown"))
                .when(F.regexp_extract(v, r"Comments:\s*(.*)", 1) != "",
                    F.regexp_extract(v, r"Comments:\s*(.*)", 1))
                .otherwise(F.lit("Unknown"))
        )
    )
    # clean up
    .drop("violations_arr")
)

df_arrays.select("inspection_id", "business_name",
                 "violation_code",
                 "violation_description",
                 "violation_comments"
).show(5, truncate=False)


+-------------+-------------------------------------------------+--------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
df_arrays.printSchema()


root
 |-- license_number: integer (nullable = true)
 |-- inspection_id: integer (nullable = true)
 |-- business_name: string (nullable = true)
 |-- risk: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- inspection_date: date (nullable = true)
 |-- inspection_type: string (nullable = true)
 |-- result: string (nullable = true)
 |-- violations: string (nullable = true)
 |-- facility_type: string (nullable = true)
 |-- violation_code: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- violation_description: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- violation_comments: array (nullable = true)
 |    |-- element: string (containsNull = false)



Check if the length of arrays in violation code, description and comments column is the same

In [0]:
# compute lengths and a consistency flag
df_check_consistency = df_arrays \
  .withColumn("len_code", F.size("violation_code")) \
  .withColumn("len_desc", F.size("violation_description")) \
  .withColumn("len_comm", F.size("violation_comments")) \
  .withColumn("arrays_consistent",
      (F.col("len_code") == F.col("len_desc")) &
      (F.col("len_code") == F.col("len_comm"))
  )

# view every row’s lengths + consistency
df_check_consistency.select("inspection_id","len_code","len_desc","len_comm","arrays_consistent") \
        .show(10,truncate=False)


+-------------+--------+--------+--------+-----------------+
|inspection_id|len_code|len_desc|len_comm|arrays_consistent|
+-------------+--------+--------+--------+-----------------+
|2589123      |1       |1       |1       |true             |
|2612492      |1       |1       |1       |true             |
|2568082      |14      |14      |14      |true             |
|2570409      |1       |1       |1       |true             |
|2497976      |3       |3       |3       |true             |
|2500224      |20      |20      |20      |true             |
|2576428      |1       |1       |1       |true             |
|2561780      |1       |1       |1       |true             |
|2611510      |1       |1       |1       |true             |
|2605098      |1       |1       |1       |true             |
+-------------+--------+--------+--------+-----------------+
only showing top 10 rows



In [0]:
# count how many mismatches there are
mismatch_count = df_check_consistency.filter(~F.col("arrays_consistent")).count()
print(f"Rows with inconsistent array‐lengths: {mismatch_count}")

Rows with inconsistent array‐lengths: 0


Check how many rows have no code and Unknown description

In [0]:
df_unknown = df_arrays.filter(
    F.expr("exists(violation_description, desc -> desc = 'Unknown')")
)

df_unknown.select("inspection_id", "violation_code", "violation_description").show(truncate=False)

+-------------+--------------+----------------------------------------------------------------------------------------------+
|inspection_id|violation_code|violation_description                                                                         |
+-------------+--------------+----------------------------------------------------------------------------------------------+
|2589123      |[-9999]       |[Unknown]                                                                                     |
|2612492      |[-9999]       |[Unknown]                                                                                     |
|2576428      |[-9999]       |[Unknown]                                                                                     |
|2561780      |[-9999]       |[Unknown]                                                                                     |
|2611510      |[-9999]       |[Unknown]                                                                               

### Flatten the data

In [0]:
df_copy = df_arrays.select("*")

# Create a "pos" column to maintain the relationship between arrays
df_exploded = df_copy.withColumn(
    "pos", 
    F.expr("sequence(0, greatest(size(violation_code), size(violation_description), size(violation_comments)) - 1)")
)

# Explode the position array to create one row per violation
df_exploded = df_exploded.withColumn("pos", F.explode(F.col("pos")))

# Extract the corresponding elements for each position
df_flattened = df_exploded.withColumn(
    "temp_violation_code", 
    F.expr("if(pos < size(violation_code), violation_code[pos], null)")
)

df_flattened = df_flattened.withColumn(
    "temp_violation_description", 
    F.expr("if(pos < size(violation_description), violation_description[pos], null)")
)

df_flattened = df_flattened.withColumn(
    "temp_violation_comments", 
    F.expr("if(pos < size(violation_comments), violation_comments[pos], null)")
)

# Filter out rows where all three values are null
df_flattened = df_flattened.filter(
    (F.col("temp_violation_code").isNotNull()) | 
    (F.col("temp_violation_description").isNotNull()) |
    (F.col("temp_violation_comments").isNotNull())
)

# Get all columns except the ones we're replacing
columns_to_keep = [col for col in df_flattened.columns if col not in 
                   ["pos", "temp_violation_code", "temp_violation_description", 
                    "temp_violation_comments", "violation_code", "violation_description", 
                    "violation_comments"]]

# Create the final DataFrame with the flattened values
df_final = df_flattened.select(
    *columns_to_keep,
    F.col("temp_violation_code").alias("violation_code"),
    F.col("temp_violation_description").alias("violation_description"),
    F.col("temp_violation_comments").alias("violation_comments")
)

df_final.show(truncate=False)

+--------------+-------------+-------------------------------------------------+------+------------------+-------+-----+-------+---------------+---------------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
df_chicago_food_inspection = df_final.withColumnRenamed("facility_type", "business_type").withColumnRenamed("street_address", "address").withColumnRenamed("result", "results")

In [0]:
# Define the list of columns to keep
final_columns = [
    'inspection_id',
    'business_name',
    'business_type',
    'license_number',
    'address',
    'zipcode',
    'city',
    'state',
    'inspection_type',
    'inspection_date',
    'risk',
    'results',
    'violation_code',
    'violation_description',
    'violation_comments'
    
]

df_chicago_final = df_chicago_food_inspection.select(*final_columns)

df_chicago_final.show(truncate=False)

+-------------+-------------------------------------------------+-----------------------+--------------+------------------+-------+-------+-----+---------------+---------------+------+--------+--------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|inspection_id|business_name                                    |business_type          |license_number|address           |zipcode|city   |state|inspection_type|inspection_date|risk  |results |violation_code|violation_description                                                            |violation_comments                                                                                                               

In [0]:
df_chicago_final.count()

260047

### Load the cleaned and transformed data to Snowflake Chicago stage table

In [0]:
# Set up Snowflake options
sfOptions = {
    "sfURL": "KOMAXUA-FHA53164.snowflakecomputing.com",  # Snowflake URL (without "https://")
    "sfDatabase": "FOOD_INSPECTION_DB",                 # Snowflake database name
    "sfSchema": "RAW_STAGE_SCHEMA",                     # Snowflake schema name
    "sfWarehouse": "DADABI_WH",                         # Snowflake warehouse name
    "sfRole": "DEVELOPER",                              # Snowflake role (optional)
    "sfUser": "DADABI_USER",                            # Snowflake username
    "sfPassword": "snowflake123#"                       # Snowflake password
}

# Write DataFrame to Snowflake
(
    df_chicago_final.write
    .format("net.snowflake.spark.snowflake")
    .options(**sfOptions)
    .option("dbtable", "STG_CHICAGO")
    .mode("overwrite")
    .save()
)