In [0]:
# Load the two bronze-layer inputs: raw device messages and the rapid step
# test records.
df_device, df_steps = (
    spark.table("workspace.bronze.device_messages"),
    spark.table("workspace.bronze.rapid_step_tests"),
)

# Preview both tables, device messages first.
for preview_frame in (df_device, df_steps):
    display(preview_frame)

In [0]:
from pyspark.sql.functions import regexp_extract, col, lit

# Derive an integer "distance_cm" column from the free-text "distance" column
# (e.g. "1cm" -> 1). Only the first run of digits is captured, so a value such
# as "1.5cm" would yield 1.
distance_digits = regexp_extract(col("distance"), r"(\d+)", 1)
df_device = df_device.withColumn("distance_cm", distance_digits.cast("int"))

In [0]:
# Tag every row with the table it came from via a constant "source" column.
# The two assignments are independent of each other.
# NOTE(review): df_steps_window later selects only device_id/start_time/stop_time
# from df_steps, so the "step" tag is not carried into the labeled output.
df_steps = df_steps.withColumn("source", lit("step"))
df_device = df_device.withColumn("source", lit("device"))

In [0]:
# Keep only the columns needed to match device messages against step tests:
# the device key and the start/stop bounds of each test.
df_steps_window = df_steps.select("device_id", "start_time", "stop_time")

In [0]:
from pyspark.sql.functions import when

# A device message belongs to a step test when it comes from the same device
# and its timestamp lies within [start_time, stop_time] (both ends inclusive).
same_device = col("d.device_id") == col("s.device_id")
inside_window = col("d.timestamp").between(col("s.start_time"), col("s.stop_time"))

# Left join keeps every device message; messages without a matching window get
# NULLs for the "s" columns, which is what the label below keys on.
# NOTE(review): a message that falls inside more than one window for its device
# is emitted once per matching window -- confirm windows never overlap or repeat.
df_labeled = df_device.alias("d").join(
    df_steps_window.alias("s"),
    on=same_device & inside_window,
    how="left",
)

df_labeled = df_labeled.withColumn(
    "step_label",
    when(col("s.start_time").isNotNull(), "step").otherwise("no_step"),
)

In [0]:
# Columns kept for the output table. device_id is qualified with the "d" alias
# because both sides of the join carry a device_id column.
final_columns = [
    "timestamp",
    "sensor_type",
    "distance_cm",
    "d.device_id",
    "step_label",
    "source",
]
df_final = df_labeled.select(*final_columns)

display(df_final)

In [0]:
# Make workspace.silver the current schema so the unqualified table name below
# is resolved against it.
spark.sql("USE workspace.silver")

# Persist the labeled data as a table for ML. The table is replaced on every
# run, and its schema is allowed to change along with the data.
labeled_writer = (
    df_final.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
labeled_writer.saveAsTable("labeled_step_test")

In [0]:
%sql
-- Number of rows per step_label value in the saved table.
SELECT step_label, COUNT(*) AS row_count
FROM labeled_step_test
GROUP BY step_label;

In [0]:
%sql
-- Sanity check: up to 50 rows whose step_label is missing or is not one of the
-- two labels written by the labeling step.
SELECT *
FROM labeled_step_test
WHERE step_label IS NULL
   OR step_label NOT IN ('step', 'no_step')
LIMIT 50;

In [0]:
%sql
-- Number of rows per source value in the saved table.
SELECT source, COUNT(*) AS row_count
FROM labeled_step_test
GROUP BY source;

In [0]:
%sql
-- Sanity check: up to 50 rows whose source is missing or is not one of the two
-- tags assigned earlier in the notebook.
SELECT *
FROM labeled_step_test
WHERE source IS NULL
   OR source NOT IN ('device', 'step')
LIMIT 50;

In [0]:
# Rewrite the labeled table from df_final. overwriteSchema is set so this write
# uses the same options as the earlier save of this table in the notebook and
# does not fail on a schema mismatch when df_final's columns differ from the
# stored table (e.g. when this cell is run on its own).
# NOTE(review): this repeats the earlier write of the same table; the
# unqualified name resolves against the current schema -- confirm the
# "USE workspace.silver" cell has run in this session.
df_final.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("labeled_step_test")