In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id, concat_ws, lit, floor, rand, expr, current_date, date_sub, when
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Obtain the Spark session: reuse the active one if present, otherwise build it
# (on Databricks a session is already provided).
spark = SparkSession.builder.getOrCreate()

# Read the whole mine_accidents table from Unity Catalog. Note that no row
# filter is applied at load time. Then preview the first five rows.
source_table = "workspace.default.mine_accidents"
df = spark.read.table(source_table)
df.show(n=5)



In [0]:
# Building blocks for the synthetic equipment identifier.
# The numeric suffix is the generated row id modulo 1000, so it lies in 0..999
# and is NOT guaranteed to be unique across rows.
equipment_seq = monotonically_increasing_id() % 1000
# Today's date rendered as yyyyMMdd; the ID therefore depends on the run date.
run_date_tag = F.date_format(current_date(), "yyyyMMdd")

df = (
    df
    # Equipment ID: "EQP-<run date>-<suffix>".
    .withColumn("equipment_id", concat_ws("-", lit("EQP"), run_date_tag, equipment_seq))
    # Equipment age: seeded pseudo-random whole number of years, 1 through 15.
    .withColumn("equipment_age_yrs", floor(rand(seed=42) * 15 + 1))
    # Days since maintenance: seeded pseudo-random whole number, 30 through 179.
    .withColumn("last_maintenance_days", floor(rand(seed=99) * 150 + 30))
)



In [0]:
# Last maintenance date: the accident date minus the synthetic day offset.
# floor() produces a long, so the offset is cast to int explicitly before it is
# handed to date_sub as the day count.
df = df.withColumn(
    "last_maintenance_dt",
    date_sub(col("accident_dt"), col("last_maintenance_days").cast("int")),
)

# Warranty flag: True when the equipment is younger than 3 years, else False.
df = df.withColumn("is_under_warranty", when(col("equipment_age_yrs") < 3, True).otherwise(False))

# Failure probability: age scaled by the 15-year maximum plus seeded noise in
# [0, 0.1). Unclamped, this reaches ~1.1 for the oldest equipment, which is not
# a valid probability, so the value is capped at 1.0.
raw_failure_score = col("equipment_age_yrs") / 15 + rand(seed=88) * 0.1
df = df.withColumn(
    "failure_probability",
    F.least(lit(1.0), raw_failure_score).cast("double"),
)

# Drop the intermediate day offset; only the derived date is kept.
df = df.drop("last_maintenance_days")

In [0]:
# Preview the first five rows of the DataFrame after the column changes above.
df.show(n=5)

In [0]:
# Persist the enriched DataFrame back to the table it was read from.
#
# `df` is a lazy plan whose source is workspace.default.mine_accidents itself,
# so the table must NOT be dropped before the write action runs: a DROP would
# remove the source before it is read, and a failed write would leave no table
# at all. Instead the table is overwritten in place, and `overwriteSchema`
# lets the overwrite replace the schema with the one carrying the new columns.
# NOTE(review): overwriting a table that is also being read relies on the
# table being Delta (the Databricks default) — confirm for this workspace.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.default.mine_accidents")
)