#### How to add metadata columns to bronze tables using lit()
- load_date
- execution_date_time

In [0]:
# Clear every widget currently defined on this notebook before declaring the
# parameter below.
# NOTE(review): Databricks documents that a widget should not be created in
# the same cell that calls removeAll() — confirm this cell behaves reliably,
# or move the removeAll() call into its own cell.
dbutils.widgets.removeAll()

# Declare a text widget named "SCHEDULE_DATE" with an empty default value and
# the label "SCHEDULE_DATE".
dbutils.widgets.text("SCHEDULE_DATE", "", "SCHEDULE_DATE")

# Read the widget's current value. Later cells cast it to a date for the
# load_date column.
# NOTE(review): the default is an empty string; presumably an empty or
# malformed value turns into a NULL load_date after the cast — confirm.
PARAM_SCHEDULE_DATE = dbutils.widgets.get("SCHEDULE_DATE")

# Echo the parameter so the value used by this run is visible in the output.
print("PARAM_SCHEDULE_DATE: ", PARAM_SCHEDULE_DATE)

PARAM_SCHEDULE_DATE:  2025-04-05


In [0]:
from datetime import datetime,timezone
from pyspark.sql.functions import lit
from pyspark.sql.types import DateType, TimestampType

In [0]:
def get_current_utc_datetime():
    """Return the current UTC time as a ``YYYY-MM-DDTHH:MM:SS`` string.

    The value is taken from a timezone-aware ``datetime`` in UTC and
    formatted to second precision; the returned text carries neither
    fractional seconds nor a UTC offset suffix.
    """
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%S")

|  expression                       |  example output                    |
|-----------------------------------|------------------------------------|
|  datetime.now()                   |  2025-08-12 12:27:42.326252        |
|  datetime.now(timezone.utc)       |  2025-08-12 12:34:31.814015+00:00  |
|  dt.strftime("%Y-%m-%dT%H:%M:%S") |  2025-08-12T12:38:32               |

#### 1) initial load
**a) First execution**

In [0]:
# Load the first batch: the first CSV line supplies the column names and
# Spark infers each column's type from the data.
full_data_df_first = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/default/@azureadb/from_unixtime.csv")
)
display(full_data_df_first)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC
101,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264
102,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895
103,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264
104,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327
105,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327
106,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327
107,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327
108,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327
109,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327
110,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327


In [0]:
# Stamp every row of the first batch with two literal metadata columns:
#   load_date           - the SCHEDULE_DATE widget value, cast to a date
#   execution_date_time - the UTC time of this run, cast to a timestamp
# NOTE(review): the timestamp string has no UTC offset, so the cast
# presumably resolves it in the Spark session time zone — confirm it is UTC.
full_data_df_first = (
    full_data_df_first
    .withColumn("load_date", lit(PARAM_SCHEDULE_DATE).cast(DateType()))
    .withColumn(
        "execution_date_time",
        lit(get_current_utc_datetime()).cast(TimestampType()),
    )
)

display(full_data_df_first)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC,load_date,execution_date_time
101,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
102,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895,2025-04-05,2025-08-12T12:03:54.000Z
103,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
104,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
105,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
106,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
107,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
108,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
109,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
110,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z


In [0]:
# Expose the first batch to the %sql cells as the temporary view
# "bronze_table", replacing any view already registered under that name.
full_data_df_first.createOrReplaceTempView("bronze_table")

In [0]:
%sql
-- Row count per (load_date, execution_date_time) pair in bronze_table.
-- After the first execution only one pair is expected.
SELECT load_date, execution_date_time, COUNT(*)
FROM bronze_table
GROUP BY load_date, execution_date_time;

load_date,execution_date_time,COUNT(*)
2025-04-05,2025-08-12T12:03:54.000Z,110


**b) Second execution**

In [0]:
# Load the second batch: the first CSV line supplies the column names and
# Spark infers each column's type from the data.
full_data_df_second = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/default/@azureadb/from_unixtime_01.csv")
)
display(full_data_df_second)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC
211,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264
222,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895
233,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264
244,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327
255,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327
266,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327
277,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327
288,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327
299,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327
310,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327


In [0]:
# Stamp every row of the second batch with two literal metadata columns:
#   load_date           - the SCHEDULE_DATE widget value, cast to a date
#   execution_date_time - the UTC time of this run, cast to a timestamp
# NOTE(review): the timestamp string has no UTC offset, so the cast
# presumably resolves it in the Spark session time zone — confirm it is UTC.
full_data_df_second = (
    full_data_df_second
    .withColumn("load_date", lit(PARAM_SCHEDULE_DATE).cast(DateType()))
    .withColumn(
        "execution_date_time",
        lit(get_current_utc_datetime()).cast(TimestampType()),
    )
)

display(full_data_df_second)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC,load_date,execution_date_time
211,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:04:00.000Z
222,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895,2025-04-05,2025-08-12T12:04:00.000Z
233,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:04:00.000Z
244,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
255,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
266,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
277,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
288,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
299,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z
310,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:00.000Z


In [0]:
# Append the second batch to the first.
# unionByName pairs columns by name instead of by position, so the two
# batches (whose schemas are inferred independently from separate CSV files)
# line up correctly even if their column order differs; a missing or
# mismatched column name raises an error instead of silently misaligning data.
uniondf_sec = full_data_df_first.unionByName(full_data_df_second)
display(uniondf_sec)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC,load_date,execution_date_time
101,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
102,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895,2025-04-05,2025-08-12T12:03:54.000Z
103,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
104,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
105,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
106,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
107,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
108,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
109,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
110,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z


In [0]:
# Re-register the temporary view "bronze_table" so it now covers the combined
# first and second batches, replacing the previous definition.
uniondf_sec.createOrReplaceTempView("bronze_table")

In [0]:
%sql
-- Row count per (load_date, execution_date_time) pair in bronze_table.
-- Each execution has its own execution_date_time, so one row per run appears.
SELECT load_date, execution_date_time, COUNT(*)
FROM bronze_table
GROUP BY load_date, execution_date_time;

load_date,execution_date_time,COUNT(*)
2025-04-05,2025-08-12T12:03:54.000Z,110
2025-04-05,2025-08-12T12:04:00.000Z,121


#### 2) incremental load

In [0]:
# Load the incremental batch: the first CSV line supplies the column names
# and Spark infers each column's type from the data.
full_data_df_third = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/default/@azureadb/from_unixtime_02.csv")
)
display(full_data_df_third)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC
1532,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264
1533,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895
1534,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264
1535,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327
1536,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327
1537,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327
1538,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327
1539,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327
1540,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327
1541,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327


In [0]:
# Stamp every row of the incremental batch with two literal metadata columns:
#   load_date           - the SCHEDULE_DATE widget value, cast to a date
#   execution_date_time - the UTC time of this run, cast to a timestamp
# NOTE(review): the timestamp string has no UTC offset, so the cast
# presumably resolves it in the Spark session time zone — confirm it is UTC.
full_data_df_third = (
    full_data_df_third
    .withColumn("load_date", lit(PARAM_SCHEDULE_DATE).cast(DateType()))
    .withColumn(
        "execution_date_time",
        lit(get_current_utc_datetime()).cast(TimestampType()),
    )
)

display(full_data_df_third)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC,load_date,execution_date_time
1532,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:04:08.000Z
1533,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895,2025-04-05,2025-08-12T12:04:08.000Z
1534,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:04:08.000Z
1535,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1536,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1537,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1538,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1539,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1540,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z
1541,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:04:08.000Z


In [0]:
# Append the incremental batch to the previously combined batches.
# unionByName pairs columns by name instead of by position, so the
# independently inferred CSV schemas line up correctly even if their column
# order differs; a missing or mismatched column name raises an error instead
# of silently misaligning data.
uniondf_final = uniondf_sec.unionByName(full_data_df_third)
display(uniondf_final)

task_id,Commodity_Index,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id,Input_Timestamp_UTC,Update_Timestamp_UTC,load_date,execution_date_time
101,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
102,DISCOUNT,06-Feb-23,14-Jan-23,06-Feb-23,1500,10,1071,1710234895,1710234895,2025-04-05,2025-08-12T12:03:54.000Z
103,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1068,1709109264,1709109264,2025-04-05,2025-08-12T12:03:54.000Z
104,DISCOUNT,08-Jan-24,07-Oct-23,08-Jan-24,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
105,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
106,DISCOUNT,06-Mar-23,07-Feb-23,06-Mar-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
107,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
108,DISCOUNT,06-Jan-25,09-Jan-24,06-Jan-25,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
109,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1068,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z
110,DISCOUNT,06-Apr-23,07-Mar-23,06-Apr-23,1500,10,1071,1707813327,1707813327,2025-04-05,2025-08-12T12:03:54.000Z


In [0]:
# Re-register the temporary view "bronze_table" so it now covers all three
# batches, replacing the previous definition.
uniondf_final.createOrReplaceTempView("bronze_table")

In [0]:
%sql
-- Row count per (load_date, execution_date_time) pair in bronze_table.
-- Each execution has its own execution_date_time, so one row per run appears.
SELECT load_date, execution_date_time, COUNT(*)
FROM bronze_table
GROUP BY load_date, execution_date_time;

load_date,execution_date_time,COUNT(*)
2025-04-05,2025-08-12T12:03:54.000Z,110
2025-04-05,2025-08-12T12:04:00.000Z,121
2025-04-05,2025-08-12T12:04:08.000Z,121


In [0]:
%sql
-- Data-quality check: number of rows whose load_date is NULL.
-- A non-zero count means some rows did not receive a load_date value.
SELECT COUNT(*) FROM bronze_table
WHERE load_date IS NULL;

COUNT(*)
0


In [0]:
%sql
-- Data-quality check: task_id values that occur more than once.
-- An empty result means every task_id in bronze_table is unique.
SELECT task_id, COUNT(*)
FROM bronze_table
GROUP BY task_id
HAVING COUNT(*) > 1;

task_id,COUNT(*)


In [0]:
%sql
-- Data-quality check: number of rows whose task_id is NULL.
SELECT COUNT(*) FROM bronze_table
WHERE task_id IS NULL;

COUNT(*)
0
