In [0]:
from pyspark.sql.functions import col, lit, current_timestamp, to_date

In [0]:
# Session-level Spark SQL settings applied before any data is read or written.
#   * int96RebaseModeInWrite = CORRECTED: per the Spark docs, INT96 parquet
#     timestamps are written as-is, without rebasing to the legacy calendar.
#   * timeParserPolicy = LEGACY: date/time patterns are interpreted by the
#     pre-Spark-3.0 parser; this affects how the to_date(...) patterns used
#     later in this notebook are evaluated.
session_settings = {
    "spark.sql.parquet.int96RebaseModeInWrite": "CORRECTED",
    "spark.sql.legacy.timeParserPolicy": "LEGACY",
}
for setting_key, setting_value in session_settings.items():
    spark.conf.set(setting_key, setting_value)

In [0]:
# Load the MarketPrice CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
csv_reader = spark.read.option('header', True).option('InferSchema', True)
df = csv_reader.csv("/FileStore/tables/MarketPrice-1.csv")

# Sanity checks on the raw frame: a 10-row sample, the inferred schema, and
# the total row count (count() triggers a full pass over the file).
display(df.limit(10))
df.printSchema()
row_count = df.count()
print("Number of Rows:", row_count)

Commodity_Index,Sensex_Id,Sensex_Name,Sensex_Category,Label_Type,Effective_Date,Start_Date,End_Date,Income,Delta_Value,Target_Id
DISCOUNT,4,1,Top,average,6-Feb-23,14-Jan-23,6-Feb-23,1500,10.0,1068
DISCOUNT,4,1,Top,average,6-Feb-23,14-Jan-23,6-Feb-23,1500,10.0,1071
DISCOUNT,9,1,Top,average,8-Jan-24,7-Oct-23,8-Jan-24,1500,10.0,1068
DISCOUNT,9,1,Top,average,8-Jan-24,7-Oct-23,8-Jan-24,1500,10.0,1071
DISCOUNT,5,2,Top,average,6-Mar-23,7-Feb-23,6-Mar-23,1500,10.0,1068
DISCOUNT,5,2,Top,average,6-Mar-23,7-Feb-23,6-Mar-23,1500,10.0,1071
DISCOUNT,10,2,Top,average,6-Jan-25,9-Jan-24,6-Jan-25,1500,10.0,1068
DISCOUNT,10,2,Top,average,6-Jan-25,9-Jan-24,6-Jan-25,1500,10.0,1071
DISCOUNT,6,3,Top,average,6-Apr-23,7-Mar-23,6-Apr-23,1500,10.0,1068
DISCOUNT,6,3,Top,average,6-Apr-23,7-Mar-23,6-Apr-23,1500,10.0,1071


root
 |-- Commodity_Index: string (nullable = true)
 |-- Sensex_Id: integer (nullable = true)
 |-- Sensex_Name: integer (nullable = true)
 |-- Sensex_Category: string (nullable = true)
 |-- Label_Type: string (nullable = true)
 |-- Effective_Date: string (nullable = true)
 |-- Start_Date: string (nullable = true)
 |-- End_Date: string (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Delta_Value: double (nullable = true)
 |-- Target_Id: integer (nullable = true)

Number of Rows: 267


In [0]:
# Show the column names of the loaded frame and how many there are.
column_names = df.columns
print("Column Names:", column_names)
print("\nNumber of Columns:", len(column_names))

Column Names: ['Commodity_Index', 'Sensex_Id', 'Sensex_Name', 'Sensex_Category', 'Label_Type', 'Effective_Date', 'Start_Date', 'End_Date', 'Income', 'Delta_Value', 'Target_Id']

Number of Columns: 11


#### **Convert Start_Date, End_Date & Effective_Date from string to date, rename columns to snake_case, and add created/updated timestamps and source_database_id**

In [0]:
# Normalise the raw MarketPrice frame in a single projection:
#   * parse the three date strings into DateType columns,
#   * rename every source column to snake_case,
#   * append source_database_id plus created/updated timestamps.
#
# The source dates look like "6-Feb-23" / "14-Jan-23": the day is NOT
# zero-padded. A single "d" accepts one- or two-digit days under both the
# LEGACY and the Spark 3 time-parser policies, whereas "dd" only accepts a
# bare "6" when spark.sql.legacy.timeParserPolicy is LEGACY.
DATE_FORMAT = "d-MMM-yy"

# NOTE: this cell rebinds `df`, so re-running it on its own operates on the
# already-transformed frame rather than on the raw CSV data. Re-run the CSV
# load cell first.
# NOTE: current_timestamp() is evaluated each time a query over this frame
# executes, so created_datetime/updated_datetime differ between actions
# (compare the display output here with the later %sql preview).
df = df.select(
    col("Commodity_Index").alias("commodity_index"),
    col("Sensex_Id").alias("sensex_id"),
    col("Sensex_Name").alias("sensex_name"),
    col("Sensex_Category").alias("sensex_category"),
    col("Label_Type").alias("label_type"),
    to_date(col("Effective_Date"), DATE_FORMAT).alias("effective_date"),
    to_date(col("Start_Date"), DATE_FORMAT).alias("start_date"),
    to_date(col("End_Date"), DATE_FORMAT).alias("end_date"),
    col("Income").alias("income"),
    col("Delta_Value").alias("delta_value"),
    col("Target_Id").alias("target_id"),
    lit("Azure").alias("source_database_id"),
    current_timestamp().alias("created_datetime"),
    current_timestamp().alias("updated_datetime"),
)

display(df.limit(10))
df.printSchema()

commodity_index,sensex_id,sensex_name,sensex_category,label_type,effective_date,start_date,end_date,income,delta_value,target_id,source_database_id,created_datetime,updated_datetime
DISCOUNT,4,1,Top,average,2023-02-06,2023-01-14,2023-02-06,1500,10.0,1068,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,4,1,Top,average,2023-02-06,2023-01-14,2023-02-06,1500,10.0,1071,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,9,1,Top,average,2024-01-08,2023-10-07,2024-01-08,1500,10.0,1068,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,9,1,Top,average,2024-01-08,2023-10-07,2024-01-08,1500,10.0,1071,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,5,2,Top,average,2023-03-06,2023-02-07,2023-03-06,1500,10.0,1068,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,5,2,Top,average,2023-03-06,2023-02-07,2023-03-06,1500,10.0,1071,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,10,2,Top,average,2025-01-06,2024-01-09,2025-01-06,1500,10.0,1068,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,10,2,Top,average,2025-01-06,2024-01-09,2025-01-06,1500,10.0,1071,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,6,3,Top,average,2023-04-06,2023-03-07,2023-04-06,1500,10.0,1068,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000
DISCOUNT,6,3,Top,average,2023-04-06,2023-03-07,2023-04-06,1500,10.0,1071,Azure,2024-07-06T18:15:13.329+0000,2024-07-06T18:15:13.329+0000


root
 |-- commodity_index: string (nullable = true)
 |-- sensex_id: integer (nullable = true)
 |-- sensex_name: integer (nullable = true)
 |-- sensex_category: string (nullable = true)
 |-- label_type: string (nullable = true)
 |-- effective_date: date (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- income: integer (nullable = true)
 |-- delta_value: double (nullable = true)
 |-- target_id: integer (nullable = true)
 |-- source_database_id: string (nullable = false)
 |-- created_datetime: timestamp (nullable = false)
 |-- updated_datetime: timestamp (nullable = false)



In [0]:
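# NOTE(review): commented-out draft. `price_point` is not defined in the visible
# cells, and the date pattern used here ("dd-MMM-yyyy") differs from the one in
# the live transformation above. Confirm whether this cell can be removed.
# drop_cols = ["Price_Effective_Date", "Price_Period_Start_Date", "Price_Period_End_Date"]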
# price_point_cols = [col for col in price_point.columns if col not in drop_cols] 

# price_point = price_point.select(
#     *price_point_cols,
#     current_timestamp().alias("created_datetime"),
#     current_timestamp().alias("updated_datetime"),
#     to_date(col("Price_Period_Start_Date"), "dd-MMM-yyyy").alias("Price_Period_Start_Date"),
#     to_date(col("Price_Period_End_Date"), "dd-MMM-yyyy").alias("Price_Period_End_Date"),
#     to_date(col("Price_Effective_Date"), "dd-MMM-yyyy").alias("Price_Effective_Date")
# )

In [0]:
# Register the transformed frame as a temporary view so the %sql cells below
# can query it by name.
temp_view_name = "MarketPrice_tmp"
df.createOrReplaceTempView(temp_view_name)

In [0]:
%sql
-- Preview the first 10 rows of the temp view registered from the DataFrame.
SELECT *
FROM MarketPrice_tmp
LIMIT 10;

commodity_index,sensex_id,sensex_name,sensex_category,label_type,effective_date,start_date,end_date,income,delta_value,target_id,source_database_id,created_datetime,updated_datetime
DISCOUNT,4,1,Top,average,2023-02-06,2023-01-14,2023-02-06,1500,10.0,1068,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,4,1,Top,average,2023-02-06,2023-01-14,2023-02-06,1500,10.0,1071,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,9,1,Top,average,2024-01-08,2023-10-07,2024-01-08,1500,10.0,1068,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,9,1,Top,average,2024-01-08,2023-10-07,2024-01-08,1500,10.0,1071,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,5,2,Top,average,2023-03-06,2023-02-07,2023-03-06,1500,10.0,1068,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,5,2,Top,average,2023-03-06,2023-02-07,2023-03-06,1500,10.0,1071,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,10,2,Top,average,2025-01-06,2024-01-09,2025-01-06,1500,10.0,1068,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,10,2,Top,average,2025-01-06,2024-01-09,2025-01-06,1500,10.0,1071,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,6,3,Top,average,2023-04-06,2023-03-07,2023-04-06,1500,10.0,1068,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000
DISCOUNT,6,3,Top,average,2023-04-06,2023-03-07,2023-04-06,1500,10.0,1071,Azure,2024-07-06T18:16:32.620+0000,2024-07-06T18:16:32.620+0000


In [0]:
%sql
-- Rebuild tbl_enriched_MarketPrice from the MarketPrice_tmp view.
-- The table is dropped first, so every run replaces its full contents.
-- Columns are listed explicitly so the table layout does not silently change
-- if the view gains or reorders columns.
DROP TABLE IF EXISTS tbl_enriched_MarketPrice;

CREATE TABLE tbl_enriched_MarketPrice AS
SELECT
    src.commodity_index,
    src.sensex_id,
    src.sensex_name,
    src.sensex_category,
    src.label_type,
    src.effective_date,
    src.start_date,
    src.end_date,
    src.income,
    src.delta_value,
    src.target_id,
    src.source_database_id,
    src.created_datetime,
    src.updated_datetime
FROM MarketPrice_tmp AS src;

In [0]:
%sql
-- Read back the enriched table to inspect what was written.
SELECT *
FROM tbl_enriched_MarketPrice;