In [0]:
%run /Workspace/Users/appujack799@gmail.com/azure_data_factory_assignment/src/bronze_to_silver/utils

In [0]:
from pyspark.sql.functions import col

# --- Storage locations -------------------------------------------------------
# Root folder of the silver-layer Delta tables read below.
silver_base_path = "abfss://silver@adfassignment07.dfs.core.windows.net/sales-view"

# Destination of the combined store/product/sales Delta output.
# An alternative target in the `gold` container is left disabled here:
# gold_output_path = "abfss://gold@adfassignment07.dfs.core.windows.net/sales_view/store_product_sales_analysis"
gold_output_path = "abfss://unity-catalog-storage@dbstoragemfqavtqukmr2u.dfs.core.windows.net/1680618956163528/sales_view/store_product_sales_analysis"

# --- Load silver tables ------------------------------------------------------
# `read_delta_with_snake_case` comes from the notebook pulled in via %run.
# The sales frame arrives with a double-underscore `product__id` column
# (presumably a side effect of the snake_case conversion - TODO confirm);
# rename it so it lines up with the `product_id` column selected downstream.
sales_df = read_delta_with_snake_case(
    spark, f"{silver_base_path}/customer_sales"
).withColumnRenamed("product__id", "product_id")
product_df = read_delta_with_snake_case(
    spark, f"{silver_base_path}/product"
)
store_df = read_delta_with_snake_case(
    spark, f"{silver_base_path}/store"
)

# --- Combine -----------------------------------------------------------------
# Both helpers are defined in the %run notebook: the first merges product and
# store data, the second attaches that result to the sales records.
store_product_df = get_store_product_data(product_df, store_df)
final_df = enrich_sales_with_store_product(sales_df, store_product_df)

# De-duplicate columns that share a name after the joins, keeping the FIRST
# occurrence of each name.
#
# Dropping by name (`final_df.drop("x")`) removes *every* column called "x",
# so it cannot be used to keep one copy: the duplicated column would vanish
# entirely and later be silently omitted from the output. Instead the columns
# are addressed by position: rename all of them to unique positional names,
# pick the first occurrence of each original name, and alias it back.
all_cols = final_df.columns
duplicate_cols = [col_name for col_name in all_cols if all_cols.count(col_name) > 1]

if duplicate_cols:
    print(f"Duplicate Columns Detected: {duplicate_cols}")

    # Positions of the first occurrence of every distinct column name,
    # in original column order.
    seen_names = set()
    keep_positions = []
    for position, col_name in enumerate(all_cols):
        if col_name not in seen_names:
            seen_names.add(col_name)
            keep_positions.append(position)

    # Unique per-position names make every column unambiguously selectable.
    positional_names = [f"__pos_{position}__" for position in range(len(all_cols))]
    final_df = final_df.toDF(*positional_names).select(
        *[
            col(positional_names[position]).alias(all_cols[position])
            for position in keep_positions
        ]
    )

# Columns requested for the output table, in output order.
selected_cols = [
    # order / sales attributes
    "order_date",
    "category",
    "city",
    "customer_id",
    "order_id",
    "product_id",
    "profit",
    "region",
    "sales",
    "segment",
    "ship_date",
    "ship_mode",
    "latitude",
    "longitude",
    # store / product attributes
    "store_name",
    "location",
    "manager_name",
    "product_name",
    "price",
    "stock_quantity",
    "image_url",
]

# Keep only the requested columns that the joined frame actually contains;
# names that are missing are skipped rather than raising an error.
available_cols = set(final_df.columns)
selected_cols = [wanted for wanted in selected_cols if wanted in available_cols]

final_df = final_df.select(*selected_cols)

# Render the projected frame in the notebook output.
final_df.display()

# Persist the result as a Delta table at the output path. Existing data is
# overwritten and the stored schema is allowed to change with it.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_output_path)
)

# Re-register the table over the freshly written files: remove any existing
# table entry of that name, then create one pointing at the output location.
create_table_sql = f"""
    CREATE TABLE store_product_sales_analysis
    USING DELTA
    LOCATION '{gold_output_path}'
"""
spark.sql("DROP TABLE IF EXISTS store_product_sales_analysis")
spark.sql(create_table_sql)
