In [0]:
import os

def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of passing ``None`` into the Spark
            configuration or into the token endpoint URL.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Required environment variable '{name}' is not set")
    return value


# Service-principal credentials are read from the environment so that no secret
# is stored in the notebook itself.
client_id = _require_env('client_id')
tenant_id = _require_env('tenant_id')
client_secret = _require_env('secret_value')
storage_account = "project1azure1"

# Every ABFS OAuth setting is scoped to this storage account's DFS endpoint.
account_host = f"{storage_account}.dfs.core.windows.net"
oauth_settings = {
    f"fs.azure.account.auth.type.{account_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{account_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for setting_key, setting_value in oauth_settings.items():
    spark.conf.set(setting_key, setting_value)

Reading data from ADLS

In [0]:
# Load the silver-layer Delta tables. Delta tables carry their own schema, so
# CSV-style reader options such as `header` do not apply and are not passed.
silver_root = f"abfss://silver@{storage_account}.dfs.core.windows.net"

df_event = spark.read.format("delta").load(f"{silver_root}/sales/events")
df_item_properties = spark.read.format("delta").load(f"{silver_root}/inventory/item_properties")

In [0]:
from pyspark.sql.functions import col, when, count, lit, max, round 
from pyspark.sql.types import IntegerType

from pyspark.sql import Window
from pyspark.sql import functions as F


def latest_value_per_item(property_rows, output_column):
    """Return one row per ``itemid`` holding its most recent ``value``.

    Args:
        property_rows: DataFrame with ``itemid``, ``timestamp`` and ``value``
            columns, already filtered to a single property.
        output_column: name given to the ``value`` column in the result.

    Returns:
        DataFrame with columns ``itemid`` and ``output_column``, containing
        exactly one row per ``itemid``.

    Joining ``max(timestamp)`` back onto the rows keeps every row that shares
    the latest timestamp, which duplicates ``itemid`` in the dimension. Ranking
    inside a window and keeping rank 1 guarantees a single row; ``value`` is
    used as a secondary sort key so ties are resolved deterministically.
    """
    newest_first = Window.partitionBy("itemid").orderBy(
        F.col("timestamp").desc(), F.col("value").desc()
    )
    return (
        property_rows
        .withColumn("_recency_rank", F.row_number().over(newest_first))
        .filter(F.col("_recency_rank") == 1)
        .select("itemid", F.col("value").alias(output_column))
    )


# Current category column
category_df = df_item_properties.filter(F.col("property") == "categoryid")
latest_category_df = latest_value_per_item(category_df, "current_category_id")

# Current availability column
availability_df = df_item_properties.filter(F.col("property") == "available")
latest_availability_df = latest_value_per_item(availability_df, "current_availability")

# Total views, total add-to-carts and total transactions columns
event_counts = df_event.groupby("itemid", "event").agg(F.count("*").alias("count"))

# The pivot values are listed explicitly so the three output columns always
# exist; items missing an event type get 0 instead of null.
pivot_df = (
    event_counts
    .groupby("itemid")
    .pivot("event", ["view", "addtocart", "transaction"])
    .sum("count")
    .na.fill(0)
    .withColumnRenamed("view", "total_views")
    .withColumnRenamed("addtocart", "total_addtocarts")
    .withColumnRenamed("transaction", "total_purchases")
)

# Left joins keep every item that has events, even when it has no category or
# availability property rows.
dim_item = (
    pivot_df
    .join(latest_category_df, "itemid", "left")
    .join(latest_availability_df, "itemid", "left")
)

# Conversion rate column: purchases per view, rounded to 4 decimals; items with
# zero views get 0.0 rather than a division by zero.
dim_item = dim_item.withColumn(
    "Conversion_rate",
    F.when(F.col("total_views") == 0, F.lit(0.0)).otherwise(
        F.round(F.col("total_purchases") / F.col("total_views"), 4)
    ),
)

# Latest timestamp column: most recent property update of any kind per item
latest_update = df_item_properties.groupBy("itemid").agg(
    F.max(F.col("timestamp")).alias("latest_timestamp")
)
dim_item = dim_item.join(latest_update, "itemid", "left")

display(dim_item)








itemid,total_views,total_addtocarts,total_purchases,current_category_id,current_availability,Conversion_rate,latest_timestamp
91,3,0,0,209.0,0.0,0.0,2015-05-31T03:00:00Z
128,4,0,0,,,0.0,
193,2,0,0,1528.0,0.0,0.0,2015-09-13T03:00:00Z
210,1,0,0,421.0,0.0,0.0,2015-05-31T03:00:00Z
251,5,0,0,342.0,0.0,0.0,2015-06-28T03:00:00Z
375,8,0,0,1173.0,0.0,0.0,2015-05-10T03:00:00Z
412,21,0,0,1018.0,1.0,0.0,2015-09-13T03:00:00Z
417,10,0,0,,,0.0,
481,13,0,0,1192.0,0.0,0.0,2015-05-31T03:00:00Z
496,124,5,2,707.0,0.0,0.0161,2015-09-13T03:00:00Z


Converting the count columns from long type to integer type

In [0]:
# Cast the three event-count columns to IntegerType in a single projection;
# every other column is passed through unchanged and column order is kept.
cols_to_convert = ["total_views", "total_addtocarts", "total_purchases"]

dim_item = dim_item.select(
    *[
        col(name).cast(IntegerType()).alias(name) if name in cols_to_convert else col(name)
        for name in dim_item.columns
    ]
)

dim_item.display()

itemid,total_views,total_addtocarts,total_purchases,current_category_id,current_availability,Conversion_rate,latest_timestamp
91,3,0,0,209.0,0.0,0.0,2015-05-31T03:00:00Z
128,4,0,0,,,0.0,
193,2,0,0,1528.0,0.0,0.0,2015-09-13T03:00:00Z
210,1,0,0,421.0,0.0,0.0,2015-05-31T03:00:00Z
251,5,0,0,342.0,0.0,0.0,2015-06-28T03:00:00Z
375,8,0,0,1173.0,0.0,0.0,2015-05-10T03:00:00Z
412,21,0,0,1018.0,1.0,0.0,2015-09-13T03:00:00Z
417,10,0,0,,,0.0,
481,13,0,0,1192.0,0.0,0.0,2015-05-31T03:00:00Z
496,124,5,2,707.0,0.0,0.0161,2015-09-13T03:00:00Z


Saving data to ADLS

In [0]:
# Persist the item dimension as a Delta table in the gold container, replacing
# any existing data and schema at that path.
gold_dim_item_path = "abfss://gold@project1azure1.dfs.core.windows.net/dim_item"

(
    dim_item.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_dim_item_path)
)
