In [0]:
import os

# Service-principal credentials are read from environment variables.
# os.environ.get() returns None for an unset variable, which would otherwise be
# formatted into the token endpoint URL as the literal text "None" or handed to
# spark.conf.set() as a null value, so fail early with a clear message instead.
_required_env_vars = ("client_id", "tenant_id", "secret_value")
_missing_env_vars = [name for name in _required_env_vars if not os.environ.get(name)]
if _missing_env_vars:
    # Only the variable names are reported, never their values.
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

client_id = os.environ["client_id"]
tenant_id = os.environ["tenant_id"]
client_secret = os.environ["secret_value"]
storage_account = "project1azure1"

# Host suffix shared by every per-account configuration key below.
_account_host = f"{storage_account}.dfs.core.windows.net"

# OAuth (client-credentials token provider) settings for the storage account,
# applied to the Spark session in the order listed.
_oauth_settings = {
    f"fs.azure.account.auth.type.{_account_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{_account_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{_account_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{_account_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{_account_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

for _conf_key, _conf_value in _oauth_settings.items():
    spark.conf.set(_conf_key, _conf_value)

In [0]:
# Read the events Delta table from the silver container and render it.
df = (
    spark.read
    .format("delta")
    .load("abfss://silver@project1azure1.dfs.core.windows.net/sales/events")
)
df.display()

timestamp,visitorid,event,itemid,transactionid
2015-09-14T06:27:23Z,404835,view,432902,
2015-09-15T03:50:51Z,409610,view,112239,
2015-09-14T17:53:57Z,1348864,view,286804,
2015-09-14T17:37:16Z,1310032,view,409655,
2015-09-14T16:51:04Z,135727,view,42519,
2015-09-14T17:02:29Z,17738,view,118382,
2015-09-14T16:39:35Z,560891,view,400946,
2015-09-14T21:36:34Z,401658,view,81998,
2015-09-15T04:53:57Z,413162,view,131015,
2015-09-15T00:59:33Z,162285,view,361463,


In [0]:
from pyspark.sql.functions import col, min, max, sum, when, count, lit, datediff, current_date, trim
from pyspark.sql.types import IntegerType

# Strip surrounding whitespace from the event label so the equality checks
# below match values such as "view ".
df = df.withColumn("event", trim(col("event")))


def _event_count(event_name):
    """Return an aggregate expression counting rows whose `event` equals `event_name`.

    NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of this
    cell), not the Python builtin.
    """
    return sum(when(col("event") == event_name, 1).otherwise(0))


# One row per visitor: earliest/latest timestamp plus per-event-type counters.
# `min` / `max` are likewise the pyspark.sql.functions versions.
visitor_stats = df.groupBy("visitorid").agg(
    min("timestamp").alias("FirstVisitDate"),
    max("timestamp").alias("LastVisitDate"),
    _event_count("view").alias("TotalViews"),
    _event_count("addtocart").alias("TotalAddToCarts"),
    _event_count("transaction").alias("TotalPurchases"),
    count("*").alias("TotalEvents"),
)

# Purchases per view; visitors without any view get 0.0 instead of dividing by zero.
conversion_rate = when(
    col("TotalViews") > 0, col("TotalPurchases") / col("TotalViews")
).otherwise(lit(0.0))

# A visitor is flagged invalid when views do not exceed add-to-carts or purchases.
# NOTE(review): `<=` also flags visitors whose views exactly equal their
# add-to-carts or purchases (e.g. 1 view, 1 add-to-cart) - confirm whether a
# strict `<` was intended.
is_valid_funnel = when(
    (col("TotalViews") <= col("TotalAddToCarts")) | (col("TotalViews") <= col("TotalPurchases")),
    lit(False),
).otherwise(lit(True))

dim_visitor = (
    visitor_stats
    # decimal(5,4) tops out at 9.9999, so a visitor with at least ten times more
    # purchases than views would overflow the cast; rows failing the funnel check
    # are kept in this frame, so that case is reachable. Keep 4 decimal places
    # but leave room for larger ratios.
    .withColumn("ConversionRate", conversion_rate.cast("decimal(10,4)"))
    .withColumnRenamed("visitorid", "VisitorID")
    # Final column order
    .select(
        "VisitorID", "FirstVisitDate", "LastVisitDate",
        "TotalViews", "TotalAddToCarts", "TotalPurchases",
        "ConversionRate", "TotalEvents",
    )
    .withColumn("IsValidFunnel", is_valid_funnel)
)

# Show only visitors passing the funnel check, highest conversion rate first.
dim_visitor.where(col("IsValidFunnel")).orderBy(col("ConversionRate").desc()).display()



VisitorID,FirstVisitDate,LastVisitDate,TotalViews,TotalAddToCarts,TotalPurchases,ConversionRate,TotalEvents,IsValidFunnel
219788,2015-05-19T22:47:28Z,2015-07-14T19:40:29Z,15,10,13,0.8667,38,True
1168431,2015-06-09T23:48:45Z,2015-06-16T19:56:37Z,7,5,6,0.8571,18,True
240998,2015-05-16T16:20:55Z,2015-08-18T20:24:10Z,19,18,16,0.8421,53,True
728718,2015-09-13T17:31:23Z,2015-09-13T18:13:01Z,6,5,5,0.8333,16,True
139850,2015-06-04T22:27:24Z,2015-08-25T06:46:21Z,6,5,5,0.8333,16,True
1083786,2015-05-14T21:25:38Z,2015-05-14T21:30:58Z,6,2,5,0.8333,13,True
1104957,2015-08-02T19:23:45Z,2015-08-02T19:55:55Z,5,3,4,0.8,12,True
497100,2015-08-25T19:33:07Z,2015-08-26T14:54:35Z,5,2,4,0.8,11,True
783756,2015-06-23T02:33:12Z,2015-06-23T02:54:46Z,5,4,4,0.8,13,True
374805,2015-09-03T13:25:02Z,2015-09-03T13:43:29Z,5,4,4,0.8,13,True


'problematic = dim_visitor.filter((col("TotalViews") < col("TotalAddToCarts")) | (col("Totalviews") < col("Totalpurchases")))\n\ntotal = dim_visitor.count()\ninvalid = problematic.count()\nprint(f"⚠️ Invalid Visitor rows: {invalid} / {total} = {invalid/total:.2%}") '

In [0]:
cols = ["TotalViews", "TotalAddToCarts", "TotalPurchases", "TotalEvents"]

# Re-project the frame in its existing column order, casting the counter
# columns listed in `cols` to IntegerType and passing every other column through.
dim_visitor = dim_visitor.select(
    *[
        col(name).cast(IntegerType()).alias(name) if name in cols else col(name)
        for name in dim_visitor.columns
    ]
)



In [0]:
# Persist dim_visitor as a Delta table in the gold container, overwriting any
# existing data and allowing the stored schema to be replaced.
(
    dim_visitor.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://gold@project1azure1.dfs.core.windows.net/dim_visitor")
)
