In [0]:
import dlt
from pyspark.sql.functions import col, weekofyear, row_number, length, lag, when
from pyspark.sql.window import Window
from pyspark.sql import SparkSession, SQLContext, functions as F

In [0]:
# Source snapshot table read once at module level and marked for caching;
# the dataset definitions in this notebook reuse this DataFrame.
dfBasedTable = (
    spark.read
    .table("ds_goc_bronze_dev.ds_goc_silver_dev.structured_statuspage")
    .cache()
)

In [0]:
# Step 1: Create a base table with necessary columns and additional calculated columns
@dlt.view(
    comment="This is just a view of the information to be used in the dashboard"
)
def basedTable():
    """Return the filtered subscriber rows plus per-email ordering columns.

    Keeps rows whose EmailValidity is 'valid', EmailType is 'outside' and
    BlankStatus is 'valid', renames the status-page columns, and adds:

    * ``iso_week_num``  - ``weekofyear(snapshot_date)``.
    * ``row_num``       - ``row_number()`` per Email ordered by snapshot_date.
    * ``client_status`` - 'New' for every row on the email's first
      snapshot_date, 'Regular' otherwise.

    Returns:
        DataFrame ordered by snapshot_date.
    """
    per_email_by_date = Window.partitionBy("Email").orderBy("snapshot_date")

    valid_rows = dfBasedTable.filter(
        (F.col("EmailValidity") == 'valid')
        & (F.col("EmailType") == 'outside')
        & (F.col("BlankStatus") == 'valid')
    )

    # rank() (not row_number()) drives client_status: when an email has several
    # rows on its first snapshot_date, row_number() would flag only one
    # arbitrary row as 'New', and that row is not necessarily the one a later
    # per-week selection keeps. rank() gives all tied rows the value 1.
    # When there are no ties the two functions agree.
    client_status = (
        F.when(F.rank().over(per_email_by_date) == 1, 'New')
        .otherwise('Regular')
        .alias("client_status")
    )

    return (
        valid_rows.select(
            "Email", "FirstName", "LastName", "Notification",
            F.col("StatusPageName").alias("productName"),
            F.col("StatusPageNameCount").alias("controlProduct"),
            "CompanyNameSource", "snapshot_date",
            F.weekofyear("snapshot_date").alias("iso_week_num"),
            # NOTE: row_num is still arbitrary among rows sharing a snapshot_date.
            F.row_number().over(per_email_by_date).alias("row_num"),
            client_status,
        )
        .orderBy("snapshot_date")
    )

In [0]:
# Step 2: Find the highest week number present in the source table
@dlt.view(
    comment="Identify the latest week number"
)
def latest_week():
    """Return a single-row DataFrame holding the largest ``iso_week_num``.

    The week number is ``weekofyear(snapshot_date)`` computed over the
    unfiltered source table; the result is empty when the source has no rows.

    NOTE(review): week numbers restart every year, so the largest number is
    not necessarily the most recent week once the data spans a year
    boundary - confirm the expected date range of the source.
    """
    week_numbers = (
        dfBasedTable
        .select(F.weekofyear("snapshot_date").alias("iso_week_num"))
        .distinct()
    )
    newest_first = week_numbers.orderBy(F.col("iso_week_num").desc())
    return newest_first.limit(1)

In [0]:
# Step 3: Identify the previous week number
@dlt.view(
    comment="Identify the previous week number"
)
def previous_week():
    """Return a single-row DataFrame with the largest week number below the latest.

    The result has one column, ``iso_week_num``, and is empty when the source
    has fewer than two distinct week numbers.

    Implementation notes:
    * The comparison is done with a join against the ``latest_week`` view
      rather than ``collect()``: pulling a value to the driver runs an action
      while the dataset is being defined and raised ``IndexError`` when
      ``latest_week`` was empty.
    * The filter uses ``iso_week_num``. After ``select(...).distinct()`` the
      ``snapshot_date`` column is no longer part of the frame, so filtering on
      ``weekofyear("snapshot_date")`` at that point cannot be resolved.
    """
    # latest_week yields at most one row; rename its column to avoid ambiguity.
    latest = dlt.read("latest_week").select(
        F.col("iso_week_num").alias("latest_week_num")
    )
    week_numbers = (
        dfBasedTable
        .select(F.weekofyear("snapshot_date").alias("iso_week_num"))
        .distinct()
    )
    return (
        week_numbers
        .join(latest, F.col("iso_week_num") < F.col("latest_week_num"), "inner")
        .select("iso_week_num")
        .orderBy(F.col("iso_week_num").desc())
        .limit(1)
    )

In [0]:
# Step 4: Get emails from the previous week
@dlt.view(
    comment="Get emails from the previous week"
)
def previous_emails():
    """Return ``Email`` / ``iso_week_num`` for source rows in the previous week.

    The target week comes from the ``previous_week`` view (at most one row).
    A left-semi join replaces the former ``collect()[0][0]`` lookup, which
    raised ``IndexError`` when there was no previous week and executed an
    action while the dataset was being defined. With no previous week the
    result is simply empty.
    """
    target_week = dlt.read("previous_week").select("iso_week_num")
    emails_by_week = dfBasedTable.select(
        "Email", F.weekofyear("snapshot_date").alias("iso_week_num")
    )
    return (
        emails_by_week
        .join(target_week, "iso_week_num", "left_semi")
        # A using-column join puts the key first; restore the original order.
        .select("Email", "iso_week_num")
    )

In [0]:
# Step 5: Get emails from the current week
@dlt.view(
    comment="Get emails from the current week"
)
def current_emails():
    """Return ``Email`` / ``iso_week_num`` for source rows in the latest week.

    The target week comes from the ``latest_week`` view (at most one row).
    A left-semi join replaces the former ``collect()[0][0]`` lookup, which
    raised ``IndexError`` on an empty source and executed an action while the
    dataset was being defined. With no latest week the result is empty.
    """
    target_week = dlt.read("latest_week").select("iso_week_num")
    emails_by_week = dfBasedTable.select(
        "Email", F.weekofyear("snapshot_date").alias("iso_week_num")
    )
    return (
        emails_by_week
        .join(target_week, "iso_week_num", "left_semi")
        # A using-column join puts the key first; restore the original order.
        .select("Email", "iso_week_num")
    )

In [0]:
# Step 6: Identify removed users by comparing previous and current week emails
@dlt.view(
    comment="Identify removed users by comparing previous and current week emails"
)
def removed_users():
    """Return emails present in ``previous_emails`` but absent from ``current_emails``.

    Columns: ``Email`` and ``removed_week`` (the previous week's number).
    The result is deduplicated: ``previous_emails`` carries one row per source
    row, and leaving the duplicates in would multiply every matching row of
    any later join on ``Email``.
    """
    last_week = dlt.read("previous_emails")
    this_week = dlt.read("current_emails")
    return (
        last_week
        .join(this_week, "Email", "left_anti")
        .select(F.col("Email"), F.col("iso_week_num").alias("removed_week"))
        .distinct()
    )

In [0]:
# Step 7: Attach removal information to the base rows and drop duplicates
@dlt.view(
    comment="Select distinct records from the base table and join with removed users"
)
def final_table():
    """Left-join ``basedTable`` with ``removed_users`` on ``Email``.

    Adds ``subscription_status`` ('Removed' when the email has a
    ``removed_week``, otherwise 'Active') and returns the distinct rows of the
    selected columns.
    """
    base_rows = dlt.read("basedTable")
    removals = dlt.read("removed_users")

    subscription_status = (
        F.when(F.col("removed_week").isNotNull(), "Removed")
        .otherwise("Active")
        .alias("subscription_status")
    )

    with_removals = base_rows.join(removals, "Email", "left")
    projected = with_removals.select(
        "Email", "FirstName", "LastName", "Notification",
        "productName", "controlProduct", "CompanyNameSource",
        "iso_week_num", "client_status", "snapshot_date",
        subscription_status,
        "removed_week",
    )
    return projected.distinct()

In [0]:
# Step 8: Keep only the earliest status for each week and select the product name with the shortest length
@dlt.view(
    comment="Keep only the earliest status for each week and select the product name with the shortest length"
)
def earliest_status():
    """Number the rows of ``final_table`` within each (Email, iso_week_num).

    Rows are ordered by ``snapshot_date``, then by the length of
    ``productName``, then by ``productName`` itself. The last key is a
    tie-breaker only: without it, two products of equal length on the same
    date tie and ``row_number()`` may pick either one from run to run.

    Returns:
        The input columns plus ``rn`` (1 = first row of the group).
    """
    per_email_week = (
        Window.partitionBy("Email", "iso_week_num")
        .orderBy("snapshot_date", F.length("productName"), "productName")
    )
    return (
        dlt.read("final_table")
        .withColumn("rn", F.row_number().over(per_email_week))
        .select(
            "Email", "FirstName", "LastName", "Notification", "controlProduct",
            "CompanyNameSource", "iso_week_num", "client_status", "subscription_status",
            "removed_week", "snapshot_date", "productName", "rn",
        )
    )

In [0]:
# Step 9: Retain the first-ranked row of every (Email, week) group
@dlt.view(
    comment="Filter to keep only the earliest status for each week"
)
def cleanSubscriptionStatus():
    """Return the rows of ``earliest_status`` whose ``rn`` equals 1.

    The helper columns ``rn`` and ``snapshot_date`` are not carried forward.
    """
    output_columns = [
        "Email", "FirstName", "LastName", "Notification", "productName",
        "controlProduct", "CompanyNameSource", "iso_week_num", "client_status",
        "subscription_status", "removed_week",
    ]
    ranked = dlt.read("earliest_status")
    first_per_group = ranked.where(F.col("rn") == 1)
    return first_per_group.select(*output_columns)

In [0]:
@dlt.view(
    comment="Identify the changes happening per week"
)
def changeIdentify():
    """Label each row with the first attribute that differs from the email's previous row.

    Rows are compared within each Email, ordered by ``iso_week_num`` then
    ``productName``. ``change_type`` is the first match of:

    * 'FirstName', 'LastName', 'Notification' - the value differs from the
      previous row;
    * 'productChange' - both ``productName`` and its length differ;
    * NULL otherwise (including the first row of every email, where the
      previous value is NULL and the comparison is therefore NULL).

    An 'Email' change can never be reported: the window is partitioned by
    Email, so the previous row always carries the same Email. That unreachable
    branch and the unused ``prev_*`` helper columns have been removed; the
    output is unchanged.

    NOTE(review): comparisons use ``!=``, so a change to or from NULL is not
    flagged - confirm whether that is intended.
    """
    per_email_by_week = Window.partitionBy("Email").orderBy("iso_week_num", "productName")

    def previous(column):
        """Value of ``column`` on the preceding row of the same email."""
        return F.lag(column).over(per_email_by_week)

    change_type = (
        F.when(F.col("FirstName") != previous("FirstName"), "FirstName")
        .when(F.col("LastName") != previous("LastName"), "LastName")
        .when(F.col("Notification") != previous("Notification"), "Notification")
        .when(
            (F.col("productName") != previous("productName"))
            & (F.length("productName") != previous(F.length("productName"))),
            "productChange",
        )
        .otherwise(None)
    )

    return (
        dlt.read("cleanSubscriptionStatus")
        .withColumn("change_type", change_type)
        .select(
            "Email", "FirstName", "LastName", "Notification", "productName", "controlProduct",
            "CompanyNameSource", "iso_week_num", "client_status", "subscription_status",
            "removed_week", "change_type",
        )
    )

In [0]:
@dlt.table(
    name="curated_statuspage",
    comment="This table transform all users for the subscribers on status page of the applications",
    table_properties={"quality": "gold"}
)
def curated_statuspage():
    """Materialize ``changeIdentify`` as the ``curated_statuspage`` table.

    Selects the published columns explicitly and sorts by Email, then
    iso_week_num.
    """
    published_columns = [
        "Email", "FirstName", "LastName", "Notification", "productName", "controlProduct",
        "CompanyNameSource", "iso_week_num", "client_status", "subscription_status",
        "removed_week", "change_type",
    ]
    changes = dlt.read("changeIdentify")
    return changes.select(*published_columns).orderBy("Email", "iso_week_num")