In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging
from datetime import datetime

# Build the Spark Session using AWS Keys from Databricks Secrets
spark = SparkSession.builder \
    .appName("PsychoBunny-DataIngestion") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.hadoop.fs.s3a.access.key", dbutils.secrets.get(scope="aws-keys", key="aws-access-key")) \
    .config("spark.hadoop.fs.s3a.secret.key", dbutils.secrets.get(scope="aws-keys", key="aws-secret-key")) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .getOrCreate()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Spark session initialized with Delta Lake and AWS S3 support")


INFO:__main__:Spark session initialized with Delta Lake and AWS S3 support


In [0]:
access = dbutils.secrets.get("aws-keys", key="aws-access-key")
secret = dbutils.secrets.get("aws-keys", key="aws-secret-key")

spark.conf.set("fs.s3a.access.key", access)
spark.conf.set("fs.s3a.secret.key", secret)
spark.conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")    

In [0]:
# Data paths
RAW_DATA_PATH = "s3://psycho-bunny-data-lake/raw-data/"
PROCESSED_DATA_PATH = "s3://psycho-bunny-data-lake/processed-data/"

logger.info("Analytics Requirements notebook initialized")

# Load processed data from Delta Lake
try:
    fact_transactions = spark.read.format("delta").load(f"{PROCESSED_DATA_PATH}fact_transactions")
    dim_customer = spark.read.format("delta").load(f"{PROCESSED_DATA_PATH}dim_customer")
    dim_product = spark.read.format("delta").load(f"{PROCESSED_DATA_PATH}dim_product")
    calendar_df = spark.read.format("delta").load(f"{RAW_DATA_PATH}calendar")
    
    logger.info(f"Loaded data: {fact_transactions.count()} transactions, {dim_customer.count()} customers")
    
except Exception as e:
    logger.error(f"Error loading data: {str(e)}")
    raise


INFO:__main__:Analytics Requirements notebook initialized
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loaded data: 2823 transactions, 2000 customers


In [0]:
# 2. WEEKLY SALES ANALYSIS
print("WEEKLY SALES ANALYSIS")
print("=" * 40)

weekly_sales = fact_transactions.filter(col("transaction_type") == "SALE") \
    .withColumn("week", weekofyear(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "week") \
    .agg(
        sum("net_amount").alias("total_sales"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Weekly Sales Analytics: {weekly_sales.count()} records")
display(weekly_sales.orderBy(desc("total_sales")).limit(30))

print("\nMONTHLY SALES ANALYSIS")
print("=" * 40)

monthly_sales = fact_transactions.filter(col("transaction_type") == "SALE") \
    .withColumn("month", month(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "month") \
    .agg(
        sum("net_amount").alias("total_sales"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Monthly Sales Analytics: {monthly_sales.count()} records")
display(monthly_sales.orderBy(desc("total_sales")))

print("\nQUARTERLY SALES ANALYSIS")
print("=" * 40)

quarterly_sales = fact_transactions.filter(col("transaction_type") == "SALE") \
    .withColumn("quarter", quarter(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "quarter") \
    .agg(
        sum("net_amount").alias("total_sales"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Quarterly Sales Analytics: {quarterly_sales.count()} records")
display(quarterly_sales.orderBy(desc("total_sales")))


WEEKLY SALES ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Weekly Sales Analytics: 110 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


year,week,total_sales,total_orders,avg_order_value,created_date
2004,45,58004.0,62,935.548387096774,2025-06-23T17:06:07.442042Z
2003,46,52174.0,48,1086.9583333333333,2025-06-23T17:06:07.442042Z
2004,42,48348.0,56,863.3571428571429,2025-06-23T17:06:07.442042Z
2004,30,35311.0,33,1070.030303030303,2025-06-23T17:06:07.442042Z
2003,45,33715.0,34,991.6176470588236,2025-06-23T17:06:07.442042Z
2003,43,33209.0,29,1145.1379310344828,2025-06-23T17:06:07.442042Z
2004,47,32605.0,40,815.125,2025-06-23T17:06:07.442042Z
2003,48,32421.0,28,1157.892857142857,2025-06-23T17:06:07.442042Z
2004,34,31049.0,29,1070.655172413793,2025-06-23T17:06:07.442042Z
2004,49,30375.0,31,979.8387096774194,2025-06-23T17:06:07.442042Z



MONTHLY SALES ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Monthly Sales Analytics: 29 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


year,month,total_sales,total_orders,avg_order_value,created_date
2003,11,145791.0,141,1033.978723404255,2025-06-23T17:06:08.331104Z
2004,11,142661.0,157,908.6687898089172,2025-06-23T17:06:08.331104Z
2003,10,85831.0,75,1144.4133333333334,2025-06-23T17:06:08.331104Z
2004,10,75741.0,82,923.670731707317,2025-06-23T17:06:08.331104Z
2004,8,69025.0,65,1061.923076923077,2025-06-23T17:06:08.331104Z
2005,5,64910.0,62,1046.9354838709678,2025-06-23T17:06:08.331104Z
2004,12,59915.0,58,1033.0172413793102,2025-06-23T17:06:08.331104Z
2004,6,53248.0,47,1132.936170212766,2025-06-23T17:06:08.331104Z
2005,1,53057.0,48,1105.3541666666667,2025-06-23T17:06:08.331104Z
2005,3,49675.0,56,887.0535714285714,2025-06-23T17:06:08.331104Z



QUARTERLY SALES ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Quarterly Sales Analytics: 10 records
INFO:py4j.clientserver:Received command c on object id p0


year,quarter,total_sales,total_orders,avg_order_value,created_date
2004,4,278317.0,297,937.094276094276,2025-06-23T17:06:09.14562Z
2003,4,258709.0,244,1060.282786885246,2025-06-23T17:06:09.14562Z
2004,3,160235.0,154,1040.487012987013,2025-06-23T17:06:09.14562Z
2005,1,143703.0,146,984.2671232876712,2025-06-23T17:06:09.14562Z
2004,1,110779.0,113,980.3451327433628,2025-06-23T17:06:09.14562Z
2004,2,106350.0,108,984.7222222222222,2025-06-23T17:06:09.14562Z
2003,3,97203.0,100,972.03,2025-06-23T17:06:09.14562Z
2005,2,86073.0,83,1037.0240963855422,2025-06-23T17:06:09.14562Z
2003,2,82079.0,75,1094.3866666666668,2025-06-23T17:06:09.14562Z
2003,1,72019.0,64,1125.296875,2025-06-23T17:06:09.14562Z


In [0]:
# 3. WEEKLY REFUNDS ANALYSIS
print("WEEKLY REFUNDS ANALYSIS")
print("=" * 40)

weekly_refunds = fact_transactions.filter(col("transaction_type") == "REFUND") \
    .withColumn("week", weekofyear(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "week") \
    .agg(
        sum("net_amount").alias("total_refunds"),
        sum("restocking_fee").alias("total_restocking_fees"),
        count("order_number").alias("total_refund_orders")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Weekly Refunds Analytics: {weekly_refunds.count()} records")
display(weekly_refunds.limit(30))

print("\nMONTHLY REFUNDS ANALYSIS")
print("=" * 40)

# Monthly Refunds
monthly_refunds = fact_transactions.filter(col("transaction_type") == "REFUND") \
    .withColumn("month", month(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "month") \
    .agg(
        sum("net_amount").alias("total_refunds"),
        sum("restocking_fee").alias("total_restocking_fees"),
        count("order_number").alias("total_refund_orders")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Monthly Refunds Analytics: {monthly_refunds.count()} records")
display(monthly_refunds)

print("\nQUARTERLY REFUNDS ANALYSIS")
print("=" * 40)

# Quaterly Refunds
quarterly_refunds = fact_transactions.filter(col("transaction_type") == "REFUND") \
    .withColumn("quarter", quarter(col("order_date"))) \
    .withColumn("year", year(col("order_date"))) \
    .groupBy("year", "quarter") \
    .agg(
        sum("net_amount").alias("total_refunds"),
        sum("restocking_fee").alias("total_restocking_fees"),
        count("order_number").alias("total_refund_orders")
    ).withColumn("created_date", current_timestamp())

logger.info(f"Quarterly Refunds Analytics: {quarterly_refunds.count()} records")
display(quarterly_refunds)


INFO:py4j.clientserver:Received command c on object id p0


WEEKLY REFUNDS ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Weekly Refunds Analytics: 113 records
INFO:py4j.clientserver:Received command c on object id p0


year,week,total_refunds,total_restocking_fees,total_refund_orders,created_date
2005,5,15950.7,1772.3,18,2025-06-23T17:04:45.255299Z
2005,22,13020.3,1446.7,14,2025-06-23T17:04:45.255299Z
2005,19,10662.3,1184.7,15,2025-06-23T17:04:45.255299Z
2004,44,2725.2,302.8,3,2025-06-23T17:04:45.255299Z
2003,50,6497.099999999999,721.9,9,2025-06-23T17:04:45.255299Z
2004,6,4092.3,454.7000000000001,5,2025-06-23T17:04:45.255299Z
2003,32,12601.799999999996,1400.2,17,2025-06-23T17:04:45.255299Z
2004,38,10630.799999999996,1181.2000000000005,14,2025-06-23T17:04:45.255299Z
2003,17,1200.6,133.4,1,2025-06-23T17:04:45.255299Z
2005,10,3687.3,409.7000000000001,4,2025-06-23T17:04:45.255299Z


INFO:py4j.clientserver:Received command c on object id p0



MONTHLY REFUNDS ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Monthly Refunds Analytics: 29 records
INFO:py4j.clientserver:Received command c on object id p0


year,month,total_refunds,total_restocking_fees,total_refund_orders,created_date
2005,5,49569.3,5507.700000000001,58,2025-06-23T17:04:46.165554Z
2004,6,38100.59999999999,4233.400000000001,38,2025-06-23T17:04:46.165554Z
2003,2,14669.100000000002,1629.9,17,2025-06-23T17:04:46.165554Z
2004,8,63828.900000000016,7092.100000000002,68,2025-06-23T17:04:46.165554Z
2003,10,77291.99999999999,8588.000000000002,83,2025-06-23T17:04:46.165554Z
2004,10,63161.100000000006,7017.899999999999,77,2025-06-23T17:04:46.165554Z
2003,3,24702.300000000003,2744.7000000000003,28,2025-06-23T17:04:46.165554Z
2003,9,27747.0,3083.000000000001,35,2025-06-23T17:04:46.165554Z
2003,12,34258.5,3806.5,42,2025-06-23T17:04:46.165554Z
2003,1,21039.3,2337.7,21,2025-06-23T17:04:46.165554Z


INFO:py4j.clientserver:Received command c on object id p0



QUARTERLY REFUNDS ANALYSIS


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Quarterly Refunds Analytics: 10 records
INFO:py4j.clientserver:Received command c on object id p0


year,quarter,total_refunds,total_restocking_fees,total_refund_orders,created_date
2003,2,84377.69999999998,9375.3,87,2025-06-23T17:04:47.070367Z
2003,3,65600.1,7288.9,84,2025-06-23T17:04:47.070367Z
2003,1,60410.69999999998,6712.300000000002,66,2025-06-23T17:04:47.070367Z
2004,4,235431.9000000001,26159.1,273,2025-06-23T17:04:47.070367Z
2005,1,147341.6999999999,16371.3,156,2025-06-23T17:04:47.070367Z
2004,2,105187.5,11687.5,115,2025-06-23T17:04:47.070367Z
2003,4,251306.09999999977,27922.900000000005,280,2025-06-23T17:04:47.070367Z
2004,1,110581.2,12286.799999999996,120,2025-06-23T17:04:47.070367Z
2004,3,150656.39999999985,16739.60000000001,165,2025-06-23T17:04:47.070367Z
2005,2,78832.8,8759.200000000003,93,2025-06-23T17:04:47.070367Z


In [0]:
# 3. Product Family Analysis - Extract from product codes
def extract_product_family(product_code):
    """Extract product family from product code"""
    if product_code and "_" in product_code:
        return product_code.split("_")[0]
    return "UNKNOWN"

extract_family_udf = udf(extract_product_family, StringType())

# Product family sales analysis
product_family_sales = fact_transactions.filter(col("transaction_type") == "SALE") \
    .withColumn("product_family", extract_family_udf(col("product_code"))) \
    .groupBy("product_family") \
    .agg(
        sum("net_amount").alias("total_sales"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value")
    ).orderBy(desc("total_sales")) \
    .withColumn("created_date", current_timestamp())

logger.info(f"Product Family Sales: {product_family_sales.count()} families")
display(product_family_sales)


INFO:py4j.clientserver:Received command c on object id p1
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Product Family Sales: 8 families
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


product_family,total_sales,total_orders,avg_order_value,created_date
S18,486977.0,481,1012.4261954261956,2025-06-23T06:08:45.444208Z
S24,364491.0,361,1009.6703601108034,2025-06-23T06:08:45.444208Z
S700,150848.0,152,992.421052631579,2025-06-23T06:08:45.444208Z
S12,139705.0,138,1012.355072463768,2025-06-23T06:08:45.444208Z
S32,102463.0,107,957.5981308411216,2025-06-23T06:08:45.444208Z
S10,72432.0,70,1034.7428571428572,2025-06-23T06:08:45.444208Z
S50,46470.0,48,968.125,2025-06-23T06:08:45.444208Z
S72,32081.0,27,1188.1851851851852,2025-06-23T06:08:45.444208Z


In [0]:
from pyspark.sql.window import Window

# 4. Best-selling and Second Best-selling Items by Region
regional_product_sales = fact_transactions.filter(col("transaction_type") == "SALE") \
    .withColumn("product_family", extract_family_udf(col("product_code"))) \
    .groupBy("territory", "product_code", "product_family") \
    .agg(
        sum("net_amount").alias("total_sales"),
        count("order_number").alias("total_orders")
    )

# Window function to rank products by sales within each territory
window_spec = Window.partitionBy("territory").orderBy(desc("total_sales"))

regional_rankings = regional_product_sales.withColumn(
    "rank", row_number().over(window_spec)
).filter(col("rank") <= 2) \
.withColumn("ranking_type", 
    when(col("rank") == 1, "Best Selling")
    .when(col("rank") == 2, "Second Best Selling")
    .otherwise("Other")
).withColumn("created_date", current_timestamp())

logger.info(f"Regional Rankings: {regional_rankings.count()} records")
display(regional_rankings)


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Regional Rankings: 8 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


territory,product_code,product_family,total_sales,total_orders,rank,ranking_type,created_date
APAC,S24_1937,S24,4363.0,5,1,Best Selling,2025-06-23T06:09:06.995353Z
APAC,S18_4409,S18,4164.0,3,2,Second Best Selling,2025-06-23T06:09:06.995353Z
EMEA,S18_4522,S18,15670.0,12,1,Best Selling,2025-06-23T06:09:06.995353Z
EMEA,S18_3232,S18,14313.0,13,2,Second Best Selling,2025-06-23T06:09:06.995353Z
Japan,S18_4027,S18,4729.0,3,1,Best Selling,2025-06-23T06:09:06.995353Z
Japan,S12_3891,S12,4002.0,3,2,Second Best Selling,2025-06-23T06:09:06.995353Z
,S12_4473,S12,11481.0,10,1,Best Selling,2025-06-23T06:09:06.995353Z
,S12_2823,S12,10578.0,11,2,Second Best Selling,2025-06-23T06:09:06.995353Z


In [0]:
# 5. Revenue Difference Between Best and Second Best Items per Region
revenue_difference = regional_rankings.groupBy("territory") \
    .agg(
        max(when(col("rank") == 1, col("total_sales"))).alias("best_selling_revenue"),
        max(when(col("rank") == 2, col("total_sales"))).alias("second_best_revenue")
    ).withColumn(
        "revenue_difference", 
        col("best_selling_revenue") - col("second_best_revenue")
    ).withColumn(
        "revenue_difference_pct",
        round((col("revenue_difference") / col("best_selling_revenue")) * 100, 2)
    ).withColumn("created_date", current_timestamp())

logger.info(f"Revenue Difference Analysis: {revenue_difference.count()} territories")
display(revenue_difference)


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Revenue Difference Analysis: 4 territories
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


territory,best_selling_revenue,second_best_revenue,revenue_difference,revenue_difference_pct,created_date
APAC,4363.0,4164.0,199.0,4.56,2025-06-23T06:09:24.73378Z
EMEA,15670.0,14313.0,1357.0,8.66,2025-06-23T06:09:24.73378Z
Japan,4729.0,4002.0,727.0,15.37,2025-06-23T06:09:24.73378Z
,11481.0,10578.0,903.0,7.87,2025-06-23T06:09:24.73378Z


In [0]:
# 6. Enhanced Customer Segmentation (High, Medium, Low Value)
customer_metrics = fact_transactions.filter(col("transaction_type") == "SALE") \
    .groupBy("customer_name") \
    .agg(
        sum("net_amount").alias("total_spent"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value"),
        max("order_date").alias("last_order_date"),
        countDistinct("product_code").alias("unique_products_purchased")
    )

# Calculate percentiles for segmentation
percentiles = customer_metrics.select(
    expr("percentile_approx(total_spent, 0.33)").alias("p33"),
    expr("percentile_approx(total_spent, 0.67)").alias("p67")
).collect()[0]

p33_threshold = percentiles["p33"]
p67_threshold = percentiles["p67"]

enhanced_customer_segments = customer_metrics.withColumn(
    "customer_segment",
    when(col("total_spent") >= p67_threshold, "High Value")
    .when(col("total_spent") >= p33_threshold, "Medium Value")
    .otherwise("Low Value")
).withColumn("created_date", current_timestamp())

logger.info(f"Enhanced Customer Segments: {enhanced_customer_segments.count()} customers")
logger.info(f"Thresholds: Low: <${p33_threshold:.2f}, Medium: ${p33_threshold:.2f}-${p67_threshold:.2f}, High: >${p67_threshold:.2f}")
display(enhanced_customer_segments.groupBy("customer_segment").count())


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Enhanced Customer Segments: 91 customers
INFO:__main__:Thresholds: Low: <$9147.00, Medium: $9147.00-$16590.00, High: >$16590.00
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


customer_segment,count
Medium Value,30
High Value,31
Low Value,30


In [0]:
# 7. Top 10 Customers by Spending with Contact Details (Using Fact Transactions)
top_customers_by_spending = fact_transactions.filter(col("transaction_type") == "SALE") \
    .groupBy("customer_name", "phone_number","address") \
    .agg(
        sum("net_amount").alias("total_spent"),
        count("order_number").alias("total_orders"),
        avg("net_amount").alias("avg_order_value")
    ).join(
        dim_customer.select("full_name", "email", "phone", "city", "state"),
        trim(upper(col("customer_name"))) == trim(upper(col("full_name"))),
        "left"
    ).select(
        "customer_name", "phone_number","address",
        "total_spent", "total_orders", "avg_order_value"
    ).orderBy(desc("total_spent")).limit(10)

logger.info(f"Top 10 Customers by Spending: {top_customers_by_spending.count()} customers")
display(top_customers_by_spending)

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Top 10 Customers by Spending: 10 customers
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


customer_name,phone_number,address,total_spent,total_orders,avg_order_value
Diego Freyre,(91) 555 94 44,"C/ Moralzarzal, 86",131216.0,129,1017.1782945736434
Valarie Nelson,4155551450,5677 Strong St.,96701.0,95,1017.9052631578948
Adrian Huxley,#ERROR!,"Monitor Money Building, 815 Pacific Hwy",31249.0,27,1157.3703703703704
Kwai Yu,2125557818,897 Long Airport Avenue,29965.0,24,1248.5416666666667
Eric Natividad,#ERROR!,"Bronz Sok., Bronz Apt. 3/6 Tesvikiye",28614.0,23,1244.0869565217392
Victoria Ashworth,(171) 555-1555,Fauntleroy Circus,26695.0,27,988.7037037037036
Giovanni Rovelli,035-640555,Via Ludovico il Moro 22,25325.0,25,1013.0
Janine Labrune,40.67.8555,"67, rue des Cinquante Otages",24459.0,26,940.7307692307692
Peter Ferguson,03 9520 4555,636 St Kilda Road,23679.0,25,947.16
Juri Hirano,6505556809,9408 Furth Circle,23261.0,20,1163.05


INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# 8. Refund UDF with 10% Restocking Fee
def calculate_refund_with_fee(original_amount, is_refund):
    """Calculate final refund amount with 10% restocking fee"""
    if is_refund:
        restocking_fee = float(original_amount) * 0.10
        final_refund = float(original_amount) - restocking_fee
        return final_refund, restocking_fee
    return float(original_amount), 0.0

calculate_refund_udf = udf(calculate_refund_with_fee, StructType([
    StructField("final_amount", DoubleType(), True),
    StructField("restocking_fee", DoubleType(), True)
]))

# Enhanced refund analysis
refund_analysis = fact_transactions.withColumn(
    "is_refund", col("transaction_type") == "REFUND"
).withColumn(
    "refund_calculation", calculate_refund_udf(col("total_amount"), col("is_refund"))
).select(
    "*",
    col("refund_calculation.final_amount").alias("calculated_final_amount"),
    col("refund_calculation.restocking_fee").alias("calculated_restocking_fee")
)

logger.info("Refund UDF with 10% restocking fee implemented")
display(refund_analysis.filter(col("transaction_type") == "REFUND").limit(50))


INFO:__main__:Refund UDF with 10% restocking fee implemented
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


order_number,customer_name,product_code,quantity,unit_price,total_amount,deal_size,territory,order_date,status,product_line,created_date,transaction_type,restocking_fee,net_amount,is_large_order,address,phone_number,is_refund,refund_calculation,calculated_final_amount,calculated_restocking_fee
10107,Kwai Yu,S10_1678,30,66.43333333333334,-1993.0,Small,,2003-02-24,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,199.3,1793.7,False,897 Long Airport Avenue,2125557818,True,"List(-1793.7, -199.3)",-1793.7,-199.3
10121,Paul Henriot,S10_1678,34,2.7058823529411766,-92.0,Small,EMEA,2003-05-07,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,9.2,82.8,False,59 rue de l'Abbaye,26.47.1555,True,"List(-82.8, -9.200000000000001)",-82.8,-9.2
10134,Daniel Da Cunha,S10_1678,41,14.0,-574.0,Medium,EMEA,2003-07-01,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,57.400000000000006,516.6,False,27 rue du Colonel Pierre Avia,#ERROR!,True,"List(-516.6, -57.400000000000006)",-516.6,-57.400000000000006
10145,Julie Young,S10_1678,45,13.844444444444443,-623.0,Medium,,2003-08-25,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,62.3,560.7,False,78934 Hillside Dr.,6265557265,True,"List(-560.7, -62.300000000000004)",-560.7,-62.3
10159,Julie Brown,S10_1678,49,9.020408163265309,-442.0,Medium,,2003-10-10,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,44.2,397.8,False,7734 Strong St.,6505551386,True,"List(-397.8, -44.2)",-397.8,-44.2
10180,Martine Rance,S10_1678,29,57.55172413793103,-1669.0,Small,EMEA,2003-11-11,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,166.9,1502.1,False,"184, chausse de Tournai",20.16.1555,True,"List(-1502.1, -166.9)",-1502.1,-166.9
10211,Dominique Perrier,S10_1678,41,18.07317073170732,-741.0,Medium,EMEA,2004-01-15,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,74.10000000000001,666.9,False,"25, rue Lauriston",(1) 47.55.6555,True,"List(-666.9, -74.10000000000001)",-666.9,-74.10000000000001
10275,Janine Labrune,S10_1678,45,4.955555555555556,-223.0,Medium,EMEA,2004-07-23,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,22.3,200.7,False,"67, rue des Cinquante Otages",40.67.8555,True,"List(-200.7, -22.3)",-200.7,-22.3
10285,Marta Hernandez,S10_1678,36,18.13888888888889,-653.0,Medium,,2004-08-27,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,65.3,587.7,False,39323 Spinnaker Dr.,6175558555,True,"List(-587.7, -65.3)",-587.7,-65.3
10299,Matti Karttunen,S10_1678,23,57.30434782608695,-1318.0,Small,EMEA,2004-09-30,COMPLETED,General,2025-06-23T16:34:20.218611Z,REFUND,131.8,1186.2,False,Keskuskatu 45,90-224 8555,True,"List(-1186.2, -131.8)",-1186.2,-131.8


INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# 9. Fiscal Date Aggregations using Calendar Dimension
fiscal_aggregations = fact_transactions.filter(col("transaction_type") == "SALE") \
    .join(calendar_df, date_format(fact_transactions.order_date, "yyyy-MM-dd") == calendar_df.CALENDAR_DATE, "left") \
    .groupBy("FISCAL_YEAR", "FISCAL_QUARTER", "FISCAL_MONTH_NAME") \
    .agg(
        sum("net_amount").alias("fiscal_sales"),
        count("order_number").alias("fiscal_orders"),
        avg("net_amount").alias("fiscal_avg_order")
    ).orderBy("FISCAL_YEAR", "FISCAL_QUARTER") \
    .withColumn("created_date", current_timestamp())

logger.info(f"Fiscal Aggregations: {fiscal_aggregations.count()} records")
display(fiscal_aggregations)


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Fiscal Aggregations: 1 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


FISCAL_YEAR,FISCAL_QUARTER,FISCAL_MONTH_NAME,fiscal_sales,fiscal_orders,fiscal_avg_order,created_date
,,,1395467.0,1384,1008.2854046242776,2025-06-23T15:12:28.917427Z


INFO:py4j.clientserver:Received command c on object id p0
