## **_Creating Schema And Loading Data_**

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Explicit schema for the customer dimension CSV.
# Every column is nullable (StructType.add defaults nullable to True).
dim_customers_schema = (
    StructType()
    .add("customer_key", IntegerType())
    .add("customer_id", IntegerType())
    .add("customer_number", StringType())
    .add("first_name", StringType())
    .add("last_name", StringType())
    .add("country", StringType())
    .add("marital_status", StringType())
    .add("gender", StringType())
    .add("birthdate", DateType())
    .add("create_date", DateType())
)

# Explicit schema for the product dimension CSV; all columns are nullable.
product_schema = (
    StructType()
    .add("product_key", IntegerType(), nullable=True)
    .add("product_id", IntegerType(), nullable=True)
    .add("product_number", StringType(), nullable=True)
    .add("product_name", StringType(), nullable=True)
    .add("category_id", StringType(), nullable=True)
    .add("category", StringType(), nullable=True)
    .add("subcategory", StringType(), nullable=True)
    .add("maintenance", StringType(), nullable=True)
    .add("cost", IntegerType(), nullable=True)
    .add("product_line", StringType(), nullable=True)
    .add("start_date", DateType(), nullable=True)
)

# Explicit schema for the sales fact CSV, built from a (column name, type) table.
# Column order must match the column order of the file.
sales_Schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("order_number", StringType()),
        ("product_key", IntegerType()),
        ("customer_key", IntegerType()),
        ("order_date", DateType()),
        ("shipping_date", DateType()),
        ("due_date", DateType()),
        ("sales_amount", IntegerType()),
        ("quantity", IntegerType()),
        ("price", IntegerType()),
    )
])

# Directory (Unity Catalog volume) that holds the three gold-layer CSV exports.
# Kept in one place so the notebook can be pointed at another location easily.
GOLD_DATA_DIR = "/Volumes/workspace/default/sql_data"


def read_gold_csv(file_name, schema):
    """Read one headered CSV file from GOLD_DATA_DIR using an explicit schema.

    Args:
        file_name: File name inside GOLD_DATA_DIR, e.g. "gold.fact_sales.csv".
        schema: StructType applied to the file instead of schema inference.

    Returns:
        A DataFrame with the columns declared in ``schema``.
    """
    # NOTE(review): no "mode" option is set, so Spark's default CSV parse mode
    # applies (documented as PERMISSIVE: values that do not fit the schema are
    # loaded as null) — confirm this is the desired handling of bad records.
    return (
        spark.read
        .format("csv")
        .option("header", True)
        .schema(schema)
        .load(f"{GOLD_DATA_DIR}/{file_name}")
    )


customers_df = read_gold_csv("gold.dim_customers.csv", dim_customers_schema)
product_df = read_gold_csv("gold.dim_products.csv", product_schema)
sales_df = read_gold_csv("gold.fact_sales.csv", sales_Schema)

# Preview the first rows of each table as a quick sanity check of the load.
sales_df.show()
customers_df.show()
product_df.show()

+------------+-----------+------------+----------+-------------+----------+------------+--------+-----+
|order_number|product_key|customer_key|order_date|shipping_date|  due_date|sales_amount|quantity|price|
+------------+-----------+------------+----------+-------------+----------+------------+--------+-----+
|     SO54496|        282|        5400|2013-03-16|   2013-03-23|2013-03-28|          25|       1|   25|
|     SO54496|        289|        5400|2013-03-16|   2013-03-23|2013-03-28|           5|       1|    5|
|     SO54496|        259|        5400|2013-03-16|   2013-03-23|2013-03-28|           2|       1|    2|
|     SO54497|        174|        9281|2013-03-16|   2013-03-23|2013-03-28|          22|       1|   22|
|     SO54497|        280|        9281|2013-03-16|   2013-03-23|2013-03-28|           9|       1|    9|
|     SO54498|        174|        4825|2013-03-16|   2013-03-23|2013-03-28|          22|       1|   22|
|     SO54498|        277|        4825|2013-03-16|   2013-03-23|

## _**Magnitude Analysis**_

### **_Purpose :_**
- **_To quantify data and group results by specific dimensions._**
- **_For understanding data distribution across categories._**

### **_SQL Functions Used :_**
- **_Aggregate Functions: SUM(), COUNT(), AVG()_**
- **_GROUP BY, ORDER BY_**

In [0]:
from pyspark.sql.functions import *

# Total customers per country, largest group first.
# count("customer_id") counts only the rows whose customer_id is not null.
(
    customers_df
    .groupBy("country")
    .agg(count("customer_id").alias("total_customer"))
    .orderBy("total_customer", ascending=False)
    .show()
)

+--------------+--------------+
|       country|total_customer|
+--------------+--------------+
| United States|          7482|
|     Australia|          3591|
|United Kingdom|          1913|
|        France|          1810|
|       Germany|          1780|
|        Canada|          1571|
|           n/a|           337|
+--------------+--------------+



In [0]:
# Total customers per gender, largest group first.
# count("customer_id") counts only the rows whose customer_id is not null.
(
    customers_df
    .groupBy("gender")
    .agg(count("customer_id").alias("total_customer"))
    .orderBy("total_customer", ascending=False)
    .show()
)

+------+--------------+
|gender|total_customer|
+------+--------------+
|  Male|          9341|
|Female|          9128|
|   n/a|            15|
+------+--------------+



In [0]:
# What is the total revenue generated for each category?
#
# The left join keeps every sales row; sales whose product_key has no match in
# product_df end up in a group with a null category.
# Aggregates are referenced through the module alias F (pyspark.sql.functions)
# so the cell does not depend on a star import having replaced Python's
# builtin sum().
(
    sales_df
    .join(product_df, "product_key", "left")
    .groupBy("category")
    .agg(F.sum("sales_amount").alias("total_revenue"))
    .orderBy(F.col("total_revenue").desc())
    .show()
)

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|      Bikes|     28316272|
|Accessories|       700262|
|   Clothing|       339716|
+-----------+-------------+



In [0]:
# What is the total revenue generated by each customer?
#
# Aggregates are referenced through the module alias F (pyspark.sql.functions)
# so the cell does not depend on a star import having replaced Python's
# builtin sum().
(
    sales_df
    .join(customers_df, sales_df.customer_key == customers_df.customer_key, "left")
    # Grouping uses the customer-side key: sales rows without a matching
    # customer fall into a single group whose key and names are null.
    .groupBy(customers_df.customer_key, "first_name", "last_name")
    .agg(F.sum("sales_amount").alias("total_revenue"))
    .orderBy(F.col("total_revenue").desc())
    .show()
)

+------------+----------+---------+-------------+
|customer_key|first_name|last_name|total_revenue|
+------------+----------+---------+-------------+
|        1302|   Nichole|     Nara|        13294|
|        1133|   Kaitlyn|Henderson|        13294|
|        1309|  Margaret|       He|        13268|
|        1132|   Randall|Dominguez|        13265|
|        1301|   Adriana| Gonzalez|        13242|
|        1322|      Rosa|       Hu|        13215|
|        1125|    Brandi|     Gill|        13195|
|        1308|      Brad|      She|        13172|
|        1297| Francisco|     Sara|        13164|
|         434|   Maurice|     Shan|        12914|
|         440|     Janet|    Munoz|        12488|
|         242|      Lisa|      Cai|        11468|
|         418|     Lacey|    Zheng|        11248|
|         421|    Jordan|   Turner|        11200|
|         243|     Larry|    Munoz|        11067|
|        1656|     Larry|  Vazquez|        10899|
|        2264|      Kate|    Anand|        10871|


In [0]:
# What is the distribution of sold items across countries?
#
# The left join keeps every sales row; sales without a matching customer are
# counted under a null country.
# Aggregates are referenced through the module alias F (pyspark.sql.functions)
# so the cell does not depend on a star import having replaced Python's
# builtin sum().
(
    sales_df
    .join(customers_df, "customer_key", "left")
    .groupBy("country")
    .agg(F.sum("quantity").alias("total_item_sold"))
    .orderBy(F.col("total_item_sold").desc())
    .show()
)

+--------------+---------------+
|       country|total_item_sold|
+--------------+---------------+
| United States|          20481|
|     Australia|          13346|
|        Canada|           7630|
|United Kingdom|           6910|
|       Germany|           5626|
|        France|           5559|
|           n/a|            871|
+--------------+---------------+



# Ranking Analysis

**_Which 5 products generate the highest revenue?_**

**_Importing Libraries_**

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *

In [0]:
# Total revenue per product name.
# The left join keeps every sales row; sales without a matching product are
# summed under a null product_name.
# F (pyspark.sql.functions) is used explicitly so the aggregate does not depend
# on a star import having replaced Python's builtin sum().
total_rev = (
    sales_df.join(product_df, "product_key", "left")
    .groupBy("product_name")
    .agg(F.sum("sales_amount").alias("total_revenue"))
)

# Window over the whole aggregated result, highest revenue first.
# There is no partitionBy, so the ranking is computed over all rows together;
# the input here is already reduced to one row per product name.
window_spec = Window.orderBy(F.col("total_revenue").desc())

# dense_rank gives equal revenues the same rank without gaps, so the filter can
# return more than five rows when revenues tie.
top_5_products = (
    total_rev.withColumn("rank", F.dense_rank().over(window_spec))
    .filter(F.col("rank") <= 5)
)

top_5_products.show()



+--------------------+-------------+----+
|        product_name|total_revenue|rank|
+--------------------+-------------+----+
|Mountain-200 Blac...|      1373454|   1|
|Mountain-200 Blac...|      1363128|   2|
|Mountain-200 Silv...|      1339394|   3|
|Mountain-200 Silv...|      1301029|   4|
|Mountain-200 Blac...|      1294854|   5|
+--------------------+-------------+----+



 **_What are the 5 worst-performing products in terms of sales?_**


In [0]:
# Total revenue per product name.
# Only products that appear in sales_df are considered (sales is the left side
# of the join); sales without a matching product are summed under a null
# product_name.
# F (pyspark.sql.functions) is used explicitly so the aggregate does not depend
# on a star import having replaced Python's builtin sum().
total_rev = (
    sales_df.join(product_df, "product_key", "left")
    .groupBy("product_name")
    .agg(F.sum("sales_amount").alias("total_revenue"))
)

# Window over the whole aggregated result, lowest revenue first.
window_spec = Window.orderBy(F.col("total_revenue"))

# Stored under its own name: reusing `top_5_products` here would silently
# replace the highest-revenue result with the lowest-revenue one.
# dense_rank gives equal revenues the same rank without gaps, so the filter can
# return more than five rows when revenues tie.
worst_5_products = (
    total_rev.withColumn("rank", F.dense_rank().over(window_spec))
    .filter(F.col("rank") <= 5)
)

worst_5_products.show()



+--------------------+-------------+----+
|        product_name|total_revenue|rank|
+--------------------+-------------+----+
|     Racing Socks- L|         2430|   1|
|     Racing Socks- M|         2682|   2|
| Patch Kit/8 Patches|         6382|   3|
|Bike Wash - Disso...|         7272|   4|
|   Touring Tire Tube|         7440|   5|
+--------------------+-------------+----+



