In [1]:
# Install PySpark in Google Colab
!pip install pyspark

# Import required libraries
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder \
    .appName("RetailPro Analytics") \
    .master("local[*]") \
    .getOrCreate()

# Verify Spark is working
print(f"Spark Version: {spark.version}")
print("Spark session created successfully!")

Spark Version: 3.5.1
Spark session created successfully!


In [2]:

# Download the CSV file from GitHub
!wget https://raw.githubusercontent.com/futurexskill/bigdata/master/ecommerce_orders.csv

--2025-12-09 00:19:38--  https://raw.githubusercontent.com/futurexskill/bigdata/master/ecommerce_orders.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8804 (8.6K) [text/plain]
Saving to: ‘ecommerce_orders.csv’


2025-12-09 00:19:38 (58.9 MB/s) - ‘ecommerce_orders.csv’ saved [8804/8804]



In [3]:
# Load the downloaded orders file into a DataFrame.
# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer each column's type from the data instead of reading strings.
orders_df = spark.read.csv("ecommerce_orders.csv", header=True, inferSchema=True)

# The status marker was mis-encoded in the original string; restored to the
# intended check-mark character.
print("✅ Data loaded!")

# Preview the first 5 rows.
orders_df.show(5)

✅ Data loaded!
+--------+----------+-----------+------------+-----------+--------+----------+----------------+------------+-------+---------+--------------+-------------+
|order_id|order_date|customer_id|product_name|   category|quantity|unit_price|discount_percent|total_amount|country|   status|payment_method|shipping_days|
+--------+----------+-----------+------------+-----------+--------+----------+----------------+------------+-------+---------+--------------+-------------+
|    1001|2025-01-15|       5023|      LAPTOP|Electronics|       2|    899.99|              10|     1619.98|    USA|Completed|   Credit Card|            5|
|    1002|2025-01-18|       5012|  HEADPHONES|Electronics|       1|     79.99|               0|       79.99|     UK|Completed|        PayPal|            3|
|    1003|2025-02-03|       5045|    KEYBOARD|Electronics|       3|     49.99|              15|      127.47| Canada|  Shipped|    Debit Card|            7|
|    1004|2025-02-10|       5008|     MONITOR|E

In [5]:
# Create Temp View
# Registers orders_df under the name "orders" so spark.sql() queries can
# reference it. createOrReplaceTempView returns None, so its result is not
# assigned to a variable; "OrReplace" keeps the cell safe to re-run.
orders_df.createOrReplaceTempView("orders")


In [6]:
# Code Cell 4: Simple SELECT
# Projects three columns from the "orders" view and previews 10 rows.
# LIMIT without ORDER BY lets the engine return any 10 rows, so the rows are
# ordered by order_id to make the preview reproducible across runs.
query1 = """
SELECT product_name, category, total_amount
FROM orders
ORDER BY order_id
LIMIT 10
"""

result1 = spark.sql(query1)
result1.show()

+------------+-----------+------------+
|product_name|   category|total_amount|
+------------+-----------+------------+
|      LAPTOP|Electronics|     1619.98|
|  HEADPHONES|Electronics|       79.99|
|    KEYBOARD|Electronics|      127.47|
|     MONITOR|Electronics|      239.99|
|       MOUSE|Electronics|      142.45|
|       CHAIR|  Furniture|      179.99|
|        DESK|  Furniture|      699.98|
|        LAMP|  Furniture|      113.97|
|    NOTEBOOK| Stationery|        49.9|
|     PEN_SET| Stationery|       46.76|
+------------+-----------+------------+



In [7]:
# Code Cell 5: WHERE Clause
# Keep only orders whose total_amount exceeds 500 and list them from the
# largest total downwards.
query2 = """
SELECT product_name, category, total_amount, country
FROM orders
WHERE total_amount > 500
ORDER BY total_amount DESC
"""

# Run the filter query against the "orders" view.
result2 = spark.sql(query2)

print("Orders over $500:")
result2.show()

Orders over $500:
+------------+-----------+------------+-------+
|product_name|   category|total_amount|country|
+------------+-----------+------------+-------+
|      LAPTOP|Electronics|     1709.98| France|
|      LAPTOP|Electronics|     1709.98|    USA|
|      LAPTOP|Electronics|     1619.98|    USA|
|      LAPTOP|Electronics|      899.99|    USA|
|      LAPTOP|Electronics|      899.99| France|
|      LAPTOP|Electronics|      899.99|    USA|
|      LAPTOP|Electronics|      809.99|    USA|
|      LAPTOP|Electronics|      809.99| France|
|      LAPTOP|Electronics|      719.99| France|
|        DESK|  Furniture|      699.98|  Japan|
|        DESK|  Furniture|      699.98| Canada|
|        DESK|  Furniture|      664.98| Canada|
|        DESK|  Furniture|      629.98| Canada|
|     MONITOR|Electronics|      509.98|  India|
+------------+-----------+------------+-------+



In [8]:
# Code Cell 6: GROUP BY
# One output row per category: order count, summed / average / maximum
# total_amount, sorted so the highest-revenue category comes first.
query3 = """
SELECT
    category,
    COUNT(*) as order_count,
    SUM(total_amount) as total_revenue,
    AVG(total_amount) as avg_order_value,
    MAX(total_amount) as max_order
FROM orders
GROUP BY category
ORDER BY total_revenue DESC
"""

# Run the aggregation against the "orders" view.
result3 = spark.sql(query3)

print("Revenue by Category:")
result3.show()

Revenue by Category:
+-----------+-----------+-----------------+------------------+---------+
|   category|order_count|    total_revenue|   avg_order_value|max_order|
+-----------+-----------+-----------------+------------------+---------+
|Electronics|         44|15439.69999999999| 350.9022727272725|  1709.98|
|  Furniture|         24|6648.569999999999|277.02374999999995|   699.98|
|Accessories|         16|          1028.64|             64.29|    95.98|
| Stationery|         16|545.4499999999999|34.090624999999996|    56.89|
+-----------+-----------+-----------------+------------------+---------+



In [9]:
# Code Cell 7: Create Second DataFrame and View
from pyspark.sql import Row

# Small in-memory lookup table: one Row per category, carrying a short code
# and a margin percentage.
category_data = [
    Row(category="Electronics", category_code="ELE", margin_percent=15),
    Row(category="Furniture", category_code="FUR", margin_percent=25),
    Row(category="Stationery", category_code="STA", margin_percent=30),
    Row(category="Accessories", category_code="ACC", margin_percent=20),
]

# Turn the rows into a DataFrame and register it as the "categories" view so
# SQL queries can reference it.
category_df = spark.createDataFrame(category_data)
category_df.createOrReplaceTempView("categories")

# The status marker was mis-encoded in the original string; restored to the
# intended check-mark character.
print("✅ Category lookup view created!")
category_df.show()

✅ Category lookup view created!
+-----------+-------------+--------------+
|   category|category_code|margin_percent|
+-----------+-------------+--------------+
|Electronics|          ELE|            15|
|  Furniture|          FUR|            25|
| Stationery|          STA|            30|
|Accessories|          ACC|            20|
+-----------+-------------+--------------+



Products above average price:
+------------+-----------+------------+
|product_name|   category|total_amount|
+------------+-----------+------------+
|      LAPTOP|Electronics|     1709.98|
|      LAPTOP|Electronics|     1709.98|
|      LAPTOP|Electronics|     1619.98|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      809.99|
|      LAPTOP|Electronics|      809.99|
|      LAPTOP|Electronics|      719.99|
|        DESK|  Furniture|      699.98|
|        DESK|  Furniture|      699.98|
|        DESK|  Furniture|      664.98|
|        DESK|  Furniture|      629.98|
|     MONITOR|Electronics|      509.98|
|       CHAIR|  Furniture|      359.98|
|        DESK|  Furniture|      349.99|
|        DESK|  Furniture|      332.49|
|        DESK|  Furniture|      332.49|
|       CHAIR|  Furniture|      319.98|
|        DESK|  Furniture|      314.99|
+------------+-----------+------------+
only showi

In [11]:
# Code Cell 8: JOIN Query
# Joins each order to its row in the "categories" lookup (matching on the
# category name), then aggregates per category. estimated_profit applies the
# category's margin_percent to the summed revenue, rounded to 2 decimals.
# This is an inner join: orders whose category has no lookup row are excluded.
query4 = """
SELECT
    o.category,
    c.category_code,
    COUNT(*) as order_count,
    SUM(o.total_amount) as revenue,
    c.margin_percent,
    ROUND(SUM(o.total_amount) * c.margin_percent / 100, 2) as estimated_profit
FROM orders o
JOIN categories c ON o.category = c.category
GROUP BY o.category, c.category_code, c.margin_percent
ORDER BY estimated_profit DESC
"""

# Run the join against the "orders" and "categories" views.
result4 = spark.sql(query4)

print("Profit Analysis by Category:")
result4.show()

Profit Analysis by Category:
+-----------+-------------+-----------+------------------+--------------+----------------+
|   category|category_code|order_count|           revenue|margin_percent|estimated_profit|
+-----------+-------------+-----------+------------------+--------------+----------------+
|Electronics|          ELE|         44|15439.699999999992|            15|         2315.95|
|  Furniture|          FUR|         24|           6648.57|            25|         1662.14|
|Accessories|          ACC|         16|           1028.64|            20|          205.73|
| Stationery|          STA|         16| 545.4499999999999|            30|          163.64|
+-----------+-------------+-----------+------------------+--------------+----------------+



In [12]:
# Code Cell 9: Subquery
# The scalar subquery computes the average total_amount over all orders; the
# outer query keeps the orders above that average, largest first.
# NOTE: the comparison is on total_amount (the order total), not on
# unit_price, even though the printed heading says "price".
query5 = """
SELECT product_name, category, total_amount
FROM orders
WHERE total_amount > (
    SELECT AVG(total_amount) FROM orders
)
ORDER BY total_amount DESC
"""

# Run the subquery filter against the "orders" view.
result5 = spark.sql(query5)

print("Products above average price:")
result5.show()

Products above average price:
+------------+-----------+------------+
|product_name|   category|total_amount|
+------------+-----------+------------+
|      LAPTOP|Electronics|     1709.98|
|      LAPTOP|Electronics|     1709.98|
|      LAPTOP|Electronics|     1619.98|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      899.99|
|      LAPTOP|Electronics|      809.99|
|      LAPTOP|Electronics|      809.99|
|      LAPTOP|Electronics|      719.99|
|        DESK|  Furniture|      699.98|
|        DESK|  Furniture|      699.98|
|        DESK|  Furniture|      664.98|
|        DESK|  Furniture|      629.98|
|     MONITOR|Electronics|      509.98|
|       CHAIR|  Furniture|      359.98|
|        DESK|  Furniture|      349.99|
|        DESK|  Furniture|      332.49|
|        DESK|  Furniture|      332.49|
|       CHAIR|  Furniture|      319.98|
|        DESK|  Furniture|      314.99|
+------------+-----------+------------+
only showi

In [13]:
# Code Cell 10: CTE (WITH clause)
# Two chained CTEs:
#   country_revenue - order count and summed total_amount per country
#   avg_revenue     - the average of those per-country totals (a single row)
# The CROSS JOIN attaches that single average to every country row so the
# WHERE clause can keep only the countries above it.
query6 = """
WITH country_revenue AS (
    SELECT
        country,
        COUNT(*) as order_count,
        SUM(total_amount) as total_revenue
    FROM orders
    GROUP BY country
),
avg_revenue AS (
    SELECT AVG(total_revenue) as avg_rev
    FROM country_revenue
)
SELECT
    cr.country,
    cr.order_count,
    cr.total_revenue,
    ar.avg_rev as average_revenue,
    ROUND(cr.total_revenue - ar.avg_rev, 2) as diff_from_avg
FROM country_revenue cr
CROSS JOIN avg_revenue ar
WHERE cr.total_revenue > ar.avg_rev
ORDER BY cr.total_revenue DESC
"""

# Run the CTE query against the "orders" view.
result6 = spark.sql(query6)

print("Countries with above-average revenue:")
result6.show()

Countries with above-average revenue:
+-------+-----------+------------------+------------------+-------------+
|country|order_count|     total_revenue|   average_revenue|diff_from_avg|
+-------+-----------+------------------+------------------+-------------+
|    USA|         13| 6463.469999999999|2957.7949999999996|      3505.67|
| France|         12|4647.0199999999995|2957.7949999999996|      1689.23|
| Canada|         13|3101.2599999999998|2957.7949999999996|       143.47|
+-------+-----------+------------------+------------------+-------------+



In [14]:
# Code Cell 11: DataFrame API vs SQL Comparison
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import sum, count`: that form rebinds the name
# `sum` and hides Python's built-in sum() from every cell that runs afterwards.
from pyspark.sql import functions as F

# Approach 1: DataFrame API
# Group by category, count the orders and total their amounts, then sort with
# the highest-revenue category first.
df_result = (
    orders_df.groupBy("category")
    .agg(
        F.count("*").alias("order_count"),
        F.sum("total_amount").alias("total_revenue"),
    )
    .orderBy("total_revenue", ascending=False)
)

print("Using DataFrame API:")
df_result.show()


Using DataFrame API:
+-----------+-----------+-----------------+
|   category|order_count|    total_revenue|
+-----------+-----------+-----------------+
|Electronics|         44|15439.69999999999|
|  Furniture|         24|6648.569999999999|
|Accessories|         16|          1028.64|
| Stationery|         16|545.4499999999999|
+-----------+-----------+-----------------+



In [15]:

# Approach 2: SQL
# Same aggregation as the DataFrame API version, written as a SQL query
# against the "orders" view.
sql_result = spark.sql("""
    SELECT
        category,
        COUNT(*) as order_count,
        SUM(total_amount) as total_revenue
    FROM orders
    GROUP BY category
    ORDER BY total_revenue DESC
""")

print("\nUsing SQL:")
sql_result.show()

# Check the claim instead of printing it unconditionally: both approaches must
# return the same rows in the same order. df_result comes from the DataFrame
# API cell, which has to run before this one.
assert sql_result.collect() == df_result.collect(), (
    "SQL and DataFrame API results differ"
)
# The status marker was mis-encoded in the original string; restored to the
# intended check-mark character.
print("\n✅ Both produce identical results!")


Using SQL:
+-----------+-----------+-----------------+
|   category|order_count|    total_revenue|
+-----------+-----------+-----------------+
|Electronics|         44|15439.69999999999|
|  Furniture|         24|6648.569999999999|
|Accessories|         16|          1028.64|
| Stationery|         16|545.4499999999999|
+-----------+-----------+-----------------+


✅ Both produce identical results!


In [16]:
# Code Cell 12: Global Temporary View
# createOrReplaceGlobalTempView keeps this cell re-runnable:
# createGlobalTempView raises an error when "global_orders" already exists,
# which is what happens the second time the cell is executed.
orders_df.createOrReplaceGlobalTempView("global_orders")

# Access global view (note the global_temp prefix)
result = spark.sql("SELECT * FROM global_temp.global_orders LIMIT 5")
print("Global view query:")
result.show()

# The status marker was mis-encoded in the original string; restored to the
# intended check-mark character.
print("\n✅ Global views accessible across sessions with global_temp prefix")

Global view query:
+--------+----------+-----------+------------+-----------+--------+----------+----------------+------------+-------+---------+--------------+-------------+
|order_id|order_date|customer_id|product_name|   category|quantity|unit_price|discount_percent|total_amount|country|   status|payment_method|shipping_days|
+--------+----------+-----------+------------+-----------+--------+----------+----------------+------------+-------+---------+--------------+-------------+
|    1001|2025-01-15|       5023|      LAPTOP|Electronics|       2|    899.99|              10|     1619.98|    USA|Completed|   Credit Card|            5|
|    1002|2025-01-18|       5012|  HEADPHONES|Electronics|       1|     79.99|               0|       79.99|     UK|Completed|        PayPal|            3|
|    1003|2025-02-03|       5045|    KEYBOARD|Electronics|       3|     49.99|              15|      127.47| Canada|  Shipped|    Debit Card|            7|
|    1004|2025-02-10|       5008|     MONITOR

In [17]:
# Code Cell 13: List Views
print("Current temporary views:")

# Left as the bare final expression so the notebook renders the returned list.
# The recorded output shows only the session-scoped views ("categories",
# "orders"); the global view lives under global_temp and is not listed here.
spark.catalog.listTables()

Current temporary views:


[Table(name='categories', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='orders', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [18]:
# Code Cell 14: Drop Views
# Remove the session-scoped "orders" view from the catalog.
spark.catalog.dropTempView("orders")
# The status marker was mis-encoded in the original string; restored to the
# intended check-mark character.
print("✅ Temporary view 'orders' dropped")

print("\nRemaining views:")
# Bare final expression so the notebook renders the returned list.
spark.catalog.listTables()

✅ Temporary view 'orders' dropped

Remaining views:


[Table(name='categories', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [19]:
# Code Cell 15: Real-World Example - Daily Sales Report
# Recreate view for final example (it was dropped in the previous step).
orders_df.createOrReplaceTempView("orders")

# The CTE aggregates orders per (category, country). Despite its name it does
# not group by order_date, so the figures cover every date in the data.
# The outer query rounds the figures and uses a window SUM over each category
# to express every country's revenue as a percentage of its category total.
sales_report = spark.sql("""
    WITH daily_summary AS (
        SELECT
            category,
            country,
            COUNT(*) as orders,
            SUM(total_amount) as revenue,
            AVG(total_amount) as avg_order_value
        FROM orders
        GROUP BY category, country
    )
    SELECT
        category,
        country,
        orders,
        ROUND(revenue, 2) as revenue,
        ROUND(avg_order_value, 2) as avg_order_value,
        ROUND(revenue * 100.0 / SUM(revenue) OVER (PARTITION BY category), 2) as pct_of_category
    FROM daily_summary
    ORDER BY category, revenue DESC
""")

# The emoji markers in the two printed strings were mis-encoded in the
# original; restored to the intended characters.
print("📊 Daily Sales Report:")
sales_report.show(20, truncate=False)

# Save report to CSV
# coalesce(1) collapses the result to a single partition before writing and
# mode("overwrite") replaces any earlier output, so the cell can be re-run.
# NOTE: Spark's writer creates a directory at this path containing the part
# file(s), not a single flat CSV file.
sales_report.coalesce(1).write.mode("overwrite").csv("daily_sales_report.csv", header=True)
print("\n✅ Report saved to daily_sales_report.csv")

📊 Daily Sales Report:
+-----------+---------+------+-------+---------------+---------------+
|category   |country  |orders|revenue|avg_order_value|pct_of_category|
+-----------+---------+------+-------+---------------+---------------+
|Accessories|India    |4     |324.84 |81.21          |31.58          |
|Accessories|Canada   |4     |293.94 |73.49          |28.58          |
|Accessories|Japan    |4     |242.95 |60.74          |23.62          |
|Accessories|Germany  |4     |166.91 |41.73          |16.23          |
|Electronics|USA      |9     |6290.82|698.98         |40.74          |
|Electronics|France   |8     |4483.34|560.42         |29.04          |
|Electronics|India    |4     |1274.95|318.74         |8.26           |
|Electronics|Germany  |5     |1229.95|245.99         |7.97           |
|Electronics|UK       |5     |671.91 |134.38         |4.35           |
|Electronics|Australia|4     |603.92 |150.98         |3.91           |
|Electronics|Canada   |5     |479.89 |95.98         