In [0]:
# Evaluate the notebook-provided session object to confirm a SparkSession is available.
spark

<pyspark.sql.connect.session.SparkSession at 0xff9c81b58e60>

In [0]:
# Load every column of the raw (uncleaned) orders table into a DataFrame.
RAW_ORDERS_QUERY = "SELECT * FROM workspace.default.ecommerce_messy_data"
orders_df = spark.sql(RAW_ORDERS_QUERY)

In [0]:
# Print a plain-text table preview of the raw orders.
orders_df.show()

+--------+-------------------+-----------+---------+--------------+------------+
|order_id|       product_name|   category|  country|payment_method|total_amount|
+--------+-------------------+-----------+---------+--------------+------------+
|       1|        LAPTOP_PRO |Electronics|      USA|   Credit Card|     1299.99|
|       2|       office chair|  Furniture|   Canada|    Debit Card|       299.5|
|       3|     wireless_Mouse|Electronics|       UK|        PayPal|       49.99|
|       4|          DESK_LAMP|  Furniture|      USA|   Credit Card|       79.99|
|       5|      notebook_SET | Stationery|  Germany| Bank Transfer|       29.99|
|       6|          USB Cable|Accessories|   France|        PayPal|       15.99|
|       7|mechanical_KEYBOARD|Electronics|Australia|    Debit Card|      159.99|
|       8|         MONITOR_4K|Electronics|    India|   Credit Card|      599.99|
|       9|   ergonomic_CHAIR |  Furniture|   Canada|        PayPal|      449.99|
|      10|wireless HEADPHONE

In [0]:
# Render the raw orders with the notebook's rich table display.
display(orders_df)

order_id,product_name,category,country,payment_method,total_amount
1,LAPTOP_PRO,Electronics,USA,Credit Card,1299.99
2,office chair,Furniture,Canada,Debit Card,299.5
3,wireless_Mouse,Electronics,UK,PayPal,49.99
4,DESK_LAMP,Furniture,USA,Credit Card,79.99
5,notebook_SET,Stationery,Germany,Bank Transfer,29.99
6,USB Cable,Accessories,France,PayPal,15.99
7,mechanical_KEYBOARD,Electronics,Australia,Debit Card,159.99
8,MONITOR_4K,Electronics,India,Credit Card,599.99
9,ergonomic_CHAIR,Furniture,Canada,PayPal,449.99
10,wireless HEADPHONES,Electronics,USA,Credit Card,199.99


In [0]:
# Print each column's name, type and nullability to confirm what was loaded.
orders_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- country: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [0]:
# Count the rows in the raw orders DataFrame.
orders_df.count()

20

In [0]:
# ============================================
# CELL 5: Transform - Trim spaces from product_name
# ============================================
# Narration: "Now let's do a simple transformation"
# "We'll clean product names by removing leading and trailing spaces"

from pyspark.sql.functions import col, trim

# Replace product_name with its whitespace-trimmed value; every other column
# is carried over from orders_df unchanged.
cleaned_df = orders_df.withColumn(
    "product_name",
    trim(orders_df["product_name"]),
)

In [0]:
# Render the DataFrame produced by the trim transformation.
display(cleaned_df)

order_id,product_name,category,country,payment_method,total_amount
1,LAPTOP_PRO,Electronics,USA,Credit Card,1299.99
2,office chair,Furniture,Canada,Debit Card,299.5
3,wireless_Mouse,Electronics,UK,PayPal,49.99
4,DESK_LAMP,Furniture,USA,Credit Card,79.99
5,notebook_SET,Stationery,Germany,Bank Transfer,29.99
6,USB Cable,Accessories,France,PayPal,15.99
7,mechanical_KEYBOARD,Electronics,Australia,Debit Card,159.99
8,MONITOR_4K,Electronics,India,Credit Card,599.99
9,ergonomic_CHAIR,Furniture,Canada,PayPal,449.99
10,wireless HEADPHONES,Electronics,USA,Credit Card,199.99


In [0]:

# ============================================
# CELL 7: Write cleaned data to new Delta Table
# ============================================
# Narration: "Now let's save this cleaned data as a new Delta Table"

# Use the fully qualified catalog.schema.table name so the write always lands in
# the same table the follow-up query reads (workspace.default.ecommerce_cleaned),
# regardless of the session's current catalog/schema.
# mode("overwrite") replaces the table's existing contents on each run.
(
    cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.default.ecommerce_cleaned")
)



In [0]:
# ============================================
# CELL 8: Query the new cleaned table
# ============================================
# Narration: "Let's query the new cleaned table"

# Read the saved table back through SQL, then render it, to confirm the write.
cleaned_table_df = spark.sql("SELECT * FROM workspace.default.ecommerce_cleaned")
display(cleaned_table_df)


order_id,product_name,category,country,payment_method,total_amount
1,LAPTOP_PRO,Electronics,USA,Credit Card,1299.99
2,office chair,Furniture,Canada,Debit Card,299.5
3,wireless_Mouse,Electronics,UK,PayPal,49.99
4,DESK_LAMP,Furniture,USA,Credit Card,79.99
5,notebook_SET,Stationery,Germany,Bank Transfer,29.99
6,USB Cable,Accessories,France,PayPal,15.99
7,mechanical_KEYBOARD,Electronics,Australia,Debit Card,159.99
8,MONITOR_4K,Electronics,India,Credit Card,599.99
9,ergonomic_CHAIR,Furniture,Canada,PayPal,449.99
10,wireless HEADPHONES,Electronics,USA,Credit Card,199.99


In [0]:
# ============================================
# CELL 9: Compare original vs cleaned (side by side)
# ============================================
# Narration: "Let's compare the original and cleaned product names"

# Show the same two columns from each DataFrame under its own heading,
# original first and cleaned second.
for heading, frame in (
    ("ORIGINAL DATA:", orders_df),
    ("\nCLEANED DATA:", cleaned_df),
):
    print(heading)
    display(frame.select("order_id", "product_name"))

ORIGINAL DATA:


order_id,product_name
1,LAPTOP_PRO
2,office chair
3,wireless_Mouse
4,DESK_LAMP
5,notebook_SET
6,USB Cable
7,mechanical_KEYBOARD
8,MONITOR_4K
9,ergonomic_CHAIR
10,wireless HEADPHONES



CLEANED DATA:


order_id,product_name
1,LAPTOP_PRO
2,office chair
3,wireless_Mouse
4,DESK_LAMP
5,notebook_SET
6,USB Cable
7,mechanical_KEYBOARD
8,MONITOR_4K
9,ergonomic_CHAIR
10,wireless HEADPHONES


In [0]:
# ============================================
# CELL 10 (Optional): Show specific examples of trimming
# ============================================
# Narration: "Let's see specific examples where trimming made a difference"

from pyspark.sql.functions import col, concat, length, lit, trim

# Build the before/after view in a single projection over orders_df instead of
# self-joining with cleaned_df on order_id. A join would multiply rows if
# order_id is ever duplicated (pairing one row's original name with another
# row's cleaned name) and would drop rows whose order_id is null.
original_name = col("product_name")
trimmed_name = trim(original_name)  # same trim applied when building cleaned_df

display(
    orders_df.select(
        "order_id",
        # Brackets make leading/trailing spaces visible in the rendered table.
        concat(lit("["), original_name, lit("]")).alias("original_with_brackets"),
        length(original_name).alias("original_length"),
        concat(lit("["), trimmed_name, lit("]")).alias("cleaned_with_brackets"),
        length(trimmed_name).alias("cleaned_length"),
    )
    # Keep only rows where trimming actually removed characters.
    .filter(col("original_length") != col("cleaned_length"))
)

order_id,original_with_brackets,original_length,cleaned_with_brackets,cleaned_length
1,[LAPTOP_PRO ],11,[LAPTOP_PRO],10
3,[ wireless_Mouse],15,[wireless_Mouse],14
5,[notebook_SET ],13,[notebook_SET],12
8,[ MONITOR_4K],11,[MONITOR_4K],10
9,[ergonomic_CHAIR ],16,[ergonomic_CHAIR],15
11,[ PRINTER_INK],12,[PRINTER_INK],11
13,[NOTEBOOK_PACK ],14,[NOTEBOOK_PACK],13
14,[ tablet_stand],13,[tablet_stand],12
16,[desk_ORGANIZER ],15,[desk_ORGANIZER],14
17,[ LAPTOP_BAG],11,[LAPTOP_BAG],10
