#Load Your Dataset

In [0]:
# Load the raw 2019-Nov.csv event log into a DataFrame.
# inferSchema=True asks Spark to derive column types from the data instead of
# reading every column as a string; header=True takes column names from row one.
events = spark.read.csv(
    path="/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    inferSchema=True,
    header=True,
)

# Quick sanity check: inferred column types, then a five-row preview.
events.printSchema()
events.show(5)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|53

#Convert CSV DataFrame to Delta Format

In [0]:
# Persist the CSV-backed DataFrame in Delta format at a storage path.
# mode("overwrite") replaces whatever is already stored at that location.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/ecommerce_delta")
)

# Read the freshly written Delta data back and preview five rows.
(
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_delta")
    .show(5)
)

#Verify Delta Table

In [0]:
# Load the Delta table from its storage path and show five rows to confirm
# that the data is readable in Delta format.
(
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_delta")
    .show(5)
)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

#Create Delta Tables (PySpark)

In [0]:
# Register the events DataFrame as a Delta table named events_table.
# Without an explicit mode the writer errors out when the table already
# exists, so this cell would fail on any re-run (e.g. Restart & Run All).
# mode("overwrite") replaces the table contents and keeps the cell idempotent.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("events_table")
)

#Query the Delta Table (SQL cell)

In [0]:
%sql
-- Number of rows per event_type in the events_table Delta table.
SELECT event_type, COUNT(*) AS cnt
FROM events_table
GROUP BY event_type;

event_type,cnt
purchase,916939
cart,3028930
view,63556110


#Create Delta Table Using SQL

In [0]:
%sql
-- Create events_delta as a Delta copy of events_table (CREATE TABLE ... AS SELECT).
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails once
-- events_delta already exists.
CREATE OR REPLACE TABLE events_delta
USING DELTA
AS
SELECT * FROM events_table;



num_affected_rows,num_inserted_rows


#Test Schema Enforcement

In [0]:
# Schema enforcement demo: try to append a DataFrame whose columns (x, y, z)
# do not match the events schema, and show the error Delta raises.
#
# The append must target a path that already holds the events Delta table
# (written earlier with mode("overwrite")). Appending to a path with no
# existing table would just create a new (x, y, z) table, raise nothing, and
# leave a mismatched table behind for later cells to trip over.
try:
    wrong_schema = spark.createDataFrame(
        [("a", "b", "c")],
        ["x", "y", "z"]
    )

    wrong_schema.write \
        .format("delta") \
        .mode("append") \
        .save("/Volumes/workspace/ecommerce/ecommerce_delta")

except Exception as e:
    # Expected outcome: the mismatched append is rejected.
    print("Schema enforcement working!")
    print(e)
else:
    # No exception means the bad rows were accepted; say so instead of
    # finishing silently.
    print("Schema enforcement NOT triggered: the mismatched write succeeded.")

#Demonstrate Duplicate Insert Problem

In [0]:
# Append the full events DataFrame to the Delta table at the events path.
# Append mode adds rows without checking whether they are already present.
(
    events.write
    .format("delta")
    .mode("append")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/events")
)

# Row count of the table after the append (displayed as the cell result).
(
    spark.read
    .format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/events")
    .count()
)

67501979

In [0]:
# Start from a clean location: drop the old Delta table (and its schema)
# before new data is written. The recursive flag deletes every file found
# under the given path.
dbutils.fs.rm("/Volumes/workspace/ecommerce/ecommerce_data/events", recurse=True)

True

#Handle Duplicates

In [0]:
# Drop repeated events before persisting them.
# event_type is part of the key: without it, a "view" and a "cart"/"purchase"
# logged for the same user, product and timestamp would be collapsed into a
# single arbitrary row, silently discarding real events.
events_deduped = events.dropDuplicates(
    ["user_id", "event_time", "event_type", "product_id"]
)

# Write the de-duplicated events to the events path, replacing any data
# already stored there.
events_deduped.write \
    .format("delta") \
    .mode("overwrite") \
    .save("/Volumes/workspace/ecommerce/ecommerce_data/events")