# Read data into a DataFrame

In [0]:
# Load the 2019-Nov.csv event log from the volume. The first row supplies the
# column names, and column types are inferred from the data.
events = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


# Perform basic operations

In [0]:
# Project three columns and preview the first five rows.
events.select(["event_time", "event_type", "price"]).show(n=5)

+-------------------+----------+------+
|         event_time|event_type| price|
+-------------------+----------+------+
|2019-11-01 00:00:00|      view|489.07|
|2019-11-01 00:00:00|      view|293.65|
|2019-11-01 00:00:01|      view| 28.31|
|2019-11-01 00:00:01|      view|712.87|
|2019-11-01 00:00:01|      view|183.27|
+-------------------+----------+------+
only showing top 5 rows


In [0]:
# Keep only purchase events and preview the first five rows.
events.where(events["event_type"] == "purchase").show(5)

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-11-01 00:00:41|  purchase|  13200605|2053013557192163841|furniture.bedroom...|   NULL| 566.3|559368633|d6034fa2-41fb-4ac...|
|2019-11-01 00:01:04|  purchase|   1005161|2053013555631882655|electronics.smart...| xiaomi|211.92|513351129|e6b7ce9b-1938-4e2...|
|2019-11-01 00:04:51|  purchase|   1004856|2053013555631882655|electronics.smart...|samsung|128.42|562958505|0f039697-fedc-40f...|
|2019-11-01 00:05:34|  purchase|  26401669|2053013563651392361|                NULL|lucente|109.66|541854711|c41c44d5-ef9b-41b...|
|2019-11-01 00:06:33|  purchase|   1801881|2053013554415534427|electronics.video.tv

In [0]:
# Count how many events fall under each event type.
(
    events
    .groupBy("event_type")
    .count()
    .show()
)

+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  916939|
|      cart| 3028930|
|      view|63556110|
+----------+--------+



In [0]:
# Sort by price, highest first, and preview the first five rows.
events.orderBy(events["price"].desc()).show(5)

+-------------------+----------+----------+-------------------+------------------+-----+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|     category_code|brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+------------------+-----+-------+---------+--------------------+
|2019-11-01 00:07:29|      view|  21408165|2053013561579406073|electronics.clocks| NULL|2574.07|513118352|4c14bf2a-2820-450...|
|2019-11-01 02:18:16|      view|  21408156|2053013561579406073|electronics.clocks| NULL|2574.07|515129276|dacc1c2b-d20e-4fa...|
|2019-11-01 00:45:07|      view|  21408160|2053013561579406073|electronics.clocks| NULL|2574.07|512376038|3ec14458-4d55-4f7...|
|2019-11-01 00:08:00|      view|  21408165|2053013561579406073|electronics.clocks| NULL|2574.07|513118352|4c14bf2a-2820-450...|
|2019-11-01 00:45:48|      view|  21408160|2053013561579406073|electronics.clocks| NULL|2574.07|51237603

In [0]:
# Rank brands by the number of events that reference them and keep the top five.
# Rows whose brand is NULL are dropped first: groupBy() treats NULL as a group
# of its own, so unbranded products would otherwise be able to take a slot in
# the "top brands" ranking.
top_brands = (
    events.filter("brand IS NOT NULL")
    .groupBy("brand")
    .count()
    # The secondary sort on brand keeps the selection stable when counts tie.
    .orderBy(["count", "brand"], ascending=[False, True])
    .limit(5)
)

# Persist the ranking as CSV with a header row. Spark writes a *directory* at
# the target path; coalesce(1) makes it contain a single part file. Any
# previous output at that path is replaced ("overwrite" mode).
(
    top_brands.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("dbfs:/Volumes/workspace/default/analytics_volume/top_brands_single_csv")
)
