In [0]:
# Load data
# Read the October 2019 event log; the first row holds the column names and
# Spark is asked to infer the column types.
events_oct = spark.read.csv(
    path="/Volumes/workspace/ecommerce_idc/ecommerce_data_idc/2019-Oct.csv",
    header=True,
    inferSchema=True,
)

### PySpark Transformations

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

#### Top 5 products by revenue

In [0]:
# Top 5 products by revenue
# Sum the price of purchase events per product, then keep the five largest totals.
revenue = (
    events_oct
    .where(F.col("event_type") == "purchase")
    .groupBy("product_id")
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.col("revenue").desc())
    .limit(5)
)
display(revenue)

product_id,revenue
1005115,12406807.350000003
1005105,10239248.679999996
1004249,6730112.920000011
1005135,5567806.640000007
1004767,5430723.430000007


#### Running total per user

In [0]:
# Running total per user
# Count each user's events one row at a time in event_time order.
# The frame is spelled out as ROWS because an ordered window without an explicit
# frame defaults to a RANGE frame, under which events of the same user that
# share an event_time would all receive the same cumulative count.
window = (
    Window.partitionBy("user_id")
    .orderBy("event_time")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
events_oct.withColumn("cumulative_events", F.count("*").over(window)).show()

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+-----------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|cumulative_events|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+-----------------+
|2019-10-09 10:30:19|      view|  17301541|2053013553853497655|                NULL|    NULL|162.17|205053188|e1eadbc6-aef5-4cf...|                1|
|2019-10-09 10:30:44|      view|  17301541|2053013553853497655|                NULL|    NULL|162.17|205053188|e1eadbc6-aef5-4cf...|                2|
|2019-10-07 06:23:01|      view|  16200119|2053013556344914381|   kids.fmcg.diapers|   moony| 18.47|222907508|cb653adc-46a2-4d9...|                1|
|2019-10-07 06:26:23|      view|  16200162|2053013556344914381|   kids.fmcg.diapers|   moony| 18.47|

In [0]:
# List the distinct event types present in the data.
(
    events_oct
    .select("event_type")
    .distinct()
    .show()
)

+----------+
|event_type|
+----------+
|  purchase|
|      cart|
|      view|
+----------+



#### Conversion rate by category

In [0]:
# Conversion rate by category
# Spot check: number of events per event type for the "kids.swing" category.
(
    events_oct
    .groupBy("category_code", "event_type")
    .count()
    .where(F.col("category_code") == "kids.swing")
    .show()
)



+-------------+----------+-----+
|category_code|event_type|count|
+-------------+----------+-----+
|   kids.swing|  purchase|  330|
|   kids.swing|      view|31596|
|   kids.swing|      cart|  147|
+-------------+----------+-----+



In [0]:
# Count events per category with one column per event type, then express
# purchases as a percentage of views.
# The pivot values are listed explicitly so Spark does not need an extra pass
# over the data to discover them; event types outside this list are ignored.
# A category with no rows for an event type gets null in that column, so its
# conversion_rate is null when purchase or view is missing.
result_df = (
    events_oct
    .groupBy("category_code")
    .pivot("event_type", ["cart", "purchase", "view"])
    .count()
    .withColumn("conversion_rate", F.col("purchase") / F.col("view") * 100)
)

display(result_df)

category_code,cart,purchase,view,conversion_rate
auto.accessories.parktronic,,46.0,12305,0.3738317757009346
furniture.living_room.sofa,,1084.0,215471,0.5030839416905292
stationery.cartrige,106.0,134.0,7380,1.815718157181572
sport.bicycle,693.0,838.0,128759,0.6508282916145668
apparel.sock,7.0,21.0,2621,0.8012209080503624
appliances.environment.fan,16.0,27.0,2172,1.2430939226519335
kids.swing,147.0,330.0,31596,1.044436004557539
electronics.audio.microphone,196.0,430.0,28394,1.5144044516447135
auto.accessories.radar,716.0,494.0,42350,1.166469893742621
electronics.clocks,20344.0,17906.0,1272783,1.4068384005757462


#### Joins

In [0]:
# Small demo tables for the join examples.
# Order 4 points at customer 104, which has no customer row, and
# customer 103 (Carol) has no orders, so each join type gives a different result.
orders = spark.createDataFrame(
    data=[(1, 101), (2, 102), (3, 101), (4, 104)],
    schema=["order_id", "customer_id"],
)

customers = spark.createDataFrame(
    data=[(101, "Alice"), (102, "Bob"), (103, "Carol")],
    schema=["customer_id", "customer_name"],
)


In [0]:
# Inner join: only customers that have at least one matching order.
inner_df = customers.join(orders, on="customer_id", how="inner")
inner_df.show()


+-----------+-------------+--------+
|customer_id|customer_name|order_id|
+-----------+-------------+--------+
|        101|        Alice|       3|
|        101|        Alice|       1|
|        102|          Bob|       2|
+-----------+-------------+--------+



In [0]:
# Left join: every customer, with a null order_id when the customer has no orders.
left_df = customers.join(orders, on="customer_id", how="left")
left_df.show()

+-----------+-------------+--------+
|customer_id|customer_name|order_id|
+-----------+-------------+--------+
|        101|        Alice|       3|
|        101|        Alice|       1|
|        102|          Bob|       2|
|        103|        Carol|    NULL|
+-----------+-------------+--------+



In [0]:
# Right join: every order, with a null customer_name when the customer is unknown.
right_df = customers.join(orders, on="customer_id", how="right")
right_df.show()

+-----------+-------------+--------+
|customer_id|customer_name|order_id|
+-----------+-------------+--------+
|        101|        Alice|       1|
|        102|          Bob|       2|
|        101|        Alice|       3|
|        104|         NULL|       4|
+-----------+-------------+--------+



In [0]:
# Full outer join: every customer and every order, with nulls on whichever side is missing.
outer_df = customers.join(orders, on="customer_id", how="outer")
outer_df.show()

+-----------+-------------+--------+
|customer_id|customer_name|order_id|
+-----------+-------------+--------+
|        101|        Alice|       1|
|        102|          Bob|       2|
|        101|        Alice|       3|
|        104|         NULL|       4|
|        103|        Carol|    NULL|
+-----------+-------------+--------+



#### UDF 

In [0]:
# Sample orders carrying an amount column, used by the UDF example.
orders = spark.createDataFrame(
    data=[
        (1, 101, 12000),
        (2, 102, 45000),
        (3, 101, 8000),
        (4, 103, 67000),
        (5, 104, 3000),
    ],
    schema=["order_id", "customer_id", "order_amount"],
)

orders.show()


+--------+-----------+------------+
|order_id|customer_id|order_amount|
+--------+-----------+------------+
|       1|        101|       12000|
|       2|        102|       45000|
|       3|        101|        8000|
|       4|        103|       67000|
|       5|        104|        3000|
+--------+-----------+------------+



In [0]:
def order_value_category(amount, low_threshold=10000, high_threshold=50000):
    """Bucket an order amount into "LOW", "MEDIUM" or "HIGH".

    Args:
        amount: Order amount to classify, or None when the value is missing.
        low_threshold: Amounts strictly below this are "LOW".
        high_threshold: Amounts up to and including this (and not "LOW")
            are "MEDIUM"; anything larger is "HIGH".

    Returns:
        "LOW", "MEDIUM" or "HIGH", or None when ``amount`` is None.
    """
    # A null column value reaches a Python UDF as None; comparing None with a
    # number raises TypeError, so pass the null through instead.
    if amount is None:
        return None
    if amount < low_threshold:
        return "LOW"
    if amount <= high_threshold:
        return "MEDIUM"
    return "HIGH"


In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Wrap the Python function as a Spark UDF that returns a string column.
order_value_udf = udf(f=order_value_category, returnType=StringType())

In [0]:
# Add the value bucket for each order by applying the UDF to order_amount.
orders_with_category = orders.withColumn("order_category", order_value_udf(orders.order_amount))

orders_with_category.show()


+--------+-----------+------------+--------------+
|order_id|customer_id|order_amount|order_category|
+--------+-----------+------------+--------------+
|       1|        101|       12000|        MEDIUM|
|       2|        102|       45000|        MEDIUM|
|       3|        101|        8000|           LOW|
|       4|        103|       67000|          HIGH|
|       5|        104|        3000|           LOW|
+--------+-----------+------------+--------------+

