In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("question2").getOrCreate()

In [20]:
from pyspark.sql.functions import col, translate, explode


product_views = spark.read.json(r"data\product-views.json")
orders = spark.read.json(r"data\orders.json")
product_category_map = spark.read.csv(r"data\product-category-map.csv", header=True)

product_views = product_views.withColumn("event", col("event")) \
            .withColumn("messageid", col("messageid")) \
            .withColumn("userid", col("userid")) \
            .withColumn("productid", translate(col("properties").cast("string"), "[]","")) \
            .withColumn("source", translate(col("context").cast("string"), "[]", "")) \
            .drop("context").drop("properties")

orders = orders.select(orders.event,
                       orders.messageid,
                       orders.userid,
                       explode(orders.lineitems).alias("lineitems"),
                       orders.orderid) \
                .withColumn("lineitems", translate(col("lineitems").cast("string"), "[]", ""))

product_views.createOrReplaceTempView("product_views")
orders.createOrReplaceTempView("orders")
product_category_map.createOrReplaceTempView("product_category_map")

In [4]:
spark.sql("select * from orders o limit 5").show(truncate=False)

+----------+------------------------------------+--------+--------------+-------+
|event     |messageid                           |userid  |lineitems     |orderid|
+----------+------------------------------------+--------+--------------+-------+
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-784, 3|50000  |
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-173, 1|50000  |
|OrderEvent|f05643d0-29e7-47ca-a1b3-12ba3e8642e3|user-391|product-424, 1|50001  |
|OrderEvent|30d5cb63-63bb-42a2-b27a-673f3a4e7925|user-120|product-393, 3|50002  |
|OrderEvent|60c21fb3-a115-4f10-b595-6f07998de71e|user-79 |product-369, 3|50003  |
+----------+------------------------------------+--------+--------------+-------+



In [6]:
df = spark.sql("""
select event, messageid, userid,
		regexp_replace(substr(lineitems , 0, instr(lineitems, ',')), ",", "") as productid,
		substr(lineitems , instr(lineitems, ',')+2, length(lineitems)) as quantity,
		orderid
from orders o
""")
df.show(5, truncate=False)

+----------+------------------------------------+--------+------------+--------+-------+
|event     |messageid                           |userid  |productid   |quantity|orderid|
+----------+------------------------------------+--------+------------+--------+-------+
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-784,|3       |50000  |
|OrderEvent|2db62eb5-de95-4ce8-8161-ab7552dc2fd7|user-346|product-173,|1       |50000  |
|OrderEvent|f05643d0-29e7-47ca-a1b3-12ba3e8642e3|user-391|product-424,|1       |50001  |
|OrderEvent|30d5cb63-63bb-42a2-b27a-673f3a4e7925|user-120|product-393,|3       |50002  |
|OrderEvent|60c21fb3-a115-4f10-b595-6f07998de71e|user-79 |product-369,|3       |50003  |
+----------+------------------------------------+--------+------------+--------+-------+
only showing top 5 rows



In [24]:
from pyspark.sql.functions import regexp_replace

In [23]:
df = spark.sql("""
WITH new_orders
AS (select event, messageid, userid,
		regexp_replace(substr(lineitems , 0, instr(lineitems, ',')), ",", "") as productid,
		substr(lineitems , instr(lineitems, ',')+2, length(lineitems)) as quantity,
		orderid
	from orders o)
select productid, count(distinct userid) uniqueuserboughts from new_orders no group by productid order by uniqueuserboughts
""").show(5, truncate=False)

+-----------+-----------------+
|productid  |uniqueuserboughts|
+-----------+-----------------+
|product-902|1                |
|product-944|1                |
|product-960|1                |
|product-952|1                |
|product-896|1                |
+-----------+-----------------+
only showing top 5 rows



In [25]:
df = spark.sql("""
WITH new_orders
AS (select event, messageid, userid,
		regexp_replace(substr(lineitems , 0, instr(lineitems, ',')), ",", "") as productid,
		substr(lineitems , instr(lineitems, ',')+2, length(lineitems)) as quantity,
		orderid
	from orders o)
select no.productid, count(distinct userid) uniqueuserboughts, pcm.categoryid
from new_orders no
inner join product_category_map pcm
on no.productid=pcm.productid
group by no.productid, pcm.categoryid order by pcm.categoryid, uniqueuserboughts desc
""").show(5, truncate=False)

+-----------+-----------------+----------+
|productid  |uniqueuserboughts|categoryid|
+-----------+-----------------+----------+
|product-158|59               |category-1|
|product-45 |58               |category-1|
|product-72 |54               |category-1|
|product-35 |54               |category-1|
|product-30 |53               |category-1|
+-----------+-----------------+----------+
only showing top 5 rows



In [26]:
df = spark.sql("""
select * from (
WITH new_orders
AS (select event, messageid, userid,
		regexp_replace(substr(lineitems , 0, instr(lineitems, ',')), ",", "") as productid,
		substr(lineitems , instr(lineitems, ',')+2, length(lineitems)) as quantity,
		orderid
	from orders o)
select no.productid, count(distinct userid) uniqueuserboughts, pcm.categoryid,
	ROW_NUMBER() over (Partition BY pcm.categoryid order by count(distinct userid) desc) AS rn
from new_orders no
inner join product_category_map pcm
on no.productid=pcm.productid
group by no.productid,  pcm.categoryid order by pcm.categoryid, uniqueuserboughts desc) q
where rn between 1 and 10
""").show(5, truncate=False)

+-----------+-----------------+----------+---+
|productid  |uniqueuserboughts|categoryid|rn |
+-----------+-----------------+----------+---+
|product-158|59               |category-1|1  |
|product-45 |58               |category-1|2  |
|product-72 |54               |category-1|4  |
|product-35 |54               |category-1|3  |
|product-30 |53               |category-1|5  |
+-----------+-----------------+----------+---+
only showing top 5 rows



In [28]:
df = spark.sql("""
select * from (
WITH new_orders
AS (select event, messageid, userid,
		regexp_replace(substr(lineitems , 0, instr(lineitems, ',')), ",", "") as productid,
		substr(lineitems , instr(lineitems, ',')+2, length(lineitems)) as quantity,
		orderid
	from orders o)
select no.productid, count(distinct userid) uniqueuserboughts, pcm.categoryid,
	ROW_NUMBER() over (Partition BY pcm.categoryid order by count(distinct userid) desc) AS rn
from new_orders no
inner join product_category_map pcm
on no.productid=pcm.productid
group by no.productid,  pcm.categoryid order by pcm.categoryid, uniqueuserboughts desc) q
where rn between 1 and 10
""").show(5, truncate=False)

+-----------+-----------------+----------+---+
|productid  |uniqueuserboughts|categoryid|rn |
+-----------+-----------------+----------+---+
|product-158|59               |category-1|1  |
|product-45 |58               |category-1|2  |
|product-72 |54               |category-1|4  |
|product-35 |54               |category-1|3  |
|product-30 |53               |category-1|5  |
+-----------+-----------------+----------+---+
only showing top 5 rows

