In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists in this kernel) the Spark session
# that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("question1")
    .getOrCreate()
)

In [2]:
import os

from pyspark.sql.functions import col, translate, explode

# Directory holding the raw input files, relative to the notebook's working directory.
DATA_DIR = "data"

# Characters Spark wraps around a struct when it is cast to a string:
# "[...]" up to Spark 3.0 and "{...}" from Spark 3.1 onwards. Stripping both
# keeps the extracted values (and therefore the later join keys) the same on
# either version.
STRUCT_WRAPPERS = "[]{}"

# os.path.join picks the platform's separator, so the notebook is no longer
# tied to Windows-style "data\..." paths.
product_views = spark.read.json(os.path.join(DATA_DIR, "product-views.json"))
orders = spark.read.json(os.path.join(DATA_DIR, "orders.json"))
product_category_map = spark.read.csv(
    os.path.join(DATA_DIR, "product-category-map.csv"), header=True
)

# Flatten the nested `properties` / `context` columns into plain string columns
# by casting to string and removing the wrapper characters.
# NOTE(review): presumably each of these is a single-field struct; if the schema
# is confirmed, direct field access would be cleaner than cast + translate.
product_views = (
    product_views
    .withColumn("productid", translate(col("properties").cast("string"), STRUCT_WRAPPERS, ""))
    .withColumn("source", translate(col("context").cast("string"), STRUCT_WRAPPERS, ""))
    .drop("context", "properties")
)

# One output row per element of `lineitems`; each element is then rendered as a
# string with the wrapper characters removed.
orders = (
    orders
    .select(
        orders.event,
        orders.messageid,
        orders.userid,
        explode(orders.lineitems).alias("lineitems"),
        orders.orderid,
    )
    .withColumn("lineitems", translate(col("lineitems").cast("string"), STRUCT_WRAPPERS, ""))
)

# Expose the three frames to Spark SQL under the names used by later cells.
product_views.createOrReplaceTempView("product_views")
orders.createOrReplaceTempView("orders")
product_category_map.createOrReplaceTempView("product_category_map")

In [3]:
# Sanity-check the product -> category lookup registered as a temp view.
preview_query = "select * from product_category_map limit 5"
spark.sql(preview_query).show()

+---------+-----------+
|productid| categoryid|
+---------+-----------+
|product-1|category-20|
|product-2| category-8|
|product-3| category-4|
|product-4| category-6|
|product-5| category-1|
+---------+-----------+



In [7]:
# Top 10 products per category, ranked by the number of distinct users who
# viewed them.
#  - `pv.productid` is a secondary sort key inside the window so that products
#    with equal view counts always receive the same rank (ROW_NUMBER alone
#    breaks ties arbitrarily, which made the top-10 cut non-deterministic).
#  - The final ORDER BY lives on the outer query; an ORDER BY inside a
#    subquery does not guarantee the order of the overall result.
df = spark.sql("""
select productid, uniqueuserviews, categoryid, rn
from (
    select pv.productid,
           count(distinct pv.userid) as uniqueuserviews,
           pcm.categoryid,
           row_number() over (
               partition by pcm.categoryid
               order by count(distinct pv.userid) desc, pv.productid
           ) as rn
    from product_views pv
    inner join product_category_map pcm
        on pv.productid = pcm.productid
    group by pv.productid, pcm.categoryid
) q
where rn <= 10
order by categoryid, rn
""")
df.show(truncate=False)

+-----------+---------------+-----------+---+
|productid  |uniqueuserviews|categoryid |rn |
+-----------+---------------+-----------+---+
|product-132|166            |category-1 |1  |
|product-125|157            |category-1 |2  |
|product-35 |156            |category-1 |3  |
|product-22 |155            |category-1 |4  |
|product-42 |155            |category-1 |5  |
|product-171|154            |category-1 |6  |
|product-195|152            |category-1 |7  |
|product-45 |150            |category-1 |8  |
|product-72 |149            |category-1 |9  |
|product-158|147            |category-1 |10 |
|product-61 |159            |category-10|1  |
|product-178|156            |category-10|2  |
|product-184|151            |category-10|3  |
|product-89 |148            |category-10|4  |
|product-12 |147            |category-10|5  |
|product-143|144            |category-10|6  |
|product-118|142            |category-10|7  |
|product-73 |140            |category-10|8  |
|product-9  |139            |categ

In [8]:
# Mark the ranking DataFrame for caching. As the cell's last expression, the
# call's return value (the DataFrame itself) is echoed as its schema summary.
df.cache()

DataFrame[productid: string, uniqueuserviews: bigint, categoryid: string, rn: int]

In [18]:
# Print only the first five rows of the ranking.
df.show(n=5)

+-----------+---------------+----------+---+
|  productid|uniqueuserviews|categoryid| rn|
+-----------+---------------+----------+---+
|product-132|            166|category-1|  1|
|product-125|            157|category-1|  2|
| product-35|            156|category-1|  3|
| product-22|            155|category-1|  4|
| product-42|            155|category-1|  5|
+-----------+---------------+----------+---+
only showing top 5 rows



In [19]:
from pyspark.sql.functions import concat_ws

# Scratch experiment ("deneme" is Turkish for "trial"): join the category id and
# the in-category rank into one comma-separated string and preview five rows.
# The result is only displayed; `df` itself is not modified.
category_rank_key = concat_ws(",", col("categoryid"), col("rn"))
df.withColumn("deneme", category_rank_key).show(5)

+-----------+---------------+----------+---+------------+
|  productid|uniqueuserviews|categoryid| rn|      deneme|
+-----------+---------------+----------+---+------------+
|product-132|            166|category-1|  1|category-1,1|
|product-125|            157|category-1|  2|category-1,2|
| product-35|            156|category-1|  3|category-1,3|
| product-22|            155|category-1|  4|category-1,4|
| product-42|            155|category-1|  5|category-1,5|
+-----------+---------------+----------+---+------------+
only showing top 5 rows

