In [1]:
"""
To get the top10 most active session for the top10 popular category
To decide this we use click count
To get this,
    1. prepare the data we need here
        - filtered action rdd
        - the top10 category
    
    2. group by session_id
    3. count
    4. sort
"""

'\nTo get the top10 most active session for the top10 popular category\nTo decide this we use click count\nTo get this,\n    1. prepare the data we need here\n        - filtered action rdd\n        - the top10 category\n    \n    2. group by session_id\n    3. count\n    4. sort\n'

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext
from utils.spark_utils import insertHive, raw_data_to_filtered_full_info
from utils.data_model import category_action_count, top10_category
import uuid

In [3]:
sparkSession = (SparkSession
                    .builder
                    .appName('example-pyspark-read-and-write-from-hive')
                    .config("hive.metastore.uris", "thrift://localhost:9083", conf=SparkConf())
                    .enableHiveSupport()
                    .getOrCreate()
                    )

In [4]:
task_id = uuid.uuid1().__str__()
task_id

'cf5559c0-6dac-11eb-9189-acde48001122'

In [9]:
def get_filtered_action_rdd(sparksession, user_action_rdd, sc):
    # pick out the session_id -> filtered action
    def map_full_action_to_action(x):
        (session_id, (action, full_info)) = x
        return (session_id, action)
    
    session_to_action_rdd = user_action_rdd.map(lambda x : (x.session_id, x))
    filtered_sessionid_to_full_info = raw_data_to_filtered_full_info(sparkSession, user_action_rdd, sc)
    session_id_filtered_action_rdd = session_to_action_rdd.join(filtered_sessionid_to_full_info)
    session_id_to_filtered_action_rdd = session_id_filtered_action_rdd.map(lambda x : map_full_action_to_action(x))
    return session_id_to_filtered_action_rdd

In [None]:
"""
Now:
(sessionid, group) -> (sessionid, how_many)
"""

In [22]:
# main func

def count_top_10_click_amount(x):
    session_id, action_group = x
    top_10_click_count = 0
    for action in action_group:
        if action.click_category_id in top10_categories:
            top_10_click_count = top_10_click_count + 1
    return (session_id, top_10_click_count)
    
# get the action table
user_action_rdd = sparkSession.sql("select * from user_visit_action_table").rdd

filtered_action_rdd = get_filtered_action_rdd(sparkSession, user_action_rdd, sc)

filtered_session_to_group_action_rdd = filtered_action_rdd.groupByKey()

top10_rdd = sparkSession.sql("select category_id from top10_categories").rdd.map(lambda x : x.category_id)

top10_categories = top10_rdd.collect()

filtered_session_top_click_count = filtered_session_to_group_action_rdd.map(lambda x : count_top_10_click_amount(x))

sorted_filtered_top10_click_count = filtered_session_top_click_count.sortBy(lambda x : x[1], ascending=False)

sorted_filtered_top10_click_count.collect()

[('a5555568603b11eba45aacde48001122', 7),
 ('a5a16494603b11eba17bacde48001122', 7),
 ('a567194c603b11eb81fdacde48001122', 7),
 ('a574511e603b11eb8d99acde48001122', 6),
 ('a5e0c9e2603b11eb983aacde48001122', 6),
 ('a561ff34603b11ebb263acde48001122', 6),
 ('a5b721c6603b11eb9565acde48001122', 6),
 ('a59dc302603b11eb92acacde48001122', 5),
 ('a5f1475c603b11ebb94cacde48001122', 5),
 ('a5a0aa0c603b11eb8c39acde48001122', 5),
 ('a5cab074603b11eba8c8acde48001122', 5),
 ('a5534926603b11eb9623acde48001122', 5),
 ('a5f03768603b11eb90b6acde48001122', 5),
 ('a55dd63e603b11eb86d8acde48001122', 5),
 ('a5ab2134603b11eb98dbacde48001122', 5),
 ('a5b8f034603b11eb96dbacde48001122', 5),
 ('a5678670603b11eb928bacde48001122', 5),
 ('a5a1033a603b11eb8e68acde48001122', 5),
 ('a5a99c0c603b11eb8e67acde48001122', 5),
 ('a5742d62603b11eb8287acde48001122', 5),
 ('a5b1ffa2603b11ebbcfeacde48001122', 5),
 ('a5da124c603b11eba979acde48001122', 5),
 ('a5777c38603b11eb8508acde48001122', 5),
 ('a5865dac603b11eba84aacde4800112

In [16]:
filtered_action_rdd.collect()

[('a553f092603b11eb997facde48001122',
  Row(date='2021-01-27', user_id=44, session_id='a553f092603b11eb997facde48001122', page_id=74, action_time='2021-01-27 23:57:10', search_keyword=None, click_category_id=9, click_product_id=13, order_category_ids=None, order_product_ids=None, pay_category_ids=None, pay_product_ids=None, city_id=10)),
 ('a553f092603b11eb997facde48001122',
  Row(date='2021-01-27', user_id=44, session_id='a553f092603b11eb997facde48001122', page_id=79, action_time='2021-01-27 23:24:03', search_keyword=None, click_category_id=-1, click_product_id=-1, order_category_ids='95', order_product_ids='17', pay_category_ids=None, pay_product_ids=None, city_id=1)),
 ('a553f092603b11eb997facde48001122',
  Row(date='2021-01-27', user_id=44, session_id='a553f092603b11eb997facde48001122', page_id=67, action_time='2021-01-27 23:38:15', search_keyword=None, click_category_id=95, click_product_id=36, order_category_ids=None, order_product_ids=None, pay_category_ids=None, pay_product_ids