In [1]:
"""
Requirement 2:
Sample the data. Starting from filtered_rdd (the sessions that meet the filter criteria), we pick about 100 records
out of them.

The method: compute each hour's share (ratio) of all sessions; the number of records to take from that hour is then 100 * ratio.

The only thing to take care of is that the records within each hour are picked at random.

Features:

    - Random sampling
    - PySpark broadcast variable: in production the map (hour -> indexes) could be big; if it is not broadcast, it
    will be duplicated for every partition, which can lead to performance and memory issues
    - flatMap to flatten the per-group iterables
"""

'\nRequirement2: \nSampling the data. This case, we get the filteredRDD which meet the criteria, and we pick 100 records\nout of them. \n\nThe method is: To calculate the ratio of every hours, then the amount of every hour = 100 * ratio.\n\nJust need to take care we need to pick them randomly\n\nFeature:\n\n    - Random Method\n    - Pyspark Broadcast Variable: in production, the map(hour -> index) could be big, if not broadcast it, there\n    will be duplicate in all the partition. Which lead to performance and memory issue potentially\n    - flatmap to deal with the iterable\n'

In [2]:
# Display the SparkContext. `sc` is never created in this notebook, so it is
# presumably injected by the PySpark kernel/shell — TODO confirm.
sc

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext
import uuid

# Build (or reuse) a Hive-enabled SparkSession.
# NOTE: Builder.config() ignores `key`/`value` whenever `conf` is passed in the
# same call, so the original single call silently dropped the metastore URI.
# The SparkConf is applied first and the explicit setting second so that the
# explicit setting wins.
sparkSession = (SparkSession
                    .builder
                    .appName('example-pyspark-read-and-write-from-hive')
                    .config(conf=SparkConf())
                    .config("hive.metastore.uris", "thrift://localhost:9083")
                    .enableHiveSupport()
                    .getOrCreate()
                    )

In [4]:
# Display the session built in the previous cell to confirm it was created.
sparkSession

In [5]:
# Read every row of `user_visit_action_table` and expose the result as an RDD of Rows.
user_action_rdd = (
    sparkSession
    .sql("select * from user_visit_action_table")
    .rdd
)

In [6]:
from utils.spark_utils import raw_data_to_filtered_full_info

# Turn the raw action rows into the per-session records used by the rest of the
# notebook; the preview further down shows `(session_id, 'key=value|...')` pairs.
# NOTE(review): the filtering rules live in utils.spark_utils and are not visible here.
filtered_rdd = raw_data_to_filtered_full_info(
    sparkSession,
    user_action_rdd,
    sc,
)

In [7]:
# operations
from utils.string_utils import get_value_from_session_user_aggr_info
from collections import defaultdict
import random
def get_start_time_to_info_rdd(x):
    """Re-key one ``(session_id, aggr_info)`` record by the hour its session started.

    The ``start_time`` field is read from the aggregated info string ``x[1]``
    (e.g. ``'2021-01-27 13:00:07'``) and reduced to ``'<date>_<hour>'``
    (``'2021-01-27_13'``). The hour is kept exactly as written, so
    ``'3:01:21'`` yields ``'3'`` rather than ``'03'``.

    Returns:
        tuple: ``(date_hour, aggr_info)``.
    """
    aggr_info = x[1]
    # Split "<date> <time>" once and reuse both halves.
    stamp_parts = get_value_from_session_user_aggr_info('start_time', aggr_info).split(' ')
    day_part = stamp_parts[0]
    hour_part = stamp_parts[1].split(':')[0]
    return ('_'.join((day_part, hour_part)), aggr_info)

# generate a list of index to be extracted
def generate_list_of_extract_index_by_map(hour_map, total):
    """Decide, for every hour bucket, which record positions should be extracted.

    Each hour receives a quota proportional to its share of all sessions
    (``count / sum(counts) * total``, rounded down); that many distinct random
    positions in ``0 .. count - 1`` are then drawn for the hour.

    Args:
        hour_map: Mapping ``date_hour -> number of sessions in that hour``.
        total: Target number of sessions to extract over all hours.

    Returns:
        dict: ``date_hour -> list of positions`` to extract within that hour.
        Because every quota is rounded down, the lists may add up to slightly
        fewer than ``total`` positions.
    """
    total_sessions = sum(hour_map.values())

    hour_to_extract_index = {}
    for key, count in hour_map.items():
        if total_sessions:
            # Integer arithmetic on purpose: the float form
            # int(count / total_sessions * total) truncates e.g. 29 / 100 * 100
            # (28.999999999999996) down to 28.
            quota = int(count * total // total_sessions)
        else:
            # Every bucket is empty: nothing to extract, and nothing to divide by.
            quota = 0
        # An hour can never contribute more sessions than it holds (this happens
        # when `total` exceeds the number of available sessions), nor fewer than 0.
        quota = max(0, min(quota, count))
        hour_to_extract_index[key] = generate_index_by_range_and_amount(count, quota)

    return hour_to_extract_index

def generate_index_by_range_and_amount(total_amount, extract_amount):
    """Pick ``extract_amount`` distinct random positions out of ``0 .. total_amount - 1``.

    Args:
        total_amount: Size of the population; valid positions run from ``0`` up
            to and including ``total_amount - 1``.
        extract_amount: How many positions to draw.

    Returns:
        list[int]: The chosen positions, without repeats, in random order.

    Raises:
        ValueError: If ``extract_amount`` is negative or larger than
            ``total_amount`` (raised by ``random.sample``).
    """
    # range() already excludes its stop value, so the stop must be total_amount
    # itself. The previous `total_amount - 1` made the last position impossible
    # to draw and turned "take every element" into a ValueError.
    return random.sample(range(total_amount), extract_amount)

    
# generate_index_by_range_and_amount(10,4)

In [8]:
# Preview a few records instead of collecting the whole RDD onto the driver.
filtered_rdd.take(3)

[('a57406b6603b11eb8e9facde48001122',
  'session_id=a57406b6603b11eb8e9facde48001122|search_keywords=Lamer,Lobsters,Facial Lotions,Vacuum,Mugs,Apple,Napkins,Machine Learning,Lenovo Laptops|click_categories=40,53,7,34,49,25,30,24,23,36,12,90,46,15,20,18,35|visit_length=3586|step_length=77|start_time=2021-01-27 13:00:07|age=32|professional=professional19|sex=Female|city=city28'),
 ('a5742178603b11eb8204acde48001122',
  'session_id=a5742178603b11eb8204acde48001122|search_keywords=Apple,Lenovo Laptops,Mugs,Facial Lotions,Vacuum|click_categories=51,95,34,48,7,80,88,19|visit_length=3234|step_length=33|start_time=2021-01-27 3:01:21|age=32|professional=professional19|sex=Female|city=city28'),
 ('a574fa80603b11ebb8bbacde48001122',
  'session_id=a574fa80603b11ebb8bbacde48001122|search_keywords=Facial Lotions,Lamer,Machine Learning,Vacuum,Mugs,Apple|click_categories=35,73,13,2,61,75,29,37,33,63|visit_length=3243|step_length=46|start_time=2021-01-27 1:00:04|age=32|professional=professional19|sex=F

In [10]:
from utils.data_model import user_action_sampling
def sampling_the_data(sparksession, task_uuid, filtered_rdd, sc, sample_size=100):
    """Randomly sample sessions, stratified by the hour in which they started.

    Sessions are bucketed by ``'<date>_<hour>'``. Each bucket contributes a
    number of sessions proportional to its share of all sessions, taken at
    random positions inside the bucket.

    Args:
        sparksession: Unused; kept so existing callers keep working.
        task_uuid: Identifier stored as ``task_id`` on every sampled record.
        filtered_rdd: RDD of ``(session_id, aggr_info)`` pairs, where
            ``aggr_info`` is the ``'key=value|key=value'`` session summary.
        sc: SparkContext used to broadcast the per-hour extraction indexes.
        sample_size: Target number of sessions to extract overall. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        RDD of ``user_action_sampling`` records, one per extracted session.
    """

    def flat_by_index(action_group):
        """Keep the sessions of one hour bucket whose position was drawn."""
        date_hour, actions = action_group
        # A set gives O(1) membership tests; a missing hour means "extract nothing".
        wanted_positions = set(hour_to_extract_index.value.get(date_hour, ()))

        flat_array = []
        # The position must advance for every session in the bucket, not only
        # for the ones that are kept, otherwise later positions are never reached.
        for index, action in enumerate(actions):
            if index not in wanted_positions:
                continue

            start_time = get_value_from_session_user_aggr_info('start_time', action)
            session_id = get_value_from_session_user_aggr_info('session_id', action)
            search_keywords = get_value_from_session_user_aggr_info('search_keywords', action)
            click_categories = get_value_from_session_user_aggr_info('click_categories', action)

            flat_array.append(
                user_action_sampling(task_uuid, session_id, start_time, search_keywords, click_categories)
            )

        return flat_array

    # Re-key every session by the hour it started in: (date_hour, aggr_info).
    start_time_to_info_rdd = filtered_rdd.map(get_start_time_to_info_rdd)

    # Number of sessions per hour bucket, computed on the driver.
    hour_count = start_time_to_info_rdd.countByKey()

    # Positions to extract per hour; broadcast so the map is shared by the
    # executors instead of being copied around with the closure.
    hour_to_extract_index = sc.broadcast(
        generate_list_of_extract_index_by_map(hour_count, sample_size)
    )

    # Group by hour (not by session id): the broadcast positions index into the
    # list of sessions of one hour, so they are meaningless for any other grouping.
    hour_to_group_actions = start_time_to_info_rdd.groupByKey()

    return hour_to_group_actions.flatMap(flat_by_index)
    
#     filtered_rdd.flatMap(lambda x : flat_by_index(x))
#     return hour_to_extract

# task_uuid = 
# sampling_the_data(sparkSession, '1', filtered_rdd, sc).collect()
# hour_to_extract

[user_action_sampling(task_id='1', session_id='a5555568603b11eba45aacde48001122', start_time='2021-01-27 18:00:23', search_keywords='Napkins,Lamer,Lobsters,Facial Lotions,Apple,Machine Learning,Lenovo Laptops,Mugs,Vacuum', click_categories='44,24,80,71,98,84,10,45,38,94,29,65,30,53,41,58,89'),
 user_action_sampling(task_id='1', session_id='a5e70f08603b11ebb45aacde48001122', start_time='2021-01-27 5:00:11', search_keywords='Huawei Cell Phone,Napkins,Lenovo Laptops,Vacuum,Lobsters,Facial Lotions', click_categories='11,14,47,56,19,75,84,26,81,69,3,36,32,88'),
 user_action_sampling(task_id='1', session_id='a5b25a2e603b11eb8cccacde48001122', start_time='2021-01-27 0:00:45', search_keywords='Lenovo Laptops,Facial Lotions,Machine Learning,Mugs,Lobsters,Huawei Cell Phone,Vacuum', click_categories='69,83,4,15,72,63'),
 user_action_sampling(task_id='1', session_id='a54e218a603b11eb8987acde48001122', start_time='2021-01-27 17:00:26', search_keywords='Huawei Cell Phone,Lobsters,Facial Lotions,Napk

In [14]:
import uuid
# One time-based UUID tags every record produced by this sampling run.
task_uuid = str(uuid.uuid1())
# Run the sampling job and bring the sampled records back to the driver.
sample_data_list = sampling_the_data(
    sparkSession, task_uuid, filtered_rdd, sc
).collect()

In [15]:
# Display the sampled records collected in the previous cell
# (a list of user_action_sampling rows, all carrying this run's task_id).
sample_data_list

[user_action_sampling(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a577e4b6603b11ebae4bacde48001122', start_time='2021-01-27 23:00:04', search_keywords='Lenovo Laptops,Mugs,Lobsters,Machine Learning,Napkins,Apple,Facial Lotions,Lamer,Vacuum', click_categories='2,74,17,46,86,47,98,3,22,78,28,89,14,10,48,95,20,44,90,77'),
 user_action_sampling(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a57ac794603b11eb8d73acde48001122', start_time='2021-01-27 23:04:58', search_keywords='Facial Lotions,Lamer,Lenovo Laptops,Vacuum,Machine Learning', click_categories='14,32,19,73,77,81,37,90,45,93'),
 user_action_sampling(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a56d4376603b11eb9c5eacde48001122', start_time='2021-01-27 1:00:44', search_keywords='Apple,Facial Lotions,Machine Learning,Lobsters', click_categories='65,78,49,87'),
 user_action_sampling(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a5528cc0603b11eb9364acde48001122', start_time='2021

In [18]:
from utils.spark_utils import insertHive
# Turn the driver-side list of sampled records back into a DataFrame.
sample_df = sparkSession.createDataFrame(sample_data_list)
# Store the sample in `session_sampling` (the table is read back in a later cell).
# NOTE(review): insertHive is a project helper; presumably it writes/appends to the
# Hive table — confirm whether re-running this cell duplicates rows.
insertHive(sparkSession, 'session_sampling', sample_df)

In [19]:
# to see the result

In [20]:
# Spot-check a few stored rows rather than collecting the whole table onto the driver.
sparkSession.sql("select * from session_sampling").take(5)

[Row(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a569313a603b11ebb4ecacde48001122', start_time='2021-01-27 1:01:09', search_keywords='Lenovo Laptops,Vacuum,Machine Learning,Huawei Cell Phone,Lamer,Facial Lotions,Mugs,Napkins,Lobsters', click_categories='27,67,37,87,80,94,82,29,70'),
 Row(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a5ae805c603b11eb8137acde48001122', start_time='2021-01-27 23:05:51', search_keywords='Lenovo Laptops,Facial Lotions', click_categories='48,77,36,47,13'),
 Row(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a59e2202603b11eba2d9acde48001122', start_time='2021-01-27 23:06:31', search_keywords='Napkins,Apple,Lobsters,Vacuum', click_categories='47,94,32,15'),
 Row(task_id='eed27d7e-6c84-11eb-8cdf-acde48001122', session_id='a5755890603b11eb9dfaacde48001122', start_time='2021-01-27 16:00:11', search_keywords='Vacuum,Huawei Cell Phone,Apple,Mugs,Napkins,Lenovo Laptops,Facial Lotions,Lamer,Machine Learning', click_categorie