# Task I

* Create a parquet dataset with the structure `user_id`, `tag_info`, where `tag_info` is an array of structs and each struct has two subfields: a tag and its frequency.
    * The frequency measures how many times the user answered a question carrying this tag.
    * Also sort the tags in the array by frequency in descending order.
* This notebook prepares data for the analytical-app notebook.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, count, struct, collect_list, array_sort, reverse, array, lit, desc, broadcast, slice
)

import os

In [None]:
# Configure the application name first, then obtain the session
# (getOrCreate returns an already running session if there is one).
session_builder = SparkSession.builder.appName('ETL II')
spark = session_builder.getOrCreate()

In [None]:
# The notebook is expected to run from a directory three levels below the
# project root, so the root is found by walking up three parents.
base_path = os.getcwd()

# os.path.dirname is used instead of splitting on a hard-coded '/': it honours
# the platform's path separator and, for an absolute working directory, never
# collapses to an empty string (which would make every path below relative).
project_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))

# Input: questions already transformed by an earlier step.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

# Output: per-user tag statistics written by this notebook.
users_with_tag_output_path = os.path.join(project_path, 'output/users_with_tag')

# Input: answers dataset.
answers_input_path = os.path.join(project_path, 'data/answers')

In [None]:
# Both inputs are read with Spark's default data source; no format is set
# explicitly, so the session's default source applies.
answersDF = spark.read.load(answers_input_path)

questionsDF = spark.read.load(questions_input_path)

#### For each user, get the list of their tag_info

Hint:
* get this information from the questions the user answered
* join questions with answers to see the tags of the answered questions for each user
* explode the tags, then groupBy user and tag to get the frequency of each tag for each user
* use the struct function to pair each tag with its frequency
* groupBy user and use collect_list to build the tag_info array for each user
* use the functions array_sort and reverse to get the descending order

In [None]:
# Step 1: pair every answer with its question (join on question_id) and keep
# only rows that have both an answering user and a tag list.
answered_questions = (
    questionsDF.alias('q')
    .join(answersDF.alias('a'), 'question_id')
    .filter(col('a.user_id').isNotNull() & col('q.tags').isNotNull())
)

# Step 2: one row per (answering user, tag of the answered question).
user_tag_pairs = answered_questions.select(
    col('a.user_id').alias('user_id'),
    explode(col('q.tags')).alias('tag'),
)

# Step 3: how many times each user answered a question carrying each tag.
tag_frequencies = (
    user_tag_pairs
    .groupBy('user_id', 'tag')
    .agg(count('*').alias('frequency'))
)

# Step 4: pack (frequency, tag) into a struct. frequency is placed first so
# that sorting the array of structs orders primarily by frequency.
tag_structs = tag_frequencies.select(
    'user_id',
    struct('frequency', 'tag').alias('tag_info'),
)

# Step 5: collect the structs per user, sort them ascending and reverse the
# array to obtain descending order.
users_with_tags = (
    tag_structs
    .groupBy('user_id')
    .agg(collect_list('tag_info').alias('tag_info'))
    .withColumn('tag_info', reverse(array_sort(col('tag_info'))))
)

In [None]:
# Preview the first 5 rows, cutting cell contents at 80 characters.
users_with_tags.show(n=5, truncate=80)

In [None]:
# Row count of the result; after the final groupBy there is one row per user_id.
users_with_tags.count()

In [None]:
# Persist the result as 4 partitions, replacing any earlier output at the
# target path. No format is set, so Spark's default data source is used.
(
    users_with_tags
    .repartition(4)
    .write
    .mode('overwrite')
    .save(users_with_tag_output_path)
)

In [None]:
# Shut down the Spark session now that the output has been written.
spark.stop()