# Task I

* Create a parquet dataset with the structure user_id, tag_info, where tag_info is an array of structs and each struct has two subfields: a tag and its frequency. The frequency measures how many times the user answered a question with this tag. Also sort the tags in the array by frequency in descending order.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, count, struct, collect_list, array_sort, reverse, array, lit, desc
)

import os

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name below.
spark = SparkSession.builder.appName('Analytical app').getOrCreate()

In [5]:
# Directory the notebook is executed from.
base_path = os.getcwd()

# The project root sits three directory levels above the working directory.
# os.path.dirname is used instead of splitting on '/' so the lookup works with
# any path separator and never collapses to an empty (relative) path when the
# working directory is shallow.
project_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))

# Input dataset read in Task I.
answers_input_path = os.path.join(project_path, 'output/questions-transformed')

# Location of the per-user tag statistics written in Task I and read in Task II.
users_with_tag_output_path = os.path.join(project_path, 'output/users_with_tag')

In [7]:
# Load the input dataset; no format is given, so the session's default data
# source is used.
answersDF = spark.read.option('path', answers_input_path).load()

In [8]:
# Build one row per user holding an array of (frequency, tag) structs.
users_with_tags = (
    answersDF
    # Rows without a user or without tags cannot contribute to the statistics.
    .where(col('user_id').isNotNull() & col('tags').isNotNull())
    # One row per (user, tag) occurrence.
    .select('user_id', explode('tags').alias('tag'))
    # Count how often each user appears with each tag.
    .groupBy('user_id', 'tag')
    .agg(count('*').alias('frequency'))
    # Gather the counts per user; 'frequency' is the first struct field so that
    # sorting the structs orders them by frequency before tag.
    .groupBy('user_id')
    .agg(collect_list(struct('frequency', 'tag')).alias('tag_info'))
    # Ascending sort followed by a reverse gives descending frequency order.
    .withColumn('tag_info', reverse(array_sort('tag_info')))
)

In [9]:
# Persist the per-user tag statistics in four output partitions, replacing any
# previous run. No format is set, so the session's default data source applies
# (parquet unless spark.sql.sources.default has been overridden).
users_with_tags.repartition(4).write.mode('overwrite').option(
    'path', users_with_tag_output_path
).save()

# Task II

* Suppose you get a new question with some tags and you want to find a list of relevant users who are likely to answer a question with these tags. Use the information about users that you saved above.
* Create a function that takes as input tags from the new question and returns a list of n relevant users.

In [None]:
#local_tags = answersDF.select('tags').take(1)[0]['tags']

In [None]:
# Tags of the hypothetical new question used to exercise the lookup.
new_question_tags = [
    'homework-and-exercises',
    'special-relativity',
    'field-theory',
    'lorentz-symmetry',
    'phase-space',
]

In [None]:
# Reload the per-user tag statistics from disk and cache them, since the
# DataFrame is queried more than once.
users_with_tags = spark.read.option('path', users_with_tag_output_path).load().cache()

In [None]:
# Exploratory version of the relevancy query: score every user against the
# tags of the new question and print the highest-scoring ones.
(
    users_with_tags
    # Attach the question's tags as a literal array column so the SQL lambda
    # below can reference it. `new_question_tags` is used here because
    # `local_tags` is never assigned in this notebook.
    .withColumn('b', array([lit(tag) for tag in new_question_tags]))
    .selectExpr(
        'user_id',
        # Keep only the tag_info entries whose tag belongs to the question.
        "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
    )
    # Project the struct array down to an array of frequencies.
    .withColumn('tag_frequencies', col('new_tag.frequency'))
    .selectExpr(
        'user_id',
        # Sum the matching frequencies, starting from a long-typed zero.
        "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
    )
    .orderBy(desc('question_relevancy'))
).show()

In [None]:
def get_relevant_users(users_with_tags, tags, n_users):
    """Return the users most relevant to a question carrying ``tags``.

    A user's relevancy is the sum of the ``frequency`` values of those
    ``tag_info`` entries whose ``tag`` is one of ``tags``.

    Args:
        users_with_tags: DataFrame with a ``user_id`` column and a
            ``tag_info`` column holding an array of structs that expose
            ``tag`` and ``frequency`` fields.
        tags: Tag names attached to the new question.
        n_users: Maximum number of users to return.

    Returns:
        DataFrame with the columns ``user_id`` and ``question_relevancy``,
        sorted by descending relevancy (ties broken by ``user_id``) and
        holding at most ``n_users`` rows. Users whose relevancy is not
        positive are left out, so fewer than ``n_users`` rows come back when
        not enough users match.
    """
    # Literal array column with the question's tags, referenced as `b` in SQL.
    question_tags = array([lit(tag) for tag in tags])

    relevancy = (
        users_with_tags
        .withColumn('b', question_tags)
        .selectExpr(
            'user_id',
            # Keep only the tag_info entries whose tag belongs to the question.
            "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
        )
        # Project the struct array down to an array of frequencies.
        .withColumn('tag_frequencies', col('new_tag.frequency'))
        .selectExpr(
            'user_id',
            # Sum the matching frequencies, starting from a long-typed zero.
            "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
        )
    )

    return (
        relevancy
        # Without this filter the result would be padded with users that share
        # no tag with the question whenever fewer than n_users users match.
        .filter(col('question_relevancy') > 0)
        # The secondary sort key makes the top-n selection reproducible when
        # several users have the same relevancy.
        .orderBy(desc('question_relevancy'), 'user_id')
        .limit(n_users)
    )

In [None]:
# Print up to ten users ranked for the sample question's tags.
get_relevant_users(
    users_with_tags=users_with_tags,
    tags=new_question_tags,
    n_users=10,
).show()