# Task I

* Suppose you get a new question with some tags and you want to find a list of relevant users who are likely to answer a question with these tags. Use the information about users that you saved in the Analytical-application-I notebook.
* Create a function that takes as input tags from the new question and returns a list of n relevant users.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, count, struct, collect_list, array_sort, reverse, array, lit, desc, broadcast, slice
)

import os

In [None]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-active session when there is one instead of starting a second one.
spark = SparkSession.builder.appName('Analytical app').getOrCreate()

In [None]:
# The project root is the current working directory with its last three
# '/'-separated components dropped.
base_path = os.getcwd()
project_path = '/'.join(base_path.split('/')[:-3])

# Dataset locations under the project's output folder.
answers_input_path = os.path.join(project_path, 'output/questions-transformed')
users_with_tag_output_path = os.path.join(project_path, 'output/users_with_tag')

In [None]:
#local_tags = answersDF.select('tags').take(1)[0]['tags']

In [None]:
# Sample list of question tags for manual experiments.
local_tags = [
    'homework-and-exercises',
    'special-relativity',
    'field-theory',
    'lorentz-symmetry',
    'phase-space',
]

In [None]:
# Per-user tag data read from users_with_tag_output_path using Spark's default
# data source (no explicit format is set). Cached because the cells below
# reuse it more than once.
users_with_tags = spark.read.option('path', users_with_tag_output_path).load().cache()

In [None]:
# Load the transformed questions from answers_input_path so this cell runs on a
# fresh kernel. The read relies on Spark's default data source, the same way
# users_with_tags is loaded.
# NOTE(review): assumes the dataset has `question_id` and `tags` columns, as
# the select below requires - confirm against the notebook that wrote it.
answersDF = (
    spark
    .read
    .option('path', answers_input_path)
    .load()
)

# Small sample of questions (id + tags) used to try out the ranking logic.
question = answersDF.select('question_id', 'tags').limit(5)

In [None]:
# Exploratory run of the ranking logic: for every sampled question, show all
# user ids ordered from the most to the least relevant user.
(
    question
    # Pair every question with every user; the user table is the broadcast side.
    .crossJoin(broadcast(users_with_tags))
    # Short alias so the SQL lambda below can refer to the question's tags.
    .withColumnRenamed('tags', 'b')
    .selectExpr(
        'question_id',
        'user_id',
        # Keep only the user's tag entries whose tag appears on the question.
        "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
    )
    # Project the struct array down to an array of the matching frequencies.
    .withColumn('tag_frequencies', col('new_tag.frequency'))
    .selectExpr(
        'question_id',
        'user_id',
        # Relevancy = sum of the matching tag frequencies. Spark passes the
        # running total first and the element second; the merge is a plain
        # addition, so the lambda's parameter names do not affect the result.
        "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
    )
    # No DataFrame-level sort here: collect_list after groupBy does not keep
    # row order, and the per-question ordering is produced by array_sort below.
    # Relevancy is the first struct field so that it is the primary sort key.
    .withColumn('users', struct('question_relevancy', 'user_id'))
    .groupBy('question_id')
    .agg(collect_list('users').alias('users'))
    # array_sort is ascending; reverse puts the most relevant user first.
    .withColumn('users', reverse(array_sort('users')))
    # Keep only the user ids, dropping the relevancy scores.
    .withColumn('users', col('users.user_id'))
).show(truncate=80)

In [None]:
def get_relevant_users(users_with_tags, question, n_users):
    """Return, per question, the ids of the users most relevant to its tags.

    A user's relevancy for a question is the sum of the ``frequency`` values
    of those entries in the user's ``tag_info`` whose ``tag`` is among the
    question's ``tags``. Every user is paired with every question, so users
    without any matching tag get relevancy 0 and can still appear when fewer
    than ``n_users`` users have a match.

    Args:
        users_with_tags: DataFrame with a ``user_id`` column and a
            ``tag_info`` column holding an array of structs that have
            ``tag`` and ``frequency`` fields. It is used as the broadcast
            side of the cross join.
        question: DataFrame with ``question_id`` and ``tags`` (array of tag
            names) columns.
        n_users: Maximum number of user ids to keep per question.

    Returns:
        A DataFrame with one row per ``question_id`` and a ``users`` column:
        an array of at most ``n_users`` user ids, most relevant first. Ties
        in relevancy are ordered by ``user_id`` descending, because the
        (relevancy, user_id) structs are sorted and then reversed.

    Raises:
        ValueError: If ``n_users`` is a negative integer.
    """
    # Reject a negative count up front, where the mistake is easy to trace.
    # Non-int values are passed through to `slice` untouched.
    if isinstance(n_users, int) and n_users < 0:
        raise ValueError(f'n_users must be non-negative, got {n_users}')

    return (
        question
        # Pair every question with every user.
        .crossJoin(broadcast(users_with_tags))
        # Short alias so the SQL lambda below can refer to the question's tags.
        .withColumnRenamed('tags', 'b')
        .selectExpr(
            'question_id',
            'user_id',
            # Keep only the user's tag entries whose tag is on the question.
            "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
        )
        # Project the struct array down to the matching frequencies.
        .withColumn('tag_frequencies', col('new_tag.frequency'))
        .selectExpr(
            'question_id',
            'user_id',
            # Relevancy = sum of the matching tag frequencies.
            "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
        )
        # No DataFrame-level sort here: collect_list after groupBy does not
        # keep row order, and array_sort below produces the final ordering.
        # Relevancy is the first struct field so it is the primary sort key.
        .withColumn('users', struct('question_relevancy', 'user_id'))
        .groupBy('question_id')
        .agg(collect_list('users').alias('users'))
        # array_sort is ascending; reverse puts the most relevant user first.
        .withColumn('users', reverse(array_sort('users')))
        # Keep only the user ids, then the first n_users of them
        # (slice positions are 1-based).
        .withColumn('users', col('users.user_id'))
        .withColumn('users', slice('users', 1, n_users))
    )

In [None]:
get_relevant_users(users_with_tags, question, 4).show(truncate=70)