# Task I - naive recommendation system

* Suppose you get a new question with some tags and you want to find a list of relevant users who are likely to answer a question with these tags.  
* Use the information about users that you saved in the ETL-III notebook.
* Create a function that takes as input tags from the new question and returns a list of n relevant users.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, count, struct, collect_list, array_sort, reverse, array, lit, desc, broadcast, slice, sum, row_number
)

from pyspark.sql import Window
import os

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name 'Naive RS'.
spark = SparkSession.builder.appName('Naive RS').getOrCreate()

In [None]:
# Working directory of the running kernel.
base_path = os.getcwd()

# Strip the last three path components to get the project root.
# NOTE(review): assumes '/'-separated paths and that the notebook runs three
# levels below the project root - confirm against the repository layout.
project_path = '/'.join(base_path.split('/')[:-3])

# Input: transformed questions data set.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

# Input: per-user tag information.
users_with_tag_output_path = os.path.join(project_path, 'output/users_with_tag')

In [None]:
# local_tags = answersDF.select('tags').take(1)[0]['tags']

In [None]:
# Example tags of a "new question", used to try out the recommendation queries.
local_tags = [
    'homework-and-exercises',
    'special-relativity',
    'field-theory',
    'lorentz-symmetry',
    'phase-space',
]

In [None]:
# Per-user tag information; cached because the queries below read it
# repeatedly.
users_with_tags = (
    spark.read
    .option('path', users_with_tag_output_path)
    .load()
    .cache()
)

# Transformed questions (source of the question tags used in Task II).
questionsDF = spark.read.option('path', questions_input_path).load()

#### Find relevant users

Hint:
* Compute relevance score
* Add the local_tags as a new column to users_with_tags
* Compare the arrays using HOFs (keep only tags that are in the request)
* Sum up the frequencies to get the relevance score
* Sort by relevancy and keep n top users
* Collect them to an array and sort the array using array_sort and reverse

In [None]:
# Rank users by how often they answered questions carrying any of the tags in
# local_tags and show the user_ids of the ten best matches.
(
    users_with_tags
    # Attach the requested tags to every row so the SQL lambda can see them.
    .withColumn('b', array([lit(x) for x in local_tags]))
    .selectExpr(
        'user_id',
        # Keep only the tag_info entries whose tag was requested.
        "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
    )
    .withColumn('tag_frequencies', col('new_tag.frequency'))
    .selectExpr(
        'user_id',
        # Relevance = sum of the matching frequencies (0 when nothing matches).
        "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
    )
    # Break relevance ties on user_id so the rows kept by limit() are
    # deterministic and agree with the struct ordering applied below.
    .orderBy(desc('question_relevancy'), desc('user_id'))
    .withColumn('users', struct('question_relevancy', 'user_id'))
    .limit(10)
    .agg(collect_list('users').alias('users'))
    # collect_list does not guarantee element order, so sort explicitly:
    # structs compare field by field, i.e. by relevance and then user_id.
    .withColumn('users', reverse(array_sort('users')))
    .withColumn('users', col('users.user_id'))
).show(truncate=80)

#### Define a function

Hint:
* this function should 
    * take as input users_with_tags, tags, n_users
    * return a dataframe with n top users sorted by the relevance

In [None]:
def get_relevant_users(users_with_tags, tags, n_users):
    """Return the ``n_users`` users most relevant to a question's tags.

    A user's relevance is the sum of the ``frequency`` values of those
    ``tag_info`` entries whose ``tag`` appears in ``tags``. Users without any
    matching tag get a relevance of 0 and are not filtered out, so they can
    still be returned when fewer than ``n_users`` users match.

    Args:
        users_with_tags: DataFrame with a ``user_id`` column and a
            ``tag_info`` array of structs exposing ``tag`` and ``frequency``.
        tags: List of the tags of the new question.
        n_users: Maximum number of users to return.

    Returns:
        A single-row DataFrame with one column ``users``: an array of
        user_ids ordered by descending relevance, ties broken by descending
        user_id.
    """
    return (
        users_with_tags
        # Attach the requested tags to every row so the SQL lambda can see them.
        .withColumn('tags', array([lit(x) for x in tags]))
        .selectExpr(
            'user_id',
            # Keep only the tag_info entries whose tag was requested.
            "FILTER(tag_info, x -> array_contains(tags, x.tag)) AS new_tag"
        )
        .withColumn('tag_frequencies', col('new_tag.frequency'))
        .selectExpr(
            'user_id',
            # Relevance = sum of the matching frequencies (0 when nothing matches).
            "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
        )
        # Break relevance ties on user_id so the rows kept by limit() are
        # deterministic and agree with the struct ordering applied below.
        .orderBy(desc('question_relevancy'), desc('user_id'))
        .withColumn('users', struct('question_relevancy', 'user_id'))
        .limit(n_users)
        .agg(collect_list('users').alias('users'))
        # collect_list does not guarantee element order, so sort explicitly:
        # structs compare field by field, i.e. by relevance and then user_id.
        .withColumn('users', reverse(array_sort('users')))
        .withColumn('users', col('users.user_id'))
    )

In [None]:
# Try the function: the four most relevant users for the example tags.
get_relevant_users(
    users_with_tags=users_with_tags,
    tags=local_tags,
    n_users=4,
).show(truncate=90)

#### Task II

* Now assume that question tags are coming as a dataframe
* Modify the query so it efficiently finds users for each question

In [None]:
# Small sample of five questions (id and tags only) to find users for.
tagsDF = (
    questionsDF
    .select('question_id', 'tags')
    .limit(5)
)

In [None]:
# Print the sampled questions without truncating the tag arrays
# (n=20 is the default row limit of show()).
tagsDF.show(n=20, truncate=False)

#### Transform users_with_tags

* explode tag_info
* add a new column tag on which you will join

In [None]:
# One row per (user, tag): explode the tag_info array and lift the struct
# fields into top-level columns so the tag can serve as a join key.
# Cached because both techniques below join against it.
users_tag = (
    users_with_tags
    .withColumn('tag_info', explode(col('tag_info')))
    .withColumn('tag_freq', col('tag_info').getField('frequency'))
    .withColumn('tag', col('tag_info').getField('tag'))
    .cache()
)

In [None]:
# Peek at the first five exploded (user, tag) rows.
users_tag.show(5)

#### Get top n users

Hint:
* use two techniques:
    1. use row_number as a window function to filter only for n top users
    2. collect the users as structs with relevance and user_id, then use array_sort, reverse, and slice to get the top n

In [None]:
# First technique:
# rank the users of every question with row_number and keep the top n rows
# before collecting them into an array.

# Number of users to keep per question (also used by the second technique).
n = 5

(
    tagsDF
    # One row per (question, tag), matched with every user that has the tag.
    .withColumn('tag', explode('tags'))
    .join(users_tag, 'tag')
    # Relevance of a user for a question = sum of the matching tag frequencies.
    .groupBy('question_id', 'user_id')
    .agg(
        sum('tag_freq').alias('relevance')
    )
    # Break relevance ties on user_id: with relevance alone row_number picks
    # arbitrarily among tied users, so the result could change between runs
    # and differ from the second technique, which ranks on (relevance, user_id).
    .withColumn(
        'r',
        row_number().over(
            Window.partitionBy('question_id').orderBy(desc('relevance'), desc('user_id'))
        )
    )
    .filter(col('r') <= n)
    .groupBy('question_id')
    .agg(
        collect_list(struct('relevance', 'user_id')).alias('users')
    )
    # collect_list does not guarantee element order, so sort explicitly.
    .withColumn('users', reverse(array_sort('users')))
    .withColumn('users', col('users.user_id'))
    .orderBy('question_id')
).show(truncate=80)

In [None]:
# Second technique:
# collect every candidate user of a question into one array, sort it by
# (relevance, user_id) descending and cut it down to the first n entries.

(
    tagsDF
    .withColumn('tag', explode('tags'))
    .join(users_tag, on='tag')
    # Relevance of a user for a question = sum of the matching tag frequencies.
    .groupBy('question_id', 'user_id')
    .agg(sum('tag_freq').alias('relevance'))
    .groupBy('question_id')
    .agg(collect_list(struct('relevance', 'user_id')).alias('users'))
    # Ascending sort + reverse puts the most relevant users first.
    .withColumn('users', reverse(array_sort('users')))
    # Keep only the user_ids of the n best entries (slice is 1-based).
    .withColumn('users', slice(col('users.user_id'), 1, n))
    .orderBy('question_id')
).show(truncate=80)

In [None]:
# Shut down the Spark session now that all queries have run.
spark.stop()