# Task I

* Suppose you get a new question with some tags and you want to find a list of relevant users who are likely to answer a question with these tags. Use the information about users that you saved in the Analytical-application-I notebook.
* Create a function that takes as input tags from the new question and returns a list of n relevant users.

In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, explode, count, struct, collect_list, array_sort, reverse, array, lit, desc, broadcast, slice
)

import os

In [55]:
# Spark entry point for this notebook; getOrCreate() hands back the already
# running session when there is one instead of starting a second.
spark = SparkSession.builder.appName('Analytical app').getOrCreate()

In [64]:
# Directory the notebook is executed from.
base_path = os.getcwd()

# The project root sits three directory levels above the working directory.
# Walking up with os.path (instead of splitting on a hard-coded '/') keeps
# this correct on any platform and always yields an absolute path, even when
# the working directory is fewer than three levels deep.
project_path = os.path.abspath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Transformed questions written by the earlier processing step.
questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

# Per-user tag statistics (read below into `users_with_tags`).
users_with_tag_output_path = os.path.join(project_path, 'output', 'users_with_tag')

# Raw answers dataset.
answers_input_path = os.path.join(project_path, 'data', 'answers')

In [57]:
#local_tags = answersDF.select('tags').take(1)[0]['tags']

In [58]:
# Hand-picked example tag set for a hypothetical new question.
local_tags = [
    'homework-and-exercises',
    'special-relativity',
    'field-theory',
    'lorentz-symmetry',
    'phase-space',
]

In [59]:
# Read the saved per-user tag statistics and cache them, since the DataFrame
# is reused by every ranking query further down.
users_with_tags = spark.read.option('path', users_with_tag_output_path).load().cache()

In [60]:
# Take five (question_id, tags) rows to act as sample "new" questions.
# NOTE(review): answersDF is only defined in a later cell, so this cell
# depends on notebook execution order - confirm that the DataFrame bound to
# answersDF at this point really exposes `question_id` and `tags`.
question = (
    answersDF
    .select('question_id', 'tags')
    .limit(5)
)

In [61]:
# Ad-hoc ranking of every known user for each sampled question.
#
# Each question is paired with each user (cross join against the broadcast
# user table). A user's relevancy for a question is the sum of the
# `frequency` values of those `tag_info` entries whose `tag` occurs in the
# question's tags.
(
    question
    .crossJoin(broadcast(users_with_tags))
    # The question's tag array is referenced as `b` inside the SQL lambda below.
    .withColumnRenamed('tags', 'b')
    .selectExpr(
        'question_id',
        'user_id',
        # Keep only the user's tag entries that also appear on the question.
        "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
    )
    .withColumn('tag_frequencies', col('new_tag.frequency'))
    .selectExpr(
        'question_id',
        'user_id',
        # Sum of the matching frequencies; an empty array yields 0.
        "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
    )
    # No DataFrame-level orderBy here: collect_list does not promise to keep
    # row order across the groupBy shuffle, so the ordering is established
    # solely by the array sort below and a global sort would be wasted work.
    .withColumn('users', struct('question_relevancy', 'user_id'))
    .groupBy('question_id')
    .agg(collect_list('users').alias('users'))
    # Structs compare field by field, so this orders by relevancy first and
    # user_id second; reversing puts the most relevant users first.
    .withColumn('users', reverse(array_sort('users')))
    .withColumn('users', col('users.user_id'))
).show(truncate=80)

+-----------+--------------------------------------------------------------------------------+
|question_id|                                                                           users|
+-----------+--------------------------------------------------------------------------------+
|      32954|[36793, 70392, 21146, 20564, 21487, 4521, 75628, 140141, 75085, 26686, 17338,...|
|      65848|[21146, 36793, 70392, 4521, 42337, 6764, 540, 8563, 23517, 4435, 75628, 37677...|
|     167813|[36793, 112190, 6764, 21146, 70392, 25664, 5841, 167720, 204345, 12615, 59234...|
|     121997|[36793, 70392, 20564, 26686, 21487, 17338, 140141, 164488, 12813, 24452, 7896...|
|     360185|[112190, 5841, 70392, 21146, 45613, 75085, 117629, 103052, 27753, 148704, 110...|
+-----------+--------------------------------------------------------------------------------+



In [62]:
def get_relevant_users(users_with_tags, question, n_users):
    """Return, for each question, the ids of the ``n_users`` most relevant users.

    A user's relevancy for a question is the sum of the ``frequency`` values
    of the user's ``tag_info`` entries whose ``tag`` appears in the question's
    ``tags`` array.

    Args:
        users_with_tags: DataFrame with a ``user_id`` column and a ``tag_info``
            array of structs carrying ``tag`` and ``frequency`` fields. It is
            broadcast to every executor for the cross join.
        question: DataFrame with ``question_id`` and ``tags`` (array) columns;
            it may hold one or several questions.
        n_users: Maximum number of user ids to keep per question.

    Returns:
        DataFrame with columns ``question_id`` and ``users``, where ``users``
        is an array of user ids ordered by descending relevancy (ties broken
        by descending ``user_id``), truncated to ``n_users`` entries.

    NOTE(review): because of the cross join, users sharing no tag with the
    question get relevancy 0 and can still fill the list when fewer than
    ``n_users`` users have a positive score - confirm whether they should be
    filtered out.
    """
    scored = (
        question
        .crossJoin(broadcast(users_with_tags))
        # The question's tag array is referenced as `b` inside the SQL lambda.
        .withColumnRenamed('tags', 'b')
        .selectExpr(
            'question_id',
            'user_id',
            # Keep only the user's tag entries that also appear on the question.
            "FILTER(tag_info, x -> array_contains(b, x.tag)) AS new_tag"
        )
        .withColumn('tag_frequencies', col('new_tag.frequency'))
        .selectExpr(
            'question_id',
            'user_id',
            # Sum of the matching frequencies; an empty array yields 0.
            "AGGREGATE(tag_frequencies, CAST(0 AS long), (value, buffer) -> value + buffer) AS question_relevancy"
        )
    )

    # No DataFrame-level orderBy before grouping: collect_list does not
    # promise to keep row order across the shuffle, so a global sort would be
    # wasted work. The ranking is produced by sorting the collected array.
    return (
        scored
        .withColumn('users', struct('question_relevancy', 'user_id'))
        .groupBy('question_id')
        .agg(collect_list('users').alias('users'))
        # Structs compare field by field (relevancy, then user_id); reversing
        # the ascending sort puts the most relevant users first.
        .withColumn('users', reverse(array_sort('users')))
        .withColumn('users', col('users.user_id'))
        .withColumn('users', slice('users', 1, n_users))
    )

In [63]:
# Demo: the four most relevant users for each of the sampled questions.
get_relevant_users(
    users_with_tags,
    question,
    4,
).show(truncate=70)

+-----------+----------------------------+
|question_id|                       users|
+-----------+----------------------------+
|      32954|[36793, 70392, 21146, 20564]|
|      65848| [21146, 36793, 70392, 4521]|
|     167813|[36793, 112190, 6764, 21146]|
|     121997|[36793, 70392, 20564, 26686]|
|     360185|[112190, 5841, 70392, 21146]|
+-----------+----------------------------+



In [65]:
# Raw answers dataset.
answersDF = spark.read.option('path', answers_input_path).load()

# Transformed questions dataset.
questionsDF = spark.read.option('path', questions_input_path).load()

In [76]:
# Sanity check: show the tags of question 32954, one row per answer that has
# a non-null author.
(
    questionsDF.alias('q')
    .join(answersDF.alias('a'), 'question_id')
    .filter(col('a.user_id').isNotNull())
    .filter(col('q.tags').isNotNull())
    # Filter on question_id while the column is still part of the row, i.e.
    # before the projection below narrows the output to the tags only.
    .filter(col('question_id') == 32954)
    .select('q.tags')
).show(truncate=100)

+------------------------------------------------------------------------------------------+
|                                                                                      tags|
+------------------------------------------------------------------------------------------+
|[quantum-mechanics, condensed-matter, solid-state-physics, electric-circuits, nanoscience]|
+------------------------------------------------------------------------------------------+



In [77]:
# Sanity check: how often each tag occurs on the questions answered by user
# 70392, most frequent tags first (top 30 rows printed).
(
    questionsDF.alias('q')
    .join(answersDF.alias('a'), 'question_id')
    .filter(col('a.user_id') == 70392)
    .select(explode('tags').alias('tag'))
    .groupBy('tag')
    .agg(count('*').alias('cnt'))
    .orderBy(col('cnt').desc())
).show(n=30, truncate=90)

+----------------------+---+
|                   tag|cnt|
+----------------------+---+
|     quantum-mechanics| 13|
|homework-and-exercises| 12|
|      electromagnetism| 11|
|                optics|  7|
|   newtonian-mechanics|  7|
|      condensed-matter|  6|
|    special-relativity|  6|
|      particle-physics|  5|
| statistical-mechanics|  5|
|       non-equilibrium|  5|
|           terminology|  4|
|  dimensional-analysis|  4|
|             operators|  4|
|                energy|  4|
|            definition|  4|
|        thermodynamics|  4|
|       inertial-frames|  3|
|      electric-current|  3|
|                 waves|  3|
|  quantum-field-theory|  3|
|         hilbert-space|  3|
|      reference-frames|  3|
|                  work|  3|
|                forces|  3|
|   quantum-information|  3|
|          gauge-theory|  2|
|       magnetic-fields|  2|
|             diffusion|  2|
|         path-integral|  2|
|             potential|  2|
+----------------------+---+
only showing t