# Higher Order Functions

In this notebook you will solve two tasks using higher-order functions (HOFs).

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, collect_list, expr, array_join

import os

In [None]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back an
# existing session when there is one, otherwise it starts a new session
# under the application name 'HOF I'.
spark = SparkSession.builder.appName('HOF I').getOrCreate()

# Task I

* Convert the question tags (stored as a single string in the JSON file) to an array using HOFs.

In [None]:
# Locate the project root as the directory three levels above the current
# working directory. os.path is used rather than splitting the path on '/'
# so the lookup does not depend on the platform's path separator, and so a
# shallow working directory resolves to the filesystem root instead of an
# empty string (which would silently make the paths below relative).
base_path = os.getcwd()

project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input for Task I: the raw questions stored as JSON.
questions_json_input_path = os.path.join(project_path, 'data', 'questions-json')

# Input for Task II: the transformed questions dataset.
questions_input_path = os.path.join(project_path, 'output', 'questions-transformed')

#### Read the data from JSON:

In [None]:
# Load the Task I input: read the JSON files found under
# questions_json_input_path into a DataFrame.
questionsDF = spark.read.format('json').load(questions_json_input_path)

#### Transform tags:

Hint:
* first split the string into an array
 * use split
* use TRANSFORM in sql expression
 * use regexp_replace on each element

In [None]:
# Stage 1: cut the tag string on every inner '><' boundary, which turns the
# 'tags' column into an array of strings.
tags_split_df = questionsDF.withColumn('tags', split(col('tags'), '><'))

# Stage 2: use the TRANSFORM higher-order function to strip any '<' or '>'
# still attached to an element (the split leaves them on the outer elements).
tags_cleaned_df = tags_split_df.selectExpr(
    '*',
    "TRANSFORM(tags, value -> regexp_replace(value, '(>|<)', '')) AS tags_arr"
)

# Stage 3: swap the cleaned array in under the original column name and
# preview the columns of interest.
(
    tags_cleaned_df
    .drop('tags')
    .withColumnRenamed('tags_arr', 'tags')
    .select('question_id', 'title', 'tags')
    .show(n=10, truncate=30)
)

# Task II

* For each user, concatenate the titles of the questions they answered into a single string.
* First do it using HOFs
* Second do it using native function array_join

In [None]:
# Load the Task II input from questions_input_path. No format is specified,
# so Spark's default data source is used. Note that this rebinds questionsDF,
# replacing the DataFrame that was read from JSON for Task I.
questionsDF = spark.read.load(questions_input_path)

#### Concat the titles:

Hint:
* collect the titles to an array for each user
 * use groupBy and collect_list
* use AGGREGATE in SQL expression to concat the array to a single string
* remove the first 3 characters (the leading ' - ' separator left by the aggregation) using substring

In [None]:
# Stage 1: gather every title of a user into one array column.
titles_per_user_hof = (
    questionsDF
    .groupBy(col('user_id'))
    .agg(collect_list(col('title')).alias('title'))
)

# Stage 2: fold the array into one string with the AGGREGATE higher-order
# function. Each step appends ' - ' followed by the next title, so a
# non-empty result starts with a stray ' - '.
folded_titles_df = titles_per_user_hof.selectExpr(
    '*',
    "AGGREGATE(title, cast('' AS string), (buffer, value) -> (concat(buffer, ' - ', value))) AS total_title"
)

# Stage 3: drop that leading separator (substring positions are 1-based, so
# starting at 4 skips the first three characters) and preview the result.
(
    folded_titles_df
    .withColumn('total_title', expr("substring(total_title, 4, length(total_title))"))
    .show(n=10, truncate=50)
)

#### Do the same using array_join:

In [None]:
# Stage 1: gather every title of a user into one array column.
titles_per_user_native = (
    questionsDF
    .groupBy(col('user_id'))
    .agg(collect_list(col('title')).alias('title'))
)

# Stage 2: let the built-in array_join glue the elements together with
# ' - ' between them, then preview only the concatenated string.
(
    titles_per_user_native
    .withColumn('total_title', array_join(col('title'), ' - '))
    .select('total_title')
    .show(n=10, truncate=90)
)

In [None]:
# Both tasks are done: stop the Spark session created at the top of the notebook.
spark.stop()