# Higher Order Functions

In this notebook you will solve two tasks using higher-order functions.

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, expr
)

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [None]:
# Obtain the Spark session used by every cell below; getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one
# registered under the application name 'HOF I'.
spark = SparkSession.builder.appName('HOF I').getOrCreate()

# Task I

* Convert the question tags (stored as a single string in the JSON file) into an array using higher-order functions (HOFs)

In [None]:
# Directory the notebook kernel is running in.
base_path = os.getcwd()

# The project root sits two directory levels above the working directory.
# os.path.dirname respects the platform's path separator and keeps the result
# absolute even when the working directory is fewer than two levels deep
# (splitting on '/' by hand would collapse to '' there, i.e. a relative path).
project_path = os.path.dirname(os.path.dirname(base_path))

# Input location read by the JSON loader in Task I.
data_input_path = os.path.join(project_path, 'data/questions-json')

<b>Read the data from JSON:</b>

In [None]:
# Load the questions dataset from the JSON source at data_input_path.
questionsDF = spark.read.load(data_input_path, format='json')

<b>Transform tags:</b>

Hint:
* first split the string into an array
 * use split
* use TRANSFORM in sql expression
 * use regexp_replace on each element

In [None]:
# Step 1: cut the raw tag string into pieces at every '><' boundary.
# The outermost '<' and '>' remain attached to the first and last element.
tags_split_df = questionsDF.withColumn('tags', split(col('tags'), '><'))

# Step 2: apply the TRANSFORM higher-order function to strip any remaining
# angle brackets from each element, then swap the cleaned array in under the
# original column name.
tags_clean_df = (
    tags_split_df
    .selectExpr('*', "TRANSFORM(tags, value -> regexp_replace(value, '(>|<)', '')) AS tags_arr")
    .drop('tags')
    .withColumnRenamed('tags_arr', 'tags')
)

# Preview the result; cell values longer than 7 characters are truncated.
tags_clean_df.show(truncate=7)

# Task II

* For each user, concatenate the titles of the questions they answered into a single string

In [None]:
# Task II reads the already-transformed questions; no format is specified, so
# Spark falls back to its configured default data source.
data_input_path = os.path.join(project_path, 'output/questions-transformed')

questionsDF = spark.read.load(data_input_path)

In [None]:
# Gather every title that belongs to a user into one array column.
titles_per_user_df = (
    questionsDF
    .groupBy('user_id')
    .agg(collect_list('title').alias('title'))
)

# Fold the array into a single string with the AGGREGATE higher-order function.
# Starting from '', every step appends ' - ' followed by the next title.
folded_titles_df = titles_per_user_df.selectExpr(
    '*',
    "AGGREGATE(title, cast('' AS string), (buffer, value) -> (concat(buffer, ' - ', value))) AS total_title"
)

# The fold leaves a leading ' - ' (three characters) in front of the first
# title, so keep only the text from position 4 onwards.
trimmed_total_title = expr("substring(total_title, 4, length(total_title))")

folded_titles_df.withColumn('total_title', trimmed_total_title).show(truncate=50)

In [None]:
# Shut down the Spark session now that both tasks have been run.
spark.stop()