# Chapter 4 Higher-Order Functions
Christoph Windheuser    
May, 2022   
Python examples of chapter 4 Higher-Order Functions (page 141 ff) in the book *Learning Spark*

In [19]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit, avg, desc
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [2]:
# Connect Jupyter Notebook with the Spark application and create Spark Context
findspark.init()
sc = pyspark.SparkContext(appName="chapter_4_HOF")


In [5]:
#create a SparkSession

spark = (SparkSession \
         .builder \
         .enableHiveSupport() \
         .config("spark.sql.catalogImplementation","hive") \
         .appName("Chapter_4_HOF") \
         .getOrCreate())


# Define a DataFrame

In [7]:
schema = StructType([StructField("celsius", ArrayType(IntegerType()))])


In [8]:
t_list = [[35, 36, 32, 30, 40, 42, 38]], [[31, 32, 34, 55, 56]]


In [9]:
t_c = spark.createDataFrame(t_list, schema)

In [11]:
t_c.createOrReplaceTempView("tC")

In [12]:
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



# Using the Higher-Order Function *transform()*
The `transform()` function produces an array by applying a function to each element of the input array.

In [13]:
# Calculate Fahrenheit from Celsius for an array of temperatures
spark.sql("""
SELECT celsius,
transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
FROM tC
""").show()


+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



# *filter()*
The function `filter()` produces an array consisting of only the elements of the input array for which the Boolean funciton is `true`.

In [14]:
# Filter all temperatures > 38C for an array of temperatures

spark.sql("""
SELECT celsius,
filter(celsius, t -> t > 38) as high
FROM tC
""").show()


+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



# *exists()*
The function `exists()` returns `true` if the boolean function holds for any element in the input array.

In [15]:
spark.sql("""
SELECT celsius,
       exists (celsius, t -> t = 38) as threshold
FROM tC
""").show()


+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



# *reduce()*
The `reduce()` function reduces the elements of an array to a single value.

In [23]:
# Calculate the average temperature of a row and convert it to Fahrenheit
spark.sql("""
SELECT celsius,
reduce(
           clsius,
           0,
           (t, acc) -> t + acc,
           acc -> (acc div size(celsius) * 9 div 5) + 32
) as avgFahrenheit
FROM tC
""").show()


AnalysisException: Undefined function: 'reduce'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 3 pos 0

In [21]:
spark.sql("SELECT reduce(array(1, 2, 3), 0, (acc, x) -> acc + x)").show()


AnalysisException: Undefined function: 'reduce'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7