# Chapter 5 Higher-Order Functions
Christoph Windheuser    
May, 2022   
Python examples of chapter 5 Higher-Order Functions (page 141 ff) in the book *Learning Spark*

In [1]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.types import *
from pyspark.sql.functions import *
# from pyspark.sql.functions import col, expr, when, concat, lit, avg, desc
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [None]:
# Only use when running a Jupyter Notebook, don't use it when starting with pyspark!
# Connect Jupyter Notebook with the Spark application and create Spark Context
findspark.init()
# getOrCreate() hands back a SparkContext that is already running instead of
# raising "Cannot run multiple SparkContexts at once", so this cell can be
# re-executed safely. The app name now carries the chapter of this notebook (5).
sc = SparkContext.getOrCreate(SparkConf().setAppName("chapter_5_HOF"))


In [2]:
# Create (or reuse) the SparkSession with Hive support enabled.
# The call chain is wrapped in parentheses, so no line-continuation backslashes
# are needed. The app name carries the chapter of this notebook (5).
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .config("spark.sql.catalogImplementation", "hive")
    .appName("Chapter_5_HOF")
    .getOrCreate()
)


# Define a DataFrame

In [3]:
# One column "celsius" that holds an array of integers per row
schema = StructType().add("celsius", ArrayType(IntegerType()))


In [4]:
# Two rows; each row is a one-element list whose only value is the array of
# Celsius readings for that row.
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)


In [5]:
# Build the DataFrame from the literal rows using the explicit schema
t_c = spark.createDataFrame(data=t_list, schema=schema)

In [6]:
# Register the DataFrame as temporary view "tC" so the SQL queries below can use it
t_c.createOrReplaceTempView("tC")

In [7]:
# Print the DataFrame (long arrays are truncated in the rendered table)
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



# Using the Higher-Order Function *transform()*
The `transform()` function produces an array by applying a function to each element of the input array.

In [8]:
# Convert every Celsius reading of each array to Fahrenheit (integer arithmetic)
fahrenheit_query = """
SELECT celsius,
transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit
FROM tC
"""
spark.sql(fahrenheit_query).show()


+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



# *filter()*
The function `filter()` produces an array consisting of only the elements of the input array for which the Boolean function is `true`.

In [9]:
# Keep only the readings above 38C from each array of temperatures
high_temperature_query = """
SELECT celsius,
filter(celsius, t -> t > 38) as high
FROM tC
"""
spark.sql(high_temperature_query).show()


+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



# *exists()*
The function `exists()` returns `true` if the boolean function holds for any element in the input array.

In [10]:
# Flag each row whose array contains a reading of exactly 38C
threshold_query = """
SELECT celsius,
       exists (celsius, t -> t = 38) as threshold
FROM tC
"""
spark.sql(threshold_query).show()


+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



# *aggregate()*
In the book, the function `reduce()` is used, but it throws an error. The function `aggregate()` provides the correct result.

In [11]:
# Calculate the average temperature of a row and convert it to Fahrenheit.
# aggregate(array, start, merge, finish): Spark passes the accumulator as the
# FIRST argument of the merge lambda and the array element as the second, so the
# lambda parameters are named (acc, t) to match.
# NOTE(review): `div` is integer division, so the average is truncated to whole
# degrees Celsius before the conversion; the result is an approximation.
spark.sql("""
SELECT celsius,
    aggregate(
        celsius,
        0,
        (acc, t) -> acc + t,
        acc -> (acc div size(celsius) * 9 div 5) + 32
        ) as avgFahrenheit
FROM tC
""").show()


+--------------------+-------------+
|             celsius|avgFahrenheit|
+--------------------+-------------+
|[35, 36, 32, 30, ...|           96|
|[31, 32, 34, 55, 56]|          105|
+--------------------+-------------+



# Common DataFrames and Spark SQL Operations
Page 144 ff

In [3]:
from pyspark.sql.functions import expr

In [4]:
# Relative locations of the two flight data sets. Adjacent string literals are
# concatenated by Python, which keeps each path on two readable lines.
tripdelaysFilePath = (
    "../DB_Spark/LearningSparkV2/databricks-datasets/"
    "learning-spark-v2/flights/departuredelays.csv"
)

airportsnaFilePath = (
    "../DB_Spark/LearningSparkV2/databricks-datasets/"
    "learning-spark-v2/flights/airport-codes-na.txt"
)


In [5]:
# Load the tab-separated airport codes file: the first line supplies the column
# names and the column types are inferred from the data.
airportsna = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", "\t")
    .load(airportsnaFilePath)
)


In [6]:
# Register the airports DataFrame as temporary view "airports_na" for SQL access
airportsna.createOrReplaceTempView("airports_na")


In [7]:
# Load the departure delays CSV; the first line supplies the column names.
# No schema inference is requested here.
departureDelays = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(tripdelaysFilePath)
)


In [8]:
# Cast delay and distance to INT.
# withColumn() already names the resulting column, so the SQL expression does not
# need an "AS <name>" alias of its own.
departureDelays = (departureDelays
                   .withColumn("delay", expr("CAST(delay AS INT)"))
                   .withColumn("distance", expr("CAST(distance AS INT)")))


In [9]:
# Register the departure delays DataFrame as temporary view "departureDelays"
departureDelays.createOrReplaceTempView("departureDelays")

In [10]:
# Small sample table: delayed SEA -> SFO flights whose date starts with '01010'
sea_to_sfo_delayed = expr("""origin == 'SEA' and destination == 'SFO' and
      date like '01010%' and delay > 0""")
foo = departureDelays.where(sea_to_sfo_delayed)

# Make the sample available to SQL as temporary view "foo"
foo.createOrReplaceTempView("foo")


In [11]:
# Preview the first 10 rows of the temporary view airports_na
spark.sql("SELECT * FROM airports_na LIMIT 10").show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
+-----------+-----+-------+----+



In [12]:
# Preview the first 10 rows of the temporary view departureDelays
spark.sql("SELECT * FROM departureDelays LIMIT 10").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+



In [13]:
# Show the rows of the temporary view foo (the filtered SEA -> SFO sample)
spark.sql("SELECT * FROM foo").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



## Unions
Page 147 ff

In [14]:
# Union the two tables "departureDelays" and "foo" into a new DataFrame "bar"
# and register it as temporary view "bar". The rows of "foo" were filtered out
# of "departureDelays", so they are present in both inputs of the union.
bar = departureDelays.union(foo)
bar.createOrReplaceTempView("bar")


In [15]:
# Apply the same criteria that produced "foo" (SEA -> SFO, date prefix '01010',
# positive delay) to the union. Every matching flight shows up twice because it
# is contained in both "departureDelays" and "foo".
sea_to_sfo_in_bar = expr("""origin == 'SEA' AND destination == 'SFO'
                    AND date LIKE '01010%' AND delay > 0""")
bar.where(sea_to_sfo_in_bar).show()


+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



## Joins
Page 148

By default, a Spark SQL join is an inner join, but other options are possible (see documentation).

The following code performs an inner join between the two DataFrames `airportsna` and `foo`.

In [16]:
# Inner join: attach the city and state of the origin airport to each row of
# foo by matching the airport's IATA code against the flight's origin.
origin_matches_iata = airportsna.IATA == foo.origin
(foo
 .join(airportsna, on=origin_matches_iata, how="inner")
 .select("City", "State", "date", "delay", "distance", "destination")
 .show())


+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



## Windowing

In [37]:
# Remove the table if it already exists, so the CREATE TABLE below can be re-run
spark.sql("DROP TABLE IF EXISTS departureDelaysWindow")

DataFrame[]

In [38]:
# Store the total delay per (origin, destination) pair for the origins SEA, SFO
# and JFK in the table departureDelaysWindow.
# The statement lives in a triple-quoted string, so no line-continuation
# backslashes are needed. The trailing ';' is dropped because spark.sql() takes a
# single statement and some Spark versions reject the terminator.
spark.sql("""
    CREATE TABLE departureDelaysWindow AS
    SELECT origin, destination, SUM(delay) AS TotalDelays
    FROM   departureDelays
    WHERE  origin IN('SEA','SFO','JFK')
    AND    destination IN('SEA','SFO','JFK','DEN','ORD','LAX','ATL')
    GROUP  BY origin, destination
""")

DataFrame[]

In [39]:
# Show the aggregated delays per origin/destination pair
spark.sql("SELECT * FROM departureDelaysWindow").show()

+------+-----------+-----------+
|origin|destination|TotalDelays|
+------+-----------+-----------+
|   JFK|        ORD|       5608|
|   JFK|        SFO|      35619|
|   JFK|        DEN|       4315|
|   JFK|        ATL|      12141|
|   JFK|        SEA|       7856|
|   JFK|        LAX|      35755|
|   SEA|        LAX|       9359|
|   SFO|        ORD|      27412|
|   SFO|        DEN|      18688|
|   SFO|        SEA|      17080|
|   SEA|        SFO|      22293|
|   SFO|        ATL|       5091|
|   SEA|        DEN|      13645|
|   SEA|        ATL|       4535|
|   SEA|        ORD|      10041|
|   SFO|        JFK|      24100|
|   SFO|        LAX|      40798|
|   SEA|        JFK|       4667|
+------+-----------+-----------+



Using the window function `dense_rank()` to calculate the three most delayed destinations per origin airport:

In [40]:
# Rank the destinations of each origin by total delay (highest first) and keep
# the rows ranked 1 to 3.
top_delays_query = """
SELECT origin, destination, TotalDelays, rank
FROM (
    SELECT origin, destination, TotalDelays, dense_rank()
    OVER (PARTITION BY origin ORDER BY TotalDelays DESC) as rank
    FROM departureDelaysWindow
    ) t
    WHERE rank <= 3
"""
spark.sql(top_delays_query).show()


+------+-----------+-----------+----+
|origin|destination|TotalDelays|rank|
+------+-----------+-----------+----+
|   JFK|        LAX|      35755|   1|
|   JFK|        SFO|      35619|   2|
|   JFK|        ATL|      12141|   3|
|   SEA|        SFO|      22293|   1|
|   SEA|        DEN|      13645|   2|
|   SEA|        ORD|      10041|   3|
|   SFO|        LAX|      40798|   1|
|   SFO|        ORD|      27412|   2|
|   SFO|        JFK|      24100|   3|
+------+-----------+-----------+----+

