# Chapter 4
Christoph Windheuser    
April 15, 2022   
Python examples of chapter 4 in the book *Learning Spark*



In [9]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit, avg, desc
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [2]:
# Connect Jupyter Notebook with the Spark application and create Spark Context
# findspark.init() locates the local Spark installation for this kernel.
findspark.init()
# Use getOrCreate instead of the SparkContext constructor: the constructor raises
# "Cannot run multiple SparkContexts at once" when this cell is executed a second
# time, whereas getOrCreate returns the already-running context in that case.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("chapter_4"))


In [3]:
# Create the SparkSession used by all following cells.
# getOrCreate() hands back an existing session if one is already active.
session_builder = SparkSession.builder.appName("Chapter_4_Examples")
spark = session_builder.getOrCreate()


In [4]:
# Location of the flight departure delay data set (relative to the notebook).
csv_file = "data/departuredelays.csv"

# Configure a CSV reader: take column names from the header row and let
# Spark infer the column types from the data.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", "true")
csv_reader = csv_reader.option("inferSchema", "true")
df = csv_reader.load(csv_file)

# Expose the DataFrame to spark.sql() under the name used by the queries below.
df.createOrReplaceTempView("us_delay_flights_tbl")


In [5]:
# Quick sanity check: print the first five rows of the loaded data.
df.show(n=5)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 5 rows



Show flights with a distance of more than 1000 miles and order the results by distance in descending order. Show the first 10 results of this list:

In [6]:
# Flights longer than 1000 miles, longest first, queried through the temp view.
long_distance_query = """SELECT distance, origin, destination
          FROM us_delay_flights_tbl WHERE distance > 1000
          ORDER BY distance DESC"""
spark.sql(long_distance_query).show(n=10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



Instead of spark.sql, the same query can be executed with the DataFrame API and shows the same result:

In [16]:
# DataFrame API version of the long-distance query: keep flights above
# 1000 miles and sort them by distance, longest first.
(
    df.select("distance", "origin", "destination")
    .where(col("distance") > 1000)
    .orderBy(desc("distance"))
    .show(10)
)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



Find all flights from San Francisco (SFO) to Chicago (ORD) that were delayed by more than two hours (120 minutes):

In [7]:
# SFO -> ORD flights with a delay above 120, largest delay first.
sfo_ord_delay_query = """SELECT date, delay, origin, destination
          FROM us_delay_flights_tbl
          WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
          ORDER by delay DESC"""
spark.sql(sfo_ord_delay_query).show(n=10)

+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



Label the flights based on the delays they have experienced. Add a new, human-readable column called 'Flight_Delays' containing these labels to the query result:

In [8]:
# Attach a human-readable delay label to every flight.
# CASE branches are evaluated top to bottom and the first match wins, so each
# branch only needs its lower bound. Bounding both ends with strict < and >
# would leave delays of exactly 60, 120 and 360 unmatched, and they would be
# labelled 'Early' by the ELSE branch.
# Resulting ranges: >360 very long, 121-360 long, 61-120 short, 1-60 tolerable,
# 0 no delay, everything else (negative delay) early.
spark.sql("""SELECT delay, origin, destination,
          CASE
              WHEN delay > 360 THEN 'Very Long Delays'
              WHEN delay > 120 THEN 'Long Delay'
              WHEN delay > 60 THEN 'Short Delay'
              WHEN delay > 0 THEN 'Tolerable Delay'
              WHEN delay = 0 THEN 'No Delay'
              ELSE 'Early'
          END AS Flight_Delays
          FROM us_delay_flights_tbl
          ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|   Long Delay|
|  305|   ABE|        ATL|   Long Delay|
|  275|   ABE|        ATL|   Long Delay|
|  257|   ABE|        ATL|   Long Delay|
|  247|   ABE|        ATL|   Long Delay|
|  247|   ABE|        DTW|   Long Delay|
|  219|   ABE|        ORD|   Long Delay|
|  211|   ABE|        ATL|   Long Delay|
|  197|   ABE|        DTW|   Long Delay|
|  192|   ABE|        ORD|   Long Delay|
+-----+------+-----------+-------------+
only showing top 10 rows

