#### Prepare Data

In [2]:
# Set file paths
tripdelaysFilePath = "/databricks-datasets/flights/departuredelays.csv"
airportsnaFilePath = "/databricks-datasets/flights/airport-codes-na.txt"

# All reads and queries in this cell go through the one `spark` session.
# Temp views are registered per session, so using a single entry point
# guarantees each view created here is visible to the queries that follow.
# NOTE(review): "com.databricks.spark.csv" is the legacy data source name;
# the built-in "csv" format should be equivalent on Spark 2.0+ - confirm
# before switching.

# Obtain airports dataset (tab-delimited, header row, inferred column types)
airportsna = (
    spark.read.format("com.databricks.spark.csv")
    .options(header='true', inferschema='true', delimiter='\t')
    .load(airportsnaFilePath)
)
airportsna.createOrReplaceTempView("airports_na")

# Obtain departure delays data.
# Schema inference is not requested here; `distance` and `delay` are cast
# explicitly when the `delays` DataFrame is built.
departureDelays = (
    spark.read.format("com.databricks.spark.csv")
    .options(header='true')
    .load(tripdelaysFilePath)
)
departureDelays.createOrReplaceTempView("departureDelays")
departureDelays.cache()

# Available IATA codes from the departuredelays sample dataset:
# every code that appears as either an origin or a destination, de-duplicated
tripIATA = spark.sql(
    "select distinct iata from ("
    "select distinct origin as iata from departureDelays "
    "union all "
    "select distinct destination as iata from departureDelays"
    ") a"
)
tripIATA.createOrReplaceTempView("tripIATA")

# Only include airports with at least one trip in the departureDelays dataset
airports = spark.sql(
    "select f.IATA, f.City, f.State, f.Country "
    "from airports_na f "
    "join tripIATA t on t.IATA = f.IATA"
)
airports.createOrReplaceTempView("airports")
airports.cache()

#### Correct DataFrame Schema
Convert `distance` and `delay` columns to `long` datatype

In [4]:
# Print out `departureDelays` schema.
# The CSV was loaded without the `inferschema` option, so inspect the column
# types here before casting `distance` and `delay` in the next step.
departureDelays.printSchema()

In [5]:
# Build the `delays` DataFrame: keep `date`, `origin` and `destination` as they
# are, and cast `distance` and `delay` to `long` (aliased back to their
# original column names). Register the result as the `delays` temp view.
delays = departureDelays.select(
    'date',
    'origin',
    'destination',
    departureDelays['distance'].cast('long').alias('distance'),
    departureDelays['delay'].cast('long').alias('delay'),
)
delays.createOrReplaceTempView("delays")

In [6]:
# Validate the schema of `delays`.
#    `distance` and `delay` should now be reported as `long`, since both were
#    cast with `.cast('long')` when `delays` was created.
delays.printSchema()

#### Use sort_array and collect_set
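Use `collect_set` to gather the distinct `delay` values of a route into an array, and `sort_array` to return that array in ascending order.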

In [8]:
# Build `df` for the SEA -> SFO route, restricted to rows whose `date` string
# starts with '010107'. The filter pins a single origin/destination pair and the
# query groups by that pair, so the result holds at most one row. `collected` is
# the set of distinct `delay` values for the group, returned as a sorted array.
df = spark.sql(
    "select origin, destination, sort_array(collect_set(delay)) as collected "
    "from delays "
    "where origin = 'SEA' and destination = 'SFO' and date like '010107%' "
    "group by origin, destination"
)

In [9]:
# Inspect the schema of `df`.
# Note the `collected` array holds `long` elements: it is built from the
# `delay` column of `delays`, which was cast to `long`.
df.printSchema()

In [10]:
# Display the DataFrame. Because the elements of `collected` are `long`,
# `sort_array` orders them by integer value.
df.show()