# Chapter 2, Spark: The Definitive Guide: Big Data Processing Made Simple
Second half of chapter - flight data summary


In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession for this notebook: it connects to the
# master at spark://spark-master:7077 and requests 512m of memory per executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)


In [2]:
# Load the 2015 flight summary CSV, treating the first line as the header
# and letting Spark infer the column types.
flightData2015 = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("/opt/workspace/data/2015-summary.csv")
)

In [3]:
# Fetch the first three rows as a list of Row objects to sanity-check the load.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [4]:
# Print the physical plan for sorting by the count column (nothing is returned).
flightData2015.sort("count").explain()

== Physical Plan ==
*(1) Sort [count#18 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#18 ASC NULLS FIRST, 200), true, [id=#32]
   +- FileScan csv [DEST_COUNTRY_NAME#16,ORIGIN_COUNTRY_NAME#17,count#18] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/opt/workspace/data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




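Note that the *sort* operation cannot be done one partition at a time: rows have to be compared across all partitions, so the plan contains an `Exchange` step - a partition exchange, or "shuffle". The `200` in the `Exchange` line is Spark's default number of shuffle partitions.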

In [5]:
# Use 5 shuffle partitions instead of the 200 shown in the sort plan above.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [6]:
# Run the sort and fetch the two rows with the smallest count values.
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

Create a Table or View from a DataFrame:

In [7]:
# Register the DataFrame as the temporary view "flight_data_2015" so that
# it can be referenced by name in spark.sql(...) queries.
flightData2015.createOrReplaceTempView("flight_data_2015")

Use Spark SQL to query the view - `spark.sql` returns another DataFrame.

In [8]:
# SQL form: number of rows per destination country, returned as a DataFrame.
results = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

In [9]:
# Print the physical plan for the SQL version of the per-destination count.
results.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#16, 5), true, [id=#61]
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[partial_count(1)])
      +- FileScan csv [DEST_COUNTRY_NAME#16] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/opt/workspace/data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [10]:
# DataFrame-API form of the same per-destination row count.
dataframe_results = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [11]:
# Print the physical plan for the DataFrame version; it has the same shape
# as the plan printed for the SQL query.
dataframe_results.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#16, 5), true, [id=#80]
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[partial_count(1)])
      +- FileScan csv [DEST_COUNTRY_NAME#16] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/opt/workspace/data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




## Basic Data Analysis in SparkSQL ##

In [12]:
# Largest value in the count column of the 2015 data, i.e. the highest count
# for any single destination/origin pair (SQL form).
spark.sql("SELECT max(count) from flight_data_2015").take(1)

[Row(max(count)=370002)]

In [13]:
# PySpark / Python equivalent of the SQL max(count) query.
# Import the functions module under the conventional alias F instead of
# `from pyspark.sql.functions import max`, which would shadow Python's
# built-in max() for every later cell in the notebook.
from pyspark.sql import functions as F
# The aggregate yields exactly one row, so take(1) returns the full result
# (and matches the SQL cell).
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [14]:
# Find the top 5 destinations (SQL form): sum the count column per destination
# country, order by that total descending, and keep the first five rows.
top_5_dest = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

In [15]:
# Run the query and print the top-5 result as a text table.
top_5_dest.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [16]:
# Top 5 destinations again, this time expressed with the DataFrame API.
from pyspark.sql.functions import desc

(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



#### Show Execution Path
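Aggregation happens in two stages - the `partial_sum` and `sum` calls in the plan below. This works because addition is commutative* and associative, so Spark can first sum within each partition (`partial_sum`), exchange the partial results by destination, and then combine them into the final `sum`.

\* commutative - "condition that a group of quantities connected by operators gives the same result whatever the order of the quantities involved, e.g. a × b = b × a." Associative means the grouping of the operations does not matter either, e.g. (a + b) + c = a + (b + c).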


In [17]:
# Print the physical plan for the DataFrame top-5 query instead of running it.
# `desc` comes from the pyspark.sql.functions import in the previous code cell.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#104L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#16,destination_total#104L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[sum(cast(count#18 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#16, 5), true, [id=#227]
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[partial_sum(cast(count#18 as bigint))])
         +- FileScan csv [DEST_COUNTRY_NAME#16,count#18] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/opt/workspace/data/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


