# Chapter 2 - A gentle introduction

You control your Spark applications through the `SparkSession`, which is responsible for executing user-defined code across the cluster.

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the session used by every cell below:
# master is "local" and the application name is "def-guide".
spark = (
    SparkSession.builder
    .master("local")
    .appName("def-guide")
    .getOrCreate()
)

In [2]:
# Generate a range of 1000 values and expose it as a DataFrame
# whose single column is called "number".
numberRange = spark.range(1000)
myRange = numberRange.toDF("number")
myRange

DataFrame[number: bigint]

In [3]:
# Keep only the rows whose "number" is even; the predicate is passed
# to where() as a SQL expression string.
evenPredicate = "number % 2 == 0"
divisBy2 = myRange.where(evenPredicate)
divisBy2

DataFrame[number: bigint]

In [4]:
# Each count() is evaluated immediately before its message is printed,
# so the order of work matches the order of the output lines.
totalRecords = myRange.count()
print("Executing an action on myRange: DF has {} records".format(totalRecords))

evenRecords = divisBy2.count()
print("Executing an action on divisBy2: DF has {} records".format(evenRecords))

Executing an action on myRange: DF has 1000 records
Executing an action on divisBy2: DF has 500 records


In [5]:
# Read the 2015 flight summary CSV with the "header" and "inferSchema"
# reader options both set to "true".
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("../data/flight-data/csv/2015-summary.csv")
)

# Peek at the first three rows.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [6]:
# Describe a sort on the "count" column, then print its physical plan.
sortedByCount = flightData2015.sort("count")
sortedByCount.explain()

== Physical Plan ==
*(2) Sort [count#29 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#29 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#27,ORIGIN_COUNTRY_NAME#28,count#29] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/andreu/Documents/Projects/Spark-The-Definitive-Guide/data/flight-da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [7]:
# Set spark.sql.shuffle.partitions to "5" on the running session
# before sorting, then fetch the first two rows of the sorted result.
shufflePartitions = "5"
spark.conf.set("spark.sql.shuffle.partitions", shufflePartitions)

flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

## Interfacing with SQL

In [8]:
# Register the DataFrame as a temporary view so the SQL cells below
# can refer to it by name.
flightViewName = "flight_data_2015"
flightData2015.createOrReplaceTempView(flightViewName)

In [13]:
# Count rows per destination country using SQL against the temp view,
# then print the physical plan of that query.
groupByDestinationQuery = (
    "SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME"
)
sqlWay = spark.sql(groupByDestinationQuery)
sqlWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#27] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/andreu/Documents/Projects/Spark-The-Definitive-Guide/data/flight-da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#27] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/andreu/Documents/Projects/Spark-The-Definitive-Guide/data/flight-da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAM

In [16]:
# The same per-destination row count, expressed with the DataFrame API;
# print its physical plan.
groupedByDestination = flightData2015.groupBy("DEST_COUNTRY_NAME")
dataFrameWay = groupedByDestination.count()
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#27] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/andreu/Documents/Projects/Spark-The-Definitive-Guide/data/flight-da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [14]:
# Largest value of the "count" column, computed through SQL.
maxCountQuery = "SELECT max(count) FROM flight_data_2015"
spark.sql(maxCountQuery).take(1)

[Row(max(count)=370002)]

In [15]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`: that form rebinds the name `max`
# at notebook scope and hides Python's built-in max() in every cell that
# runs afterwards.
from pyspark.sql import functions as F

# Largest value of the "count" column, computed through the DataFrame API.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [17]:
# describe() returns another DataFrame; displaying it shows only its schema.
flightSummary = flightData2015.describe()
flightSummary

DataFrame[summary: string, DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: string]

In [18]:
# Top five destination countries by summed "count", written in SQL.
topDestinationsQuery = """
SELECT DEST_COUNTRY_NAME, SUM(count) AS destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
maxSql = spark.sql(topDestinationsQuery)
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [25]:
from pyspark.sql.functions import desc

# Top five destination countries by summed "count", written with the
# DataFrame API: group, sum, rename the aggregate column, sort descending,
# and keep five rows.
topDestinations = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
)
topDestinations.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+

