In [1]:
import os

from pyspark.sql import SparkSession

# Spark Connect endpoint for this notebook. The SPARK_REMOTE environment
# variable, when set, takes precedence so the notebook can be pointed at a
# different cluster without editing code; otherwise the original address is used.
SPARK_REMOTE = os.environ.get("SPARK_REMOTE", "sc://192.168.2.20:15002")

# Reuses an existing session if one is already active, else creates one.
spark = SparkSession.builder.remote(SPARK_REMOTE).getOrCreate()

In [2]:
# Read the 2015 flight summary CSV. The first row is treated as the header
# and column types are inferred from the data instead of defaulting to string.
flightData2015 = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv("datasets/2015-summary.csv")
)

In [3]:
# Print the physical plan for a global sort on the `count` column.
# Nothing is executed here; explain() only describes the plan.
(
    flightData2015
    .sort("count")
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#20 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#20 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=27]
      +- FileScan csv [DEST_COUNTRY_NAME#18,ORIGIN_COUNTRY_NAME#19,count#20] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://node-master:9000/user/hadoop/datasets/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [4]:
# Use 5 shuffle partitions for subsequent queries; the range-partitioning
# exchange in the plan printed by this cell reflects the setting.
spark.conf.set("spark.sql.shuffle.partitions", "5")

sortedByCount = flightData2015.sort("count")
sortedByCount.explain()

# take(2) triggers execution and returns the first two rows as a list of
# Row objects (not a DataFrame, despite the variable name).
df2 = sortedByCount.take(2)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#48 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#48 ASC NULLS FIRST, 5), ENSURE_REQUIREMENTS, [plan_id=56]
      +- FileScan csv [DEST_COUNTRY_NAME#46,ORIGIN_COUNTRY_NAME#47,count#48] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://node-master:9000/user/hadoop/datasets/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [5]:
# Expose the DataFrame to SQL under the name `flight_data_2015`.
flightData2015.createOrReplaceTempView("flight_data_2015")

# Row count per destination country, expressed as SQL ...
destCountQuery = """
    SELECT DEST_COUNTRY_NAME, count(1)
    FROM flight_data_2015
    GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(destCountQuery)

# ... and the same aggregation expressed with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans so they can be compared side by side.
for plannedQuery in (sqlWay, dataFrameWay):
    plannedQuery.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#105], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#105, 5), ENSURE_REQUIREMENTS, [plan_id=129]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#105], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#105] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://node-master:9000/user/hadoop/datasets/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#143], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#143, 5), ENSURE_REQUIREMENTS, [plan_id=161]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#143], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#143] Batched: false, DataFilters: [], Format: CSV, Location: InMemor

In [6]:
# Largest value of the `count` column, computed with SQL against the
# `flight_data_2015` temp view. take(1) returns a one-element list of Row.
maxCountQuery = "SELECT max(count) from flight_data_2015"
spark.sql(maxCountQuery).take(1)

[Row(max(count)=370002)]

In [7]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`, which would shadow Python's
# built-in max() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

# Largest value of the `count` column via the DataFrame API; the resulting
# column is still named `max(count)`.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [8]:
# Top five destination countries by summed `count`, expressed in SQL
# against the `flight_data_2015` temp view.
topDestinationsQuery = """
    SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
    FROM flight_data_2015
    GROUP BY DEST_COUNTRY_NAME
    ORDER BY sum(count) DESC
    LIMIT 5
"""
maxSql = spark.sql(topDestinationsQuery)

maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [9]:
from pyspark.sql.functions import desc

# Sum `count` per destination country and give the aggregate a readable name.
destinationTotals = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
)

# Display the five destinations with the largest totals, biggest first.
destinationTotals.sort(desc("destination_total")).limit(5).show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [10]:
# Physical plan for the top-5 destinations query built with the DataFrame API.
topDestinationsDf = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
)
topDestinationsDf.explain()

# Physical plan for the same top-5 aggregation written as SQL.
topDestinationsSql = """
    SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
    FROM flight_data_2015
    GROUP BY DEST_COUNTRY_NAME
    ORDER BY sum(count) DESC
    LIMIT 5
"""
spark.sql(topDestinationsSql).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#369L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#358,destination_total#369L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#358], functions=[sum(count#360)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#358, 5), ENSURE_REQUIREMENTS, [plan_id=462]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#358], functions=[partial_sum(count#360)])
            +- FileScan csv [DEST_COUNTRY_NAME#358,count#360] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://node-master:9000/user/hadoop/datasets/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#385L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#105,destination_total#385L])
   +- HashAggregate(keys=[DEST_COUNT

In [12]:
# Five destination countries with the largest summed `count`, largest first.
# Relies on `desc` having been imported by an earlier cell.
dfToWrite = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
)

# Persist the result as JSON; "overwrite" mode replaces any output that
# already exists at the target path.
jsonWriter = dfToWrite.write.format("json").mode("overwrite")
jsonWriter.save("datasets/2015-summary.json")
