In [1]:
import os
import pandas as pd
import numpy as np

# Create Spark context
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('dj').setMaster('local[*]')
sc = SparkContext(conf=conf)

# Create Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dj').getOrCreate()
spark
(sc.applicationId, sc.master, sc.version, sc.uiWebUrl, sc.defaultParallelism, sc.pythonVer, sc.appName, sc.sparkUser)

('local-1698603390093',
 'local[*]',
 '3.5.0',
 'http://USHYDJDAMODH6.us.deloitte.com:4041',
 16,
 '3.11',
 'dj',
 <bound method SparkContext.sparkUser of <SparkContext master=local[*] appName=dj>>)

In [2]:
import time

start_time = time.time()
departures_df = spark.read.csv("./AA_DFW_2017_Departures_Short.csv.gz", header=True)
# Add caching to the unique rows in departures_df
departures_df = departures_df.distinct().cache()

# Count the unique rows in departures_df, noting how long the operation takes
print("Counting %d rows took %f seconds" % (departures_df.count(), time.time() - start_time))

# Count the rows again, noting the variance in time of a cached DataFrame
start_time = time.time()
print("Counting %d rows again took %f seconds" % (departures_df.count(), time.time() - start_time))

Counting 139358 rows took 16.367500 seconds
Counting 139358 rows again took 0.214770 seconds


In [3]:
# Determine if departures_df is in the cache
print("Is departures_df cached?: %s" % departures_df.is_cached)
print("Removing departures_df from cache")

# Remove departures_df from the cache
departures_df.unpersist()

# Check the cache status again
print("Is departures_df cached?: %s" % departures_df.is_cached)

Is departures_df cached?: True
Removing departures_df from cache
Is departures_df cached?: False


In [4]:
# Import the full and split files into DataFrames
full_df = spark.read.csv('./AA_DFW_2014_Departures_Short.csv.gz')
split_df = spark.read.csv('./AA*gz')

# Print the count and run time for each DataFrame
start_time_a = time.time()
print("Total rows in full DataFrame:\t%d" % full_df.count())
print("Time to run: %f" % (time.time() - start_time_a))

start_time_b = time.time()
print("Total rows in split DataFrame:\t%d" % split_df.count())
print("Time to run: %f" % (time.time() - start_time_b))

Total rows in full DataFrame:	157199
Time to run: 0.276433
Total rows in split DataFrame:	583722
Time to run: 0.327415


In [5]:
# Name of the Spark application instance
app_name = spark.conf.get('spark.app.name')

# Driver TCP port
driver_tcp_port = spark.conf.get('spark.driver.port')

# Number of join partitions
num_partitions = spark.conf.get('spark.sql.shuffle.partitions')

# Show the results
print("Name: %s" % app_name)
print("Driver TCP port: %s" % driver_tcp_port)
print("Number of partitions: %s" % num_partitions)

Name: dj
Driver TCP port: 59470
Number of partitions: 200


In [6]:
# Store the number of partitions in variable
before = departures_df.rdd.getNumPartitions()

# Configure Spark to use 500 partitions
spark.conf.set('spark.sql.shuffle.partitions', 500)

# Recreate the DataFrame using the departures data file
departures_df = spark.read.csv('./AA_DFW_2015_Departures_Short.csv.gz').distinct()

# Print the number of partitions for each instance
print("Partition count before change: %d" % before)
print("Partition count after change: %d" % departures_df.rdd.getNumPartitions())

Partition count before change: 2
Partition count after change: 2


https://openflights.org/data.php

In [7]:
airports_df = spark.read.csv('./airports.csv', header=True)


In [8]:
# Join the flights_df and aiports_df DataFrames
normal_df = flights_df.join(
    airports_df,
    flights_df["Destination Airport"] == airports_df["IATA"]
)

# Show the query plan   
normal_df.explain()

NameError: name 'flights_df' is not defined

In [None]:
normal_df.show(3,False)

+-----------------+-------------+-------------------+---------------+----+-------------------------------------+-------------+-------------+----+----+-----------------+-----------+--------+--------+---+---------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|flight_duration|id  |Name                                 |City'        | 'Country    |IATA|ICAO|Latitude         |Longitude  |Altitude|Timezone|DST|Tz database time zone|
+-----------------+-------------+-------------------+---------------+----+-------------------------------------+-------------+-------------+----+----+-----------------+-----------+--------+--------+---+---------------------+
|01/01/2017       |0005         |HNL                |537            |3728|Daniel K Inouye International Airport|Honolulu     |United States|HNL |PHNL|21.32062         |-157.924228|13      |-10     |N  |Pacific/Honolulu     |
|01/01/2017       |0007         |OGG                |498            |3456|Kahului Airport           

In [None]:
# Import the broadcast method from pyspark.sql.functions
from pyspark.sql.functions import broadcast

# Join the flights_df and aiports_df DataFrames using broadcasting
broadcast_df = flights_df.join(
    broadcast(airports_df),
    flights_df["Destination Airport"] == airports_df["IATA"]
)

# Show the query plan and compare against the original
broadcast_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [Destination Airport#76], [IATA#548], Inner, BuildRight, false
   :- Union
   :  :- Project [Date (MM/DD/YYYY)#74, Flight Number#75, Destination Airport#76, Actual elapsed time (Minutes)#77 AS flight_duration#158]
   :  :  +- Filter isnotnull(Destination Airport#76)
   :  :     +- FileScan csv [Date (MM/DD/YYYY)#74,Flight Number#75,Destination Airport#76,Actual elapsed time (Minutes)#77] Batched: false, DataFilters: [isnotnull(Destination Airport#76)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/jdamodhar/Desktop/python_essential/DataCamp-ds-deloitte..., PartitionFilters: [], PushedFilters: [IsNotNull(Destination Airport)], ReadSchema: struct<Date (MM/DD/YYYY):string,Flight Number:string,Destination Airport:string,Actual elapsed ti...
   :  +- Project [Date (MM/DD/YYYY)#99, Flight Number#100, Destination Airport#101, Actual elapsed time (Minutes)#102 AS flight_duration#720]
   :     +- Fil

In [None]:
start_time = time.time()
# Count the number of rows in the normal DataFrame
normal_count = normal_df.count()
normal_duration = time.time() - start_time

start_time = time.time()
# Count the number of rows in the broadcast DataFrame
broadcast_count = broadcast_df.count()
broadcast_duration = time.time() - start_time

# Print the counts and the duration of the tests
print("Normal count:\t\t%d\tduration: %f" % (normal_count, normal_duration))
print("Broadcast count:\t%d\tduration: %f" % (broadcast_count, broadcast_duration))

Normal count:		279962	duration: 0.640231
Broadcast count:	279962	duration: 0.415035
