# Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, row_number
from pyspark.sql.window import Window

spark = (SparkSession.builder.appName("cs544")
         .master("spark://main:7077")
         .config("spark.executor.memory", "512M")
         .config("spark.sql.warehouse.dir", "hdfs://main:9000/user/hive/warehouse")
         .enableHiveSupport()
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/27 13:54:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# ! wget https://pages.cs.wisc.edu/~harter/cs639/data/sf.zip
# ! unzip sf.zip

In [3]:
!hdfs dfs -cp sf.csv hdfs://main:9000/sf.csv

In [4]:
df = (spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("hdfs://main:9000/sf.csv"))
columns = [col(c).alias(c.replace(" ", "")) for c in df.columns]
df.select(columns).write.format("parquet").mode("overwrite").save("hdfs://main:9000/sf.parquet")

23/03/27 13:55:48 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
df = spark.read.format("parquet").load("hdfs://main:9000/sf.parquet")
df.createOrReplaceTempView("calls")

# Demos

### Windowing

In [6]:
# what are the 3 biggest call types, in each call type group
spark.sql("""
SELECT CallTypeGroup, CallType, COUNT(*) as count
FROM calls
GROUP BY CallTypeGroup, CallType
""").toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count
0,,Elevator / Escalator Rescue,7062
1,,Marine Fire,222
2,,Aircraft Emergency,1263
3,,Confined Space / Structure Collapse,286
4,,Administrative,306
...,...,...,...
73,Alarm,Vehicle Fire,13
74,Fire,Administrative,1
75,Alarm,Oil Spill,5
76,Alarm,Aircraft Emergency,63


In [7]:
spark.sql("""
SELECT CallTypeGroup, CallType, count,
       row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC)
FROM
(
    SELECT CallTypeGroup, CallType, COUNT(*) as count
    FROM calls
    GROUP BY CallTypeGroup, CallType
)
""").toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count,row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
0,,Medical Incident,1783402,1
1,,Structure Fire,416250,2
2,,Alarms,298422,3
3,,Traffic Collision,107136,4
4,,Other,50967,5
...,...,...,...,...
73,Potentially Life-Threatening,Medical Incident,1448636,1
74,Potentially Life-Threatening,Traffic Collision,86830,2
75,Potentially Life-Threatening,Other,1763,3
76,Potentially Life-Threatening,Water Rescue,93,4


In [8]:
spark.sql("""
SELECT CallTypeGroup, CallType, count,
       row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC) AS num
FROM
(
    SELECT CallTypeGroup, CallType, COUNT(*) as count
    FROM calls
    GROUP BY CallTypeGroup, CallType
)
""").where("num <= 3").toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count,num
0,,Medical Incident,1783402,1
1,,Structure Fire,416250,2
2,,Alarms,298422,3
3,Alarm,Alarms,366116,1
4,Alarm,Structure Fire,245241,2
5,Alarm,Citizen Assist / Service Call,43403,3
6,Fire,Structure Fire,51061,1
7,Fire,Outside Fire,40752,2
8,Fire,Water Rescue,22007,3
9,Non Life-threatening,Medical Incident,716653,1


### Joining

In [9]:
! wget https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv
! hdfs dfs -cp holidays2.csv hdfs://main:9000/
holidays = (spark.read.format("csv").option("header", True)
            .load("hdfs://main:9000/holidays2.csv"))

--2023-03-27 13:59:19--  https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3495 (3.4K) [text/plain]
Saving to: ‘holidays2.csv.2’


2023-03-27 13:59:20 (41.5 MB/s) - ‘holidays2.csv.2’ saved [3495/3495]



In [10]:
holidays.toPandas()

Unnamed: 0,date,holiday
0,01/01/2013,New Year's Day
1,01/01/2014,New Year's Day
2,01/01/2015,New Year's Day
3,01/01/2016,New Year's Day
4,01/01/2018,New Year's Day
...,...,...
117,12/25/2020,Christmas Day
118,12/26/2011,Christmas Day
119,12/26/2016,Christmas Day
120,12/26/2022,Christmas Day


In [11]:
calls = spark.table("calls")

In [12]:
calls["CallDate"] == holidays["date"]

Column<'(CallDate = date)'>

In [13]:
# can we associate a holiday with a call?
# BY DEFAULT: inner join
calls.join(holidays, on=calls["CallDate"] == holidays["date"]).limit(10).toPandas()

                                                                                

Unnamed: 0,CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,...,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,date,holiday
0,201461772,94,20060706,Medical Incident,05/25/2020,05/25/2020,05/25/2020 01:34:55 PM,05/25/2020 01:34:55 PM,05/25/2020 01:36:14 PM,05/25/2020 01:36:19 PM,...,MEDIC,1,10,10,Bayview Hunters Point,201461772-94,POINT (-122.38626545834089 37.73782577837169),1,05/25/2020,Memorial Day
1,201463874,E32,20060913,Medical Incident,05/25/2020,05/25/2020,05/25/2020 11:16:21 PM,05/25/2020 11:17:54 PM,05/25/2020 11:18:27 PM,05/25/2020 11:19:53 PM,...,ENGINE,1,9,11,Excelsior,201463874-E32,POINT (-122.42955213038633 37.729273863971024),7,05/25/2020,Memorial Day
2,201463019,T18,20060826,Medical Incident,05/25/2020,05/25/2020,05/25/2020 06:39:44 PM,05/25/2020 06:41:34 PM,05/25/2020 06:55:28 PM,05/25/2020 06:57:32 PM,...,TRUCK,2,8,7,Inner Sunset,201463019-T18,POINT (-122.46829612745225 37.751748874988515),14,05/25/2020,Memorial Day
3,201460900,E11,20060611,Medical Incident,05/25/2020,05/25/2020,05/25/2020 09:22:49 AM,05/25/2020 09:24:51 AM,05/25/2020 09:25:17 AM,05/25/2020 09:26:11 AM,...,ENGINE,1,6,8,Mission,201460900-E11,POINT (-122.42413130758577 37.752703146369065),20,05/25/2020,Memorial Day
4,201463227,B03,20060859,Alarms,05/25/2020,05/25/2020,05/25/2020 07:40:19 PM,05/25/2020 07:41:33 PM,05/25/2020 07:41:44 PM,05/25/2020 07:42:33 PM,...,CHIEF,2,3,6,Financial District/South Beach,201463227-B03,POINT (-122.40169229261626 37.78658885907634),8,05/25/2020,Memorial Day
5,201460714,E26,20060590,Medical Incident,05/25/2020,05/25/2020,05/25/2020 08:23:14 AM,05/25/2020 08:23:14 AM,05/25/2020 08:31:53 AM,05/25/2020 08:34:00 AM,...,ENGINE,2,6,8,Glen Park,201460714-E26,POINT (-122.4396375225712 37.73654095477352),10,05/25/2020,Memorial Day
6,201463595,RC1,20060892,Medical Incident,05/25/2020,05/25/2020,05/25/2020 09:24:50 PM,05/25/2020 09:26:27 PM,05/25/2020 09:27:04 PM,05/25/2020 09:27:44 PM,...,RESCUE CAPTAIN,1,3,6,Financial District/South Beach,201463595-RC1,POINT (-122.39638047787408 37.787304005252835),8,05/25/2020,Memorial Day
7,201463264,54,20060863,Medical Incident,05/25/2020,05/25/2020,05/25/2020 07:52:29 PM,05/25/2020 07:54:40 PM,05/25/2020 07:54:56 PM,05/25/2020 07:55:02 PM,...,MEDIC,3,10,10,Potrero Hill,201463264-54,POINT (-122.39724892486004 37.759967407690915),26,05/25/2020,Memorial Day
8,201463403,56,20060881,Medical Incident,05/25/2020,05/25/2020,05/25/2020 08:35:42 PM,05/25/2020 08:36:38 PM,05/25/2020 08:36:55 PM,05/25/2020 08:37:01 PM,...,MEDIC,2,1,3,Nob Hill,201463403-56,POINT (-122.4083891465028 37.79026526175803),21,05/25/2020,Memorial Day
9,201463254,63,20060862,Medical Incident,05/25/2020,05/25/2020,05/25/2020 07:52:16 PM,05/25/2020 07:52:16 PM,05/25/2020 07:52:43 PM,05/25/2020 07:52:47 PM,...,MEDIC,2,3,10,Mission Bay,201463254-63,POINT (-122.39549294787926 37.76681685424107),4,05/25/2020,Memorial Day


In [14]:
both = calls.join(holidays, on=calls["CallDate"] == holidays["date"], how="inner")
both.limit(3).toPandas()

Unnamed: 0,CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,...,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,date,holiday
0,201461772,94,20060706,Medical Incident,05/25/2020,05/25/2020,05/25/2020 01:34:55 PM,05/25/2020 01:34:55 PM,05/25/2020 01:36:14 PM,05/25/2020 01:36:19 PM,...,MEDIC,1,10,10,Bayview Hunters Point,201461772-94,POINT (-122.38626545834089 37.73782577837169),1,05/25/2020,Memorial Day
1,201463874,E32,20060913,Medical Incident,05/25/2020,05/25/2020,05/25/2020 11:16:21 PM,05/25/2020 11:17:54 PM,05/25/2020 11:18:27 PM,05/25/2020 11:19:53 PM,...,ENGINE,1,9,11,Excelsior,201463874-E32,POINT (-122.42955213038633 37.729273863971024),7,05/25/2020,Memorial Day
2,201463019,T18,20060826,Medical Incident,05/25/2020,05/25/2020,05/25/2020 06:39:44 PM,05/25/2020 06:41:34 PM,05/25/2020 06:55:28 PM,05/25/2020 06:57:32 PM,...,TRUCK,2,8,7,Inner Sunset,201463019-T18,POINT (-122.46829612745225 37.751748874988515),14,05/25/2020,Memorial Day


In [15]:
# how many calls occured on each type of holiday
both.groupby("holiday").count().toPandas()

                                                                                

Unnamed: 0,holiday,count
0,Thanksgiving Day,8186
1,"Birthday of Martin Luther King, Jr.",9557
2,Veterans Day,8674
3,Independence Day,10457
4,Columbus Day,9973
5,Memorial Day,8830
6,Christmas Day,8179
7,Labor Day,9778
8,New Year's Day,9966
9,Washington's Birthday,9064


In [16]:
# what percent of calls occured on a holiday?

In [17]:
calls2 = calls.join(holidays, on=calls["CallDate"] == holidays["date"], how="left")

In [18]:
calls2.agg(expr("COUNT(holiday) / COUNT(*)")).show()



+---------------------------+
|(COUNT(holiday) / COUNT(1))|
+---------------------------+
|        0.01586255181135282|
+---------------------------+



                                                                                

In [19]:
# how many events are there for each type of holiday, on average?
# SHOULD do a right join, in case there same holidays with 0 events
# calls.join(holidays, calls["CallDate"] == holidays["date"], how="????")

In [20]:
# INNER: because the dataset data ranges don't overlap perfectly
(calls
 .join(holidays, calls["CallDate"] == holidays["date"], how="inner")
 .groupby("date", "holiday")
 .count()
 .groupby("holiday")
 .agg(expr("avg(count)").alias("avg per holiday"),
      expr("count(*)").alias("occurences"))
).toPandas()

                                                                                

Unnamed: 0,holiday,avg per holiday,occurences
0,Thanksgiving Day,744.181818,11
1,"Birthday of Martin Luther King, Jr.",796.416667,12
2,Veterans Day,788.545455,11
3,Independence Day,871.416667,12
4,Columbus Day,831.083333,12
5,Juneteenth National Independence Day,984.0,2
6,Memorial Day,735.833333,12
7,Christmas Day,743.545455,11
8,Inauguration Day,798.0,1
9,Labor Day,814.833333,12


# Do calculations for Jan 1, 2020

In [21]:
jan1 = calls.where("CallDate = '01/01/2020'").repartition(1)
jan1.rdd.getNumPartitions()



1

In [22]:
%%time
jan1.count()



CPU times: user 5.79 ms, sys: 1.35 ms, total: 7.13 ms
Wall time: 2.22 s


                                                                                

1057

In [31]:
jan1.cache()  # just shorthand for memory_only persist

DataFrame[CallNumber: int, UnitID: string, IncidentNumber: int, CallType: string, CallDate: string, WatchDate: string, ReceivedDtTm: string, EntryDtTm: string, DispatchDtTm: string, ResponseDtTm: string, OnSceneDtTm: string, TransportDtTm: string, HospitalDtTm: string, CallFinalDisposition: string, AvailableDtTm: string, Address: string, City: string, ZipcodeofIncident: int, Battalion: string, StationArea: string, Box: string, OriginalPriority: string, Priority: string, FinalPriority: int, ALSUnit: boolean, CallTypeGroup: string, NumberofAlarms: int, UnitType: string, Unitsequenceincalldispatch: int, FirePreventionDistrict: string, SupervisorDistrict: string, Neighborhooods-AnalysisBoundaries: string, RowID: string, case_location: string, AnalysisNeighborhoods: int]

In [36]:
# REVIEW:
# 1. memory_only
# 2. memory_only_ser (save memory, use more CPU)
# 3. disk_only (SSD or HDD) -- save network (disk of workers), but use local disk

# variant:
# NAME_2 -- replicate to two workers (have places we can schedule work, good if one dies)

In [24]:
%%time
jan1.count()

[Stage 45:>                                                         (0 + 1) / 1]

CPU times: user 11.1 ms, sys: 485 µs, total: 11.5 ms
Wall time: 5.98 s


                                                                                

1057

In [34]:
%%time
jan1.count()

CPU times: user 1.47 ms, sys: 608 µs, total: 2.08 ms
Wall time: 73.8 ms


1057

In [26]:
jan1.unpersist()

DataFrame[CallNumber: int, UnitID: string, IncidentNumber: int, CallType: string, CallDate: string, WatchDate: string, ReceivedDtTm: string, EntryDtTm: string, DispatchDtTm: string, ResponseDtTm: string, OnSceneDtTm: string, TransportDtTm: string, HospitalDtTm: string, CallFinalDisposition: string, AvailableDtTm: string, Address: string, City: string, ZipcodeofIncident: int, Battalion: string, StationArea: string, Box: string, OriginalPriority: string, Priority: string, FinalPriority: int, ALSUnit: boolean, CallTypeGroup: string, NumberofAlarms: int, UnitType: string, Unitsequenceincalldispatch: int, FirePreventionDistrict: string, SupervisorDistrict: string, Neighborhooods-AnalysisBoundaries: string, RowID: string, case_location: string, AnalysisNeighborhoods: int]

In [29]:
%%time
jan1.count()



CPU times: user 4.44 ms, sys: 0 ns, total: 4.44 ms
Wall time: 901 ms


                                                                                

1057

# Loan Balance

In [37]:
import requests

# https://spark.apache.org/docs/latest/monitoring.html#rest-api
# http://localhost:4040/api/v1/applications
# http://localhost:4040/api/v1/applications/{app_id}/executors
# look for "totalTasks"

r = requests.get("http://localhost:4040/api/v1/applications")
r.raise_for_status()
r.json()

[{'id': 'app-20230327135429-0000',
  'name': 'cs544',
  'attempts': [{'startTime': '2023-03-27T13:54:27.320GMT',
    'endTime': '1969-12-31T23:59:59.999GMT',
    'lastUpdated': '2023-03-27T13:54:27.320GMT',
    'duration': 0,
    'sparkUser': 'root',
    'completed': False,
    'appSparkVersion': '3.2.2',
    'startTimeEpoch': 1679925267320,
    'lastUpdatedEpoch': 1679925267320,
    'endTimeEpoch': -1}]}]

In [39]:
# before balance: 69 and 72 (on the two workers)
r = requests.get("http://localhost:4040/api/v1/applications/app-20230327135429-0000/executors")
r.raise_for_status()
print(len(r.json()))
print(r.json())

3
[{'id': 'driver', 'hostPort': 'ca8703ce8c32:32829', 'isActive': True, 'rddBlocks': 0, 'memoryUsed': 321816, 'diskUsed': 0, 'totalCores': 0, 'maxTasks': 0, 'activeTasks': 0, 'failedTasks': 0, 'completedTasks': 0, 'totalTasks': 0, 'totalDuration': 0, 'totalGCTime': 0, 'totalInputBytes': 0, 'totalShuffleRead': 0, 'totalShuffleWrite': 0, 'isBlacklisted': False, 'maxMemory': 384093388, 'addTime': '2023-03-27T13:54:29.833GMT', 'executorLogs': {}, 'memoryMetrics': {'usedOnHeapStorageMemory': 321816, 'usedOffHeapStorageMemory': 0, 'totalOnHeapStorageMemory': 384093388, 'totalOffHeapStorageMemory': 0}, 'blacklistedInStages': [], 'peakMemoryMetrics': {'JVMHeapMemory': 370916352, 'JVMOffHeapMemory': 189605256, 'OnHeapExecutionMemory': 0, 'OffHeapExecutionMemory': 0, 'OnHeapStorageMemory': 50500461, 'OffHeapStorageMemory': 0, 'OnHeapUnifiedMemory': 50500461, 'OffHeapUnifiedMemory': 0, 'DirectPoolMemory': 218526, 'MappedPoolMemory': 0, 'ProcessTreeJVMVMemory': 0, 'ProcessTreeJVMRSSMemory': 0, 'Pr

In [41]:
for i in range(10):
    print(jan1.count())

1057
1057
1057
1057
1057
1057
1057
1057
1057
1057


In [42]:
# after balance: 69 and 82 (on the two workers)
r = requests.get("http://localhost:4040/api/v1/applications/app-20230327135429-0000/executors")
r.raise_for_status()
print(len(r.json()))
print(r.json())

3
[{'id': 'driver', 'hostPort': 'ca8703ce8c32:32829', 'isActive': True, 'rddBlocks': 0, 'memoryUsed': 359525, 'diskUsed': 0, 'totalCores': 0, 'maxTasks': 0, 'activeTasks': 0, 'failedTasks': 0, 'completedTasks': 0, 'totalTasks': 0, 'totalDuration': 0, 'totalGCTime': 0, 'totalInputBytes': 0, 'totalShuffleRead': 0, 'totalShuffleWrite': 0, 'isBlacklisted': False, 'maxMemory': 384093388, 'addTime': '2023-03-27T13:54:29.833GMT', 'executorLogs': {}, 'memoryMetrics': {'usedOnHeapStorageMemory': 359525, 'usedOffHeapStorageMemory': 0, 'totalOnHeapStorageMemory': 384093388, 'totalOffHeapStorageMemory': 0}, 'blacklistedInStages': [], 'peakMemoryMetrics': {'JVMHeapMemory': 370916352, 'JVMOffHeapMemory': 190627416, 'OnHeapExecutionMemory': 0, 'OffHeapExecutionMemory': 0, 'OnHeapStorageMemory': 50500461, 'OffHeapStorageMemory': 0, 'OnHeapUnifiedMemory': 50500461, 'OffHeapUnifiedMemory': 0, 'DirectPoolMemory': 219524, 'MappedPoolMemory': 0, 'ProcessTreeJVMVMemory': 0, 'ProcessTreeJVMRSSMemory': 0, 'Pr

# Hash Partitioning

In [43]:
# a hash function
# input: anything (bytes)
# output: number

In [44]:
hash(b"abc")

6074328183178938033

In [45]:
hash(b"abh")

8445670831598163921

In [52]:
hash(b"F") % 4

3

In [53]:
partitions = [[], [], [], []]  # four partitions: the lists indicate the data they are in charge of

In [56]:
for letter in "AAABBECCCDDEFFFFGGHHIJJKAAABBB":
    partition = partitions[hash(letter) % len(partitions)]
    partition.append(letter)

In [57]:
partitions

[['G', 'G', 'I'],
 ['B', 'B', 'E', 'C', 'C', 'C', 'D', 'D', 'E', 'H', 'H', 'B', 'B', 'B'],
 ['J', 'J'],
 ['A', 'A', 'A', 'F', 'F', 'F', 'F', 'K', 'A', 'A', 'A']]

In [59]:
spark.sql("""
SELECT CallType, COUNT(*)
FROM calls
GROUP BY CallType
""")

DataFrame[CallType: string, count(1): bigint]

In [61]:
spark.sql("""
SELECT CallType, COUNT(*)
FROM calls
GROUP BY CallType
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[CallType#229], functions=[count(1)])
   +- Exchange hashpartitioning(CallType#229, 200), ENSURE_REQUIREMENTS, [plan_id=1491]
      +- HashAggregate(keys=[CallType#229], functions=[partial_count(1)])
         +- FileScan parquet [CallType#229] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://main:9000/sf.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallType:string>




In [63]:
spark.sql("""
SELECT CallType, COUNT(*)
FROM calls
GROUP BY CallType
""").explain("formatted")

== Physical Plan ==
AdaptiveSparkPlan (5)
+- HashAggregate (4)
   +- Exchange (3)
      +- HashAggregate (2)
         +- Scan parquet  (1)


(1) Scan parquet 
Output [1]: [CallType#229]
Batched: true
Location: InMemoryFileIndex [hdfs://main:9000/sf.parquet]
ReadSchema: struct<CallType:string>

(2) HashAggregate
Input [1]: [CallType#229]
Keys [1]: [CallType#229]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#12486L]
Results [2]: [CallType#229, count#12487L]

(3) Exchange
Input [2]: [CallType#229, count#12487L]
Arguments: hashpartitioning(CallType#229, 200), ENSURE_REQUIREMENTS, [plan_id=1504]

(4) HashAggregate
Input [2]: [CallType#229, count#12487L]
Keys [1]: [CallType#229]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#12482L]
Results [2]: [CallType#229, count(1)#12482L AS count(1)#12483L]

(5) AdaptiveSparkPlan
Output [2]: [CallType#229, count(1)#12483L]
Arguments: isFinalPlan=false




In [60]:
spark.sql("""
SELECT CallType, COUNT(*)
FROM calls
GROUP BY CallType
""").toPandas()

                                                                                

Unnamed: 0,CallType,count(1)
0,Elevator / Escalator Rescue,16538
1,Marine Fire,492
2,Aircraft Emergency,1512
3,Confined Space / Structure Collapse,677
4,Administrative,307
5,Alarms,664538
6,Odor (Strange / Unknown),13373
7,Lightning Strike (Investigation),9
8,Citizen Assist / Service Call,89469
9,HazMat,4300


In [65]:
spark.table("calls").sample(True, 0.1).write.bucketBy(10, "CallType").saveAsTable("calls_by_type", mode="overwrite")

23/03/27 15:25:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/03/27 15:25:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/03/27 15:25:35 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/03/27 15:25:35 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.26.0.2
23/03/27 15:25:35 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/03/27 15:25:36 WARN HadoopFSUtils: The directory hdfs://main:9000/user/hive/warehouse/calls_by_type was not found. Was it deleted very recently?
23/03/27 15:25:36 WARN FileUtils: File does not exist: hdfs://main:9000/user/hive/warehouse/calls_by_type; Force to delete it.
23/03/27 15:25:36 ERROR FileUtils: Failed to delete hdfs://main:9000/user/hive/warehouse/calls_by_type
23/03

In [68]:
spark.sql("SELECT CallType, COUNT(*) FROM calls_by_type GROUP BY CallType").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[CallType#12667], functions=[count(1)])
   +- HashAggregate(keys=[CallType#12667], functions=[partial_count(1)])
      +- FileScan parquet default.calls_by_type[CallType#12667] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://main:9000/user/hive/warehouse/calls_by_type], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallType:string>, SelectedBucketsCount: 10 out of 10




In [69]:
spark.sql("SELECT CallDate, COUNT(*) FROM calls_by_type GROUP BY CallDate").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[CallDate#12668], functions=[count(1)])
   +- Exchange hashpartitioning(CallDate#12668, 200), ENSURE_REQUIREMENTS, [plan_id=1560]
      +- HashAggregate(keys=[CallDate#12668], functions=[partial_count(1)])
         +- FileScan parquet default.calls_by_type[CallDate#12668] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://main:9000/user/hive/warehouse/calls_by_type], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallDate:string>




# Join Strategies (One Computer)

In [70]:
# kind_id, color
fruits = [
    ("B", "Yellow"),
    ("A", "Green"),
    ("C", "Orange"),
    ("A", "Red"),
    ("C", "Purple"),
    ("B", "Green")
]

# kind_id, name (assume no duplicate kind_id's)
kinds = [
    ("A", "Apple"),
    ("B", "Banana"),
    ("C", "Carrot")
]

# GOAL: print Yellow Banana, Green Apple, etc (any order)

## Strategy 1: Hash Table (dict in Python)

In [71]:
lookup = dict(kinds)
lookup

{'A': 'Apple', 'B': 'Banana', 'C': 'Carrot'}

In [75]:
for kind_id, color in fruits:
    print(color, lookup[kind_id])

Yellow Banana
Green Apple
Orange Carrot
Red Apple
Purple Carrot
Green Banana


## Strategy 2: sorting both

In [76]:
fruits.sort()
kinds.sort()

In [79]:
fruits

[('A', 'Green'),
 ('A', 'Red'),
 ('B', 'Green'),
 ('B', 'Yellow'),
 ('C', 'Orange'),
 ('C', 'Purple')]

In [83]:
fruit_idx = 0

for kind_id, name in kinds:
    while fruit_idx < len(fruits) and fruits[fruit_idx][0] <= kind_id:
        if fruits[fruit_idx][0] == kind_id:
            print(fruits[fruit_idx][1], name)
        fruit_idx += 1

Green Apple
Red Apple
Green Banana
Yellow Banana
Orange Carrot
Purple Carrot


# Distributed Join Strategies

In [88]:
(calls
 .join(holidays, calls["CallDate"] == holidays["date"], how="inner")
 .groupby("date", "holiday").count()).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[date#339, holiday#340], functions=[count(1)])
   +- Exchange hashpartitioning(date#339, holiday#340, 200), ENSURE_REQUIREMENTS, [plan_id=1644]
      +- HashAggregate(keys=[date#339, holiday#340], functions=[partial_count(1)])
         +- Project [date#339, holiday#340]
            +- BroadcastHashJoin [CallDate#230], [date#339], Inner, BuildRight, false
               :- Filter isnotnull(CallDate#230)
               :  +- FileScan parquet [CallDate#230] Batched: true, DataFilters: [isnotnull(CallDate#230)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://main:9000/sf.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(CallDate)], ReadSchema: struct<CallDate:string>
               +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]),false), [plan_id=1639]
                  +- Filter isnotnull(date#339)
                     +- FileScan csv [date#339,holiday#340] Bat

In [89]:
%%time
(calls
 .join(holidays, calls["CallDate"] == holidays["date"], how="inner")
 .groupby("date", "holiday").count()).toPandas()



CPU times: user 21.1 ms, sys: 0 ns, total: 21.1 ms
Wall time: 2 s


                                                                                

Unnamed: 0,date,holiday,count
0,05/30/2011,Memorial Day,691
1,07/04/2011,Independence Day,787
2,01/18/2016,"Birthday of Martin Luther King, Jr.",852
3,11/24/2016,Thanksgiving Day,722
4,11/11/2011,Veterans Day,744
...,...,...,...
113,10/12/2020,Columbus Day,728
114,11/11/2021,Veterans Day,853
115,06/20/2022,Juneteenth National Independence Day,1125
116,10/11/2021,Columbus Day,839


In [90]:
(calls
 .join(holidays.hint("merge"), calls["CallDate"] == holidays["date"], how="inner")
 .groupby("date", "holiday").count()).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[date#339, holiday#340], functions=[count(1)])
   +- HashAggregate(keys=[date#339, holiday#340], functions=[partial_count(1)])
      +- Project [date#339, holiday#340]
         +- SortMergeJoin [CallDate#230], [date#339], Inner
            :- Sort [CallDate#230 ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(CallDate#230, 200), ENSURE_REQUIREMENTS, [plan_id=1822]
            :     +- Filter isnotnull(CallDate#230)
            :        +- FileScan parquet [CallDate#230] Batched: true, DataFilters: [isnotnull(CallDate#230)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://main:9000/sf.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(CallDate)], ReadSchema: struct<CallDate:string>
            +- Sort [date#339 ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(date#339, 200), ENSURE_REQUIREMENTS, [plan_id=1823]
                  +- Filter isnotnull

In [91]:
%%time
(calls
 .join(holidays.hint("merge"), calls["CallDate"] == holidays["date"], how="inner")
 .groupby("date", "holiday").count()).toPandas()

[Stage 95:>                                                         (0 + 2) / 2]

CPU times: user 24.3 ms, sys: 0 ns, total: 24.3 ms
Wall time: 9.06 s


                                                                                

Unnamed: 0,date,holiday,count
0,01/01/2013,New Year's Day,1047
1,01/01/2014,New Year's Day,1218
2,01/01/2018,New Year's Day,1136
3,01/01/2020,New Year's Day,1057
4,01/01/2021,New Year's Day,795
...,...,...,...
113,12/25/2012,Christmas Day,668
114,12/25/2013,Christmas Day,752
115,12/25/2014,Christmas Day,788
116,12/25/2015,Christmas Day,820
