# Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, row_number
from pyspark.sql.window import Window

# Build (or reuse) the notebook's SparkSession step by step: app name and
# master URL first, then the two config settings, then Hive support.
session_builder = SparkSession.builder.appName("cs544").master("spark://main:7077")
session_builder = session_builder.config("spark.executor.memory", "512M")
session_builder = session_builder.config(
    "spark.sql.warehouse.dir", "hdfs://main:9000/user/hive/warehouse"
)
spark = session_builder.enableHiveSupport().getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/24 14:03:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# One-time data download, kept commented out; uncomment on a fresh machine.
# Presumably sf.zip unpacks to the sf.csv that is copied into HDFS below.
# ! wget https://pages.cs.wisc.edu/~harter/cs639/data/sf.zip
# ! unzip sf.zip

In [3]:
!hdfs dfs -cp sf.csv hdfs://main:9000/sf.csv

In [4]:
# Load the raw CSV from HDFS, using the header row for column names and
# letting Spark infer the column types.
csv_reader = spark.read.format("csv").option("header", True).option("inferSchema", True)
df = csv_reader.load("hdfs://main:9000/sf.csv")

# Alias every column to its name with the spaces removed
# (presumably because the Parquet writer does not accept spaces in names).
columns = []
for name in df.columns:
    columns.append(col(name).alias(name.replace(" ", "")))

# Write the renamed columns out as Parquet, replacing any previous copy.
parquet_writer = df.select(columns).write.format("parquet").mode("overwrite")
parquet_writer.save("hdfs://main:9000/sf.parquet")

23/03/24 14:04:33 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# Re-read the data from its Parquet copy and register it as the temp view
# "calls" so that it can be queried with spark.sql().
df = spark.read.parquet("hdfs://main:9000/sf.parquet")
df.createOrReplaceTempView("calls")

# Demos

### Windowing

In [10]:
# Goal: the 3 biggest call types within each call type group.
# This query only produces the building block for that: the number of calls
# for every (CallTypeGroup, CallType) pair.
counts_sql = """
SELECT CallTypeGroup, CallType, COUNT(*) as count
FROM calls
GROUP BY CallTypeGroup, CallType
"""
spark.sql(counts_sql).toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count
0,,Elevator / Escalator Rescue,7062
1,,Marine Fire,222
2,,Aircraft Emergency,1263
3,Fire,Aircraft Emergency,186
4,,Confined Space / Structure Collapse,286
...,...,...,...
73,Fire,Train / Rail Fire,10
74,Alarm,Vehicle Fire,13
75,Fire,Administrative,1
76,Alarm,Oil Spill,5


In [13]:
# Number the call types inside each CallTypeGroup from largest to smallest
# count, using a window function over the per-pair counts subquery.
ranked_sql = """
SELECT CallTypeGroup, CallType, count,
       row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC)
FROM
(
    SELECT CallTypeGroup, CallType, COUNT(*) as count
    FROM calls
    GROUP BY CallTypeGroup, CallType
)
"""
spark.sql(ranked_sql).toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count,row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
0,,Medical Incident,1783402,1
1,,Structure Fire,416250,2
2,,Alarms,298422,3
3,,Traffic Collision,107136,4
4,,Other,50967,5
...,...,...,...,...
73,Potentially Life-Threatening,Medical Incident,1448636,1
74,Potentially Life-Threatening,Traffic Collision,86830,2
75,Potentially Life-Threatening,Other,1763,3
76,Potentially Life-Threatening,Water Rescue,93,4


In [17]:
# Rank call types within each group and alias the rank as `num`, so the
# result can be filtered down to the top 3 call types of every group.
top_ranked_sql = """
SELECT CallTypeGroup, CallType, count,
       row_number() OVER (PARTITION BY CallTypeGroup ORDER BY count DESC) AS num
FROM
(
    SELECT CallTypeGroup, CallType, COUNT(*) as count
    FROM calls
    GROUP BY CallTypeGroup, CallType
)
"""
spark.sql(top_ranked_sql).filter("num <= 3").toPandas()

                                                                                

Unnamed: 0,CallTypeGroup,CallType,count,num
0,,Medical Incident,1783402,1
1,,Structure Fire,416250,2
2,,Alarms,298422,3
3,Alarm,Alarms,366116,1
4,Alarm,Structure Fire,245241,2
5,Alarm,Citizen Assist / Service Call,43403,3
6,Fire,Structure Fire,51061,1
7,Fire,Outside Fire,40752,2
8,Fire,Water Rescue,22007,3
9,Non Life-threatening,Medical Incident,716653,1


### Joining

In [18]:
! wget https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv
! hdfs dfs -cp holidays2.csv hdfs://main:9000/
holidays = (spark.read.format("csv").option("header", True)
            .load("hdfs://main:9000/holidays2.csv"))

--2023-03-24 15:08:24--  https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3495 (3.4K) [text/plain]
Saving to: ‘holidays2.csv.1’


2023-03-24 15:08:25 (28.6 MB/s) - ‘holidays2.csv.1’ saved [3495/3495]



In [19]:
# Bring the whole holidays table to the driver as a pandas DataFrame for display.
holidays.toPandas()

Unnamed: 0,date,holiday
0,01/01/2013,New Year's Day
1,01/01/2014,New Year's Day
2,01/01/2015,New Year's Day
3,01/01/2016,New Year's Day
4,01/01/2018,New Year's Day
...,...,...
117,12/25/2020,Christmas Day
118,12/26/2011,Christmas Day
119,12/26/2016,Christmas Day
120,12/26/2022,Christmas Day


In [23]:
# DataFrame handle for the "calls" table/view, for use with the DataFrame API.
calls = spark.table("calls")

In [25]:
# Comparing two columns does not compare any data here: it builds a Column
# expression, which is what join() accepts as its `on` condition.
calls["CallDate"] == holidays["date"]

Column<'(CallDate = date)'>

In [26]:
# Can we associate a holiday with a call?
# join() defaults to an INNER join, so only calls whose CallDate matches a
# holiday date appear in the result.
on_holiday = calls["CallDate"] == holidays["date"]
calls.join(holidays, on=on_holiday).limit(10).toPandas()

                                                                                

Unnamed: 0,CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,...,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,date,holiday
0,201461772,94,20060706,Medical Incident,05/25/2020,05/25/2020,05/25/2020 01:34:55 PM,05/25/2020 01:34:55 PM,05/25/2020 01:36:14 PM,05/25/2020 01:36:19 PM,...,MEDIC,1,10,10,Bayview Hunters Point,201461772-94,POINT (-122.38626545834089 37.73782577837169),1,05/25/2020,Memorial Day
1,201463874,E32,20060913,Medical Incident,05/25/2020,05/25/2020,05/25/2020 11:16:21 PM,05/25/2020 11:17:54 PM,05/25/2020 11:18:27 PM,05/25/2020 11:19:53 PM,...,ENGINE,1,9,11,Excelsior,201463874-E32,POINT (-122.42955213038633 37.729273863971024),7,05/25/2020,Memorial Day
2,201463019,T18,20060826,Medical Incident,05/25/2020,05/25/2020,05/25/2020 06:39:44 PM,05/25/2020 06:41:34 PM,05/25/2020 06:55:28 PM,05/25/2020 06:57:32 PM,...,TRUCK,2,8,7,Inner Sunset,201463019-T18,POINT (-122.46829612745225 37.751748874988515),14,05/25/2020,Memorial Day
3,201460900,E11,20060611,Medical Incident,05/25/2020,05/25/2020,05/25/2020 09:22:49 AM,05/25/2020 09:24:51 AM,05/25/2020 09:25:17 AM,05/25/2020 09:26:11 AM,...,ENGINE,1,6,8,Mission,201460900-E11,POINT (-122.42413130758577 37.752703146369065),20,05/25/2020,Memorial Day
4,201463227,B03,20060859,Alarms,05/25/2020,05/25/2020,05/25/2020 07:40:19 PM,05/25/2020 07:41:33 PM,05/25/2020 07:41:44 PM,05/25/2020 07:42:33 PM,...,CHIEF,2,3,6,Financial District/South Beach,201463227-B03,POINT (-122.40169229261626 37.78658885907634),8,05/25/2020,Memorial Day
5,201460714,E26,20060590,Medical Incident,05/25/2020,05/25/2020,05/25/2020 08:23:14 AM,05/25/2020 08:23:14 AM,05/25/2020 08:31:53 AM,05/25/2020 08:34:00 AM,...,ENGINE,2,6,8,Glen Park,201460714-E26,POINT (-122.4396375225712 37.73654095477352),10,05/25/2020,Memorial Day
6,201463595,RC1,20060892,Medical Incident,05/25/2020,05/25/2020,05/25/2020 09:24:50 PM,05/25/2020 09:26:27 PM,05/25/2020 09:27:04 PM,05/25/2020 09:27:44 PM,...,RESCUE CAPTAIN,1,3,6,Financial District/South Beach,201463595-RC1,POINT (-122.39638047787408 37.787304005252835),8,05/25/2020,Memorial Day
7,201463264,54,20060863,Medical Incident,05/25/2020,05/25/2020,05/25/2020 07:52:29 PM,05/25/2020 07:54:40 PM,05/25/2020 07:54:56 PM,05/25/2020 07:55:02 PM,...,MEDIC,3,10,10,Potrero Hill,201463264-54,POINT (-122.39724892486004 37.759967407690915),26,05/25/2020,Memorial Day
8,201463403,56,20060881,Medical Incident,05/25/2020,05/25/2020,05/25/2020 08:35:42 PM,05/25/2020 08:36:38 PM,05/25/2020 08:36:55 PM,05/25/2020 08:37:01 PM,...,MEDIC,2,1,3,Nob Hill,201463403-56,POINT (-122.4083891465028 37.79026526175803),21,05/25/2020,Memorial Day
9,201463254,63,20060862,Medical Incident,05/25/2020,05/25/2020,05/25/2020 07:52:16 PM,05/25/2020 07:52:16 PM,05/25/2020 07:52:43 PM,05/25/2020 07:52:47 PM,...,MEDIC,2,3,10,Mission Bay,201463254-63,POINT (-122.39549294787926 37.76681685424107),4,05/25/2020,Memorial Day


In [31]:
# Same join with the join type spelled out; keep the result as `both`
# and preview a few rows.
join_condition = calls["CallDate"] == holidays["date"]
both = calls.join(holidays, join_condition, "inner")
both.limit(3).toPandas()

Unnamed: 0,CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,...,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,date,holiday
0,201461772,94,20060706,Medical Incident,05/25/2020,05/25/2020,05/25/2020 01:34:55 PM,05/25/2020 01:34:55 PM,05/25/2020 01:36:14 PM,05/25/2020 01:36:19 PM,...,MEDIC,1,10,10,Bayview Hunters Point,201461772-94,POINT (-122.38626545834089 37.73782577837169),1,05/25/2020,Memorial Day
1,201463874,E32,20060913,Medical Incident,05/25/2020,05/25/2020,05/25/2020 11:16:21 PM,05/25/2020 11:17:54 PM,05/25/2020 11:18:27 PM,05/25/2020 11:19:53 PM,...,ENGINE,1,9,11,Excelsior,201463874-E32,POINT (-122.42955213038633 37.729273863971024),7,05/25/2020,Memorial Day
2,201463019,T18,20060826,Medical Incident,05/25/2020,05/25/2020,05/25/2020 06:39:44 PM,05/25/2020 06:41:34 PM,05/25/2020 06:55:28 PM,05/25/2020 06:57:32 PM,...,TRUCK,2,8,7,Inner Sunset,201463019-T18,POINT (-122.46829612745225 37.751748874988515),14,05/25/2020,Memorial Day


In [33]:
# How many calls occurred on each type of holiday?
calls_per_holiday = both.groupBy("holiday").count()
calls_per_holiday.toPandas()

                                                                                

Unnamed: 0,holiday,count
0,Independence Day,10457
1,Memorial Day,8830
2,Thanksgiving Day,8186
3,"Birthday of Martin Luther King, Jr.",9557
4,Veterans Day,8674
5,Columbus Day,9973
6,Christmas Day,8179
7,Labor Day,9778
8,New Year's Day,9966
9,Washington's Birthday,9064


In [None]:
# what percent of calls occurred on a holiday?

In [34]:
# LEFT join: every call is kept; calls whose CallDate matches no holiday
# get NULL in the `date` and `holiday` columns.
calls2 = calls.join(holidays, calls["CallDate"] == holidays["date"], "left")

In [35]:
# COUNT(holiday) skips NULLs, so it counts only the rows matched to a holiday,
# while COUNT(*) counts every row. The ratio is a fraction (x100 for a percent).
holiday_fraction = calls2.agg(expr("COUNT(holiday) / COUNT(*)"))
holiday_fraction.show()



+---------------------------+
|(COUNT(holiday) / COUNT(1))|
+---------------------------+
|        0.01586255181135282|
+---------------------------+



                                                                                

In [36]:
# how many events are there for each type of holiday, on average?
# SHOULD do a right join, in case there are some holidays with 0 events
# calls.join(holidays, calls["CallDate"] == holidays["date"], how="????")

In [41]:
# INNER join, because the date ranges of the two datasets don't overlap perfectly.
calls_on_holidays = calls.join(holidays, calls["CallDate"] == holidays["date"], how="inner")

# First count the calls on each individual holiday date ...
daily_counts = calls_on_holidays.groupby("date", "holiday").count()

# ... then, per holiday name, average those per-date counts and count the dates.
holiday_summary = daily_counts.groupby("holiday").agg(
    expr("avg(count)").alias("avg per holiday"),
    expr("count(*)").alias("occurences"),
)
holiday_summary.toPandas()

                                                                                

Unnamed: 0,holiday,avg per holiday,occurences
0,Thanksgiving Day,744.181818,11
1,"Birthday of Martin Luther King, Jr.",796.416667,12
2,Veterans Day,788.545455,11
3,Independence Day,871.416667,12
4,Columbus Day,831.083333,12
5,Juneteenth National Independence Day,984.0,2
6,Memorial Day,735.833333,12
7,Christmas Day,743.545455,11
8,Inauguration Day,798.0,1
9,Labor Day,814.833333,12


# Do calculations for Jan 1, 2020

In [47]:
# Keep only the calls dated 01/01/2020 and collapse them into one partition,
# then confirm the partition count.
jan1_calls = calls.filter("CallDate = '01/01/2020'")
jan1 = jan1_calls.repartition(1)
jan1.rdd.getNumPartitions()



1

In [51]:
%%time
# Baseline timing: count the rows before any caching has been requested.
jan1.count()

CPU times: user 2.42 ms, sys: 0 ns, total: 2.42 ms
Wall time: 692 ms


1057

In [52]:
# cache() only marks the DataFrame for caching; the data is stored when the
# next action runs.
jan1.cache()  # shorthand for persist() with the default storage level (memory-and-disk for DataFrames, not memory-only)

DataFrame[CallNumber: int, UnitID: string, IncidentNumber: int, CallType: string, CallDate: string, WatchDate: string, ReceivedDtTm: string, EntryDtTm: string, DispatchDtTm: string, ResponseDtTm: string, OnSceneDtTm: string, TransportDtTm: string, HospitalDtTm: string, CallFinalDisposition: string, AvailableDtTm: string, Address: string, City: string, ZipcodeofIncident: int, Battalion: string, StationArea: string, Box: string, OriginalPriority: string, Priority: string, FinalPriority: int, ALSUnit: boolean, CallTypeGroup: string, NumberofAlarms: int, UnitType: string, Unitsequenceincalldispatch: int, FirePreventionDistrict: string, SupervisorDistrict: string, Neighborhooods-AnalysisBoundaries: string, RowID: string, case_location: string, AnalysisNeighborhoods: int]

In [53]:
%%time
# First action after cache(): this run also has to populate the cache.
jan1.count()



CPU times: user 4.39 ms, sys: 1.54 ms, total: 5.93 ms
Wall time: 2.63 s


                                                                                

1057

In [57]:
%%time
# Repeat the count now that an earlier action has populated the cache.
jan1.count()  

CPU times: user 2.17 ms, sys: 0 ns, total: 2.17 ms
Wall time: 65.1 ms


1057

In [59]:
# Drop the cached data; later actions on jan1 have to recompute it.
jan1.unpersist()

DataFrame[CallNumber: int, UnitID: string, IncidentNumber: int, CallType: string, CallDate: string, WatchDate: string, ReceivedDtTm: string, EntryDtTm: string, DispatchDtTm: string, ResponseDtTm: string, OnSceneDtTm: string, TransportDtTm: string, HospitalDtTm: string, CallFinalDisposition: string, AvailableDtTm: string, Address: string, City: string, ZipcodeofIncident: int, Battalion: string, StationArea: string, Box: string, OriginalPriority: string, Priority: string, FinalPriority: int, ALSUnit: boolean, CallTypeGroup: string, NumberofAlarms: int, UnitType: string, Unitsequenceincalldispatch: int, FirePreventionDistrict: string, SupervisorDistrict: string, Neighborhooods-AnalysisBoundaries: string, RowID: string, case_location: string, AnalysisNeighborhoods: int]

In [61]:
%%time
# After unpersist(): the count runs without the cache again.
jan1.count()

CPU times: user 0 ns, sys: 2.38 ms, total: 2.38 ms
Wall time: 552 ms


1057