## Analyzing Fire Department Calls and Holidays

In [1]:
from pyspark.sql import SparkSession
# Configure the session one step at a time: application name and master URL,
# executor memory, then the Hive warehouse directory on HDFS.
session_builder = SparkSession.builder.appName("cs544").master("spark://boss:7077")
session_builder = session_builder.config("spark.executor.memory", "512M")
session_builder = session_builder.config(
    "spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse"
)
# Hive support is enabled before the session is created (or an existing one is reused).
spark = session_builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/03 01:45:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
! wget https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv

--2023-11-03 01:46:02--  https://raw.githubusercontent.com/tylerharter/us-federal-holidays/main/holidays2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3495 (3.4K) [text/plain]
Saving to: ‘holidays2.csv’


2023-11-03 01:46:02 (24.7 MB/s) - ‘holidays2.csv’ saved [3495/3495]



In [3]:
! hdfs dfs -cp holidays2.csv hdfs://nn:9000/

In [4]:
# Load sf.parquet from HDFS and expose it to SQL as the temp view "calls".
calls_parquet = spark.read.format("parquet").load("hdfs://nn:9000/sf.parquet")
calls_parquet.createOrReplaceTempView("calls")

23/11/03 01:46:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [5]:
# Read the holidays CSV from HDFS (first row is the header, column types are
# inferred) and expose it to SQL as the temp view "holidays".
holidays_reader = spark.read.format("csv").options(inferSchema=True, header=True)
holidays_csv = holidays_reader.load("hdfs://nn:9000/holidays2.csv")
holidays_csv.createOrReplaceTempView("holidays")

                                                                                

In [6]:
# Look up the "calls" temp view as a DataFrame; the cell output is its
# column-name/type summary.
spark.table("calls")

DataFrame[Call_Number: int, Unit_ID: string, Incident_Number: int, Call_Type: string, Call_Date: string, Watch_Date: string, Received_DtTm: string, Entry_DtTm: string, Dispatch_DtTm: string, Response_DtTm: string, On_Scene_DtTm: string, Transport_DtTm: string, Hospital_DtTm: string, Call_Final_Disposition: string, Available_DtTm: string, Address: string, City: string, Zipcode_of_Incident: int, Battalion: string, Station_Area: string, Box: string, Original_Priority: string, Priority: string, Final_Priority: int, ALS_Unit: boolean, Call_Type_Group: string, Number_of_Alarms: int, Unit_Type: string, Unit_sequence_in_call_dispatch: int, Fire_Prevention_District: string, Supervisor_District: string, Neighborhooods_-_Analysis_Boundaries: string, RowID: string, case_location: string, Analysis_Neighborhoods: int]

In [7]:
# Look up the "holidays" temp view as a DataFrame; the cell output is its
# column-name/type summary.
spark.table("holidays")

DataFrame[date: string, holiday: string]

In [9]:
# List the tables and temporary views currently visible to this session.
show_tables_sql = """
SHOW TABLES
"""
spark.sql(show_tables_sql).show()

23/11/03 01:51:56 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/03 01:51:56 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/11/03 01:52:01 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/11/03 01:52:01 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.25.0.6
23/11/03 01:52:01 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|   stinky|      false|
|         |    calls|       true|
|         | holidays|       true|
+---------+---------+-----------+



In [11]:
# Can we pair up calls with holidays (using SQL)?
# The inner join keeps only calls whose Call_Date equals a holiday date;
# the first 10 matches are pulled back as a pandas DataFrame.
holiday_calls_sql = """
SELECT
    Call_Number
    , Call_Date
    , holiday
FROM
    calls
    INNER JOIN holidays ON calls.Call_Date = holidays.date
LIMIT
    10
"""
spark.sql(holiday_calls_sql).toPandas()

                                                                                

Unnamed: 0,Call_Number,Call_Date,holiday
0,201461772,05/25/2020,Memorial Day
1,201463874,05/25/2020,Memorial Day
2,201463019,05/25/2020,Memorial Day
3,201460900,05/25/2020,Memorial Day
4,201463227,05/25/2020,Memorial Day
5,201460714,05/25/2020,Memorial Day
6,201463595,05/25/2020,Memorial Day
7,201463264,05/25/2020,Memorial Day
8,201463403,05/25/2020,Memorial Day
9,201463254,05/25/2020,Memorial Day


In [12]:
# Can we pair up calls with holidays (using Spark DataFrames)?

# Grab both temp views as DataFrame objects up front so the queries below can
# refer to them by name.
calls, holidays = spark.table("calls"), spark.table("holidays")

In [14]:
# A cell only displays the value of its last expression, so a bare
# `type(calls)` on its own line is silently discarded. Return both types as a
# tuple so each one shows up in the output.
type(calls), type(holidays)

pyspark.sql.dataframe.DataFrame

In [15]:
# Comparing columns of the two DataFrames produces a Column expression (shown
# in the cell output); it describes the comparison rather than computing it.
calls["Call_Date"] == holidays["date"]    # Hasn't actually evaluated yet

Column<'(Call_Date = date)'>

In [19]:
# full Spark DataFrame example; same result as SQL query
# The join condition is built first, then used for an inner join, and the
# first 10 matching rows are pulled back as a pandas DataFrame.
same_day = calls["Call_Date"] == holidays["date"]
matched_calls = calls.join(holidays, on=same_day, how="inner")
matched_calls.select("Call_Number", "Call_Date", "holiday").limit(10).toPandas()

Unnamed: 0,Call_Number,Call_Date,holiday
0,201461772,05/25/2020,Memorial Day
1,201463874,05/25/2020,Memorial Day
2,201463019,05/25/2020,Memorial Day
3,201460900,05/25/2020,Memorial Day
4,201463227,05/25/2020,Memorial Day
5,201460714,05/25/2020,Memorial Day
6,201463595,05/25/2020,Memorial Day
7,201463264,05/25/2020,Memorial Day
8,201463403,05/25/2020,Memorial Day
9,201463254,05/25/2020,Memorial Day


In [24]:
# how many calls occurred on each holiday (using Spark DataFrames)?
# Inner-join calls to holidays, keep only the holiday name, count rows per
# holiday, and list the busiest holidays first.
holiday_match = calls["Call_Date"] == holidays["date"]
holiday_names = calls.join(holidays, on=holiday_match, how="inner").select("holiday")
calls_per_holiday = holiday_names.groupBy("holiday").count()
calls_per_holiday.orderBy("count", ascending=False).toPandas()

                                                                                

Unnamed: 0,holiday,count
0,Independence Day,10457
1,Columbus Day,9973
2,New Year's Day,9966
3,Labor Day,9778
4,"Birthday of Martin Luther King, Jr.",9557
5,Washington's Birthday,9064
6,Memorial Day,8830
7,Veterans Day,8674
8,Thanksgiving Day,8186
9,Christmas Day,8179


In [27]:
# What percent of fire dept. calls are on holidays? (Using SQL)
# The LEFT OUTER JOIN keeps every call; COUNT(holiday) counts only rows where
# a holiday matched, while COUNT(*) counts all rows.
percent_holiday_sql = """
SELECT
    (COUNT(holiday) / COUNT(*)) * 100 as `Percent Holiday Calls`
FROM
    calls
    LEFT OUTER JOIN holidays ON calls.Call_Date = holidays.date
"""
spark.sql(percent_holiday_sql).toPandas()

                                                                                

Unnamed: 0,Percent Holiday Calls
0,1.586255


In [30]:
from pyspark.sql.functions import col, expr

In [34]:
# What percent of fire dept. calls are on holidays? (Using Spark DataFrames)
# The left join keeps every call; COUNT(holiday) counts only rows where a
# holiday matched, while COUNT(*) counts all rows.
percent_holiday = expr("COUNT(holiday) * 100 / COUNT(*)").alias("Percent Holiday Calls")
calls_with_holiday = calls.join(
    holidays, on=calls["Call_Date"] == holidays["date"], how="left"
)
calls_with_holiday.agg(percent_holiday).toPandas()

                                                                                

Unnamed: 0,Percent Holiday Calls
0,1.586255
