In [13]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HelloSpark") \
    .master("local[*]") \
    .getOrCreate()

In [14]:
fire_df = spark.read \
            .format("csv") \
            .option("header", True) \
            .option("inferSchema", True) \
            .load(r"C:\Users\chasurag\Documents\my repos\Fire_Department_and_Emergency_Medical_Services_Dispatched_Calls_for_Service_20250806.csv")

# fire_df.printSchema()

In [15]:
fire_df.show(10)

+-----------+-------+---------------+-----------------+----------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+-------------+-------------------+---------+------------+----+-----------------+--------+--------------+--------+--------------------+----------------+---------+------------------------------+------------------------+-------------------+------------------------------------+---------------+--------------------+--------------------+--------------------+
|Call Number|Unit ID|Incident Number|        Call Type| Call Date|Watch Date|       Received DtTm|          Entry DtTm|       Dispatch DtTm|       Response DtTm|       On Scene DtTm|      Transport DtTm|       Hospital DtTm|Call Final Disposition|      Available DtTm|             Address|         City|Zipcode of Incident|Battalion|Station Area| Box|Origin

In [16]:
fire_df.createOrReplaceTempView("fire_view")
spark.sql("""
    SELECT 
        `Received DtTm` 
    FROM fire_view 
    LIMIT 10
""").show(truncate = False)

+----------------------+
|Received DtTm         |
+----------------------+
|01/02/2024 11:00:26 AM|
|01/02/2024 09:33:02 AM|
|01/02/2024 09:09:52 PM|
|01/02/2024 11:22:41 AM|
|01/02/2024 05:22:16 PM|
|01/02/2024 05:01:12 PM|
|01/02/2024 11:31:10 PM|
|01/02/2024 06:27:35 AM|
|01/02/2024 07:59:33 PM|
|01/02/2024 02:06:54 AM|
+----------------------+



In [17]:
spark.sql("""
          select concat(year(to_date(`Received DtTm`, 'MM/dd/yyyy hh:mm:ss a')), '-', month(to_date(`Received DtTm`, 'MM/dd/yyyy hh:mm:ss a'))) as received_date_month, count(*)
          from fire_view
          group by concat(year(to_date(`Received DtTm`, 'MM/dd/yyyy hh:mm:ss a')), '-', month(to_date(`Received DtTm`, 'MM/dd/yyyy hh:mm:ss a')))
          """).show(truncate = False)

+-------------------+--------+
|received_date_month|count(1)|
+-------------------+--------+
|2024-2             |29526   |
|2024-1             |30495   |
|2024-3             |30453   |
|2024-4             |28557   |
|2024-5             |29630   |
|2024-6             |28500   |
|2024-8             |30447   |
|2024-7             |29833   |
|2024-9             |30158   |
|2024-10            |32360   |
|2024-12            |32832   |
|2024-11            |30665   |
|2025-1             |33196   |
|2025-2             |30434   |
|2025-4             |29347   |
|2025-3             |30744   |
|2025-5             |30638   |
|2025-6             |29130   |
|2025-7             |29681   |
|2025-8             |3917    |
+-------------------+--------+



In [18]:
spark.sql("""
          select weekofyear(to_date(`Call Date`, 'MM/dd/yyyy')) as week_num, count(*) as num_of_calls
          from fire_view
          where year(to_date(`Call Date`, 'MM/dd/yyyy')) = 2024
          group by weekofyear(to_date(`Call Date`, 'MM/dd/yyyy'))
          order by count(*) desc
          """
        ).show(truncate = False)


+--------+------------+
|week_num|num_of_calls|
+--------+------------+
|40      |8344        |
|1       |8172        |
|50      |7777        |
|51      |7659        |
|6       |7439        |
|45      |7432        |
|41      |7416        |
|2       |7312        |
|5       |7301        |
|47      |7269        |
|39      |7262        |
|4       |7261        |
|35      |7179        |
|49      |7168        |
|52      |7129        |
|32      |7101        |
|42      |7041        |
|27      |7028        |
|8       |7020        |
|36      |7007        |
+--------+------------+
only showing top 20 rows



In [19]:
fire_df.describe(["Call Number"]).show()

+-------+--------------------+
|summary|         Call Number|
+-------+--------------------+
|  count|              580543|
|   mean|2.4529878770287645E8|
| stddev|   4548867.435147642|
|    min|           240020004|
|    max|           252170216|
+-------+--------------------+

