# Preguntas San Francisco sf-fire.calls

In [2]:
spark

res1: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1539f075


In [3]:
sc

res2: org.apache.spark.SparkContext = org.apache.spark.SparkContext@266dee23


In [4]:
val sampleDF = spark.read
                    .option("samplingRatio",0.001)
                    .option("header",true)
                    .csv("sf-fire-calls.csv")

sampleDF: org.apache.spark.sql.DataFrame = [CallNumber: string, UnitID: string ... 26 more fields]


In [5]:
sampleDF.show(1)

+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+---------------+--------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|      CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|   Neighborhood|            Location|        RowID|Delay|
+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+

In [6]:
import org.apache.spark.sql.types._

import org.apache.spark.sql.types._


### Creacion del Schema

In [7]:
val fire_schema = StructType(Array(
    StructField("CallNumber", IntegerType,true),
    StructField("UnitID", StringType, true),
    StructField("IncidentNumber", IntegerType, true),
    StructField("CallType", StringType, true),
    StructField("CallDate", StringType, true),
    StructField("WatchDate", StringType, true),
    StructField("CallFinalDisposition", StringType, true),
    StructField("AvailableDtTm", StringType, true),
    StructField("Address", StringType, true),
    StructField("City", StringType, true),
    StructField("Zipcode", IntegerType, true),
    StructField("Battalion", StringType, true),
    StructField("StationArea", IntegerType, true),
    StructField("Box", IntegerType, true),
    StructField("OriginalPriority", IntegerType, true),
    StructField("Priority", IntegerType, true),
    StructField("FinalPriority", IntegerType, true),
    StructField("ALSUnit", BooleanType, true),
    StructField("CallTypeGroup", StringType, true),
    StructField("NumAlarms", IntegerType, true),
    StructField("UnitType", StringType, true),
    StructField("UnitSequenceInCallDispatch", IntegerType, true),
    StructField("FirePreventionDistrict", IntegerType, true),
    StructField("SupervisorDistrict", IntegerType, true),
    StructField("Neighborhood", StringType, true),
    StructField("Location",StringType,true),
    StructField("RowID", StringType, true),
    StructField("Delay", FloatType, true)
    ))

fire_schema: org.apache.spark.sql.types.StructType = StructType(StructField(CallNumber,IntegerType,true), StructField(UnitID,StringType,true), StructField(IncidentNumber,IntegerType,true), StructField(CallType,StringType,true), StructField(CallDate,StringType,true), StructField(WatchDate,StringType,true), StructField(CallFinalDisposition,StringType,true), StructField(AvailableDtTm,StringType,true), StructField(Address,StringType,true), StructField(City,StringType,true), StructField(Zipcode,IntegerType,true), StructField(Battalion,StringType,true), StructField(StationArea,IntegerType,true), StructField(Box,IntegerType,true), StructField(OriginalPriority,IntegerType,true), StructField(Priority,IntegerType,true), StructField(FinalPriority,IntegerType,true), StructField(ALSUnit,BooleanType,...


### Importacion de los datos

In [8]:
val df = spark.read.schema(fire_schema)
                    .option("header",true)
                    .option("sep",",")
                    .csv("sf-fire-calls.csv")

df: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [9]:
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: integer (nullable = true)
 |-- Box: integer (nullable = true)
 |-- OriginalPriority: integer (nullable = true)
 |-- Priority: integer (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: integer (nullable = tr

In [10]:
import org.apache.spark.sql.functions._

import org.apache.spark.sql.functions._


## Pregunta 1: Encontrar los distintos motivos de llamada en 2018

In [11]:
val p1f1 = df.select("CallType","CallDate")
            .withColumn("Fecha",to_timestamp($"CallDate","MM/dd/yyyy")).drop("CallDate")
            .withColumn("Anio",year($"Fecha"))
p1f1.show(3)

+----------------+-------------------+----+
|        CallType|              Fecha|Anio|
+----------------+-------------------+----+
|  Structure Fire|2002-01-11 00:00:00|2002|
|Medical Incident|2002-01-11 00:00:00|2002|
|Medical Incident|2002-01-11 00:00:00|2002|
+----------------+-------------------+----+
only showing top 3 rows



p1f1: org.apache.spark.sql.DataFrame = [CallType: string, Fecha: timestamp ... 1 more field]


In [12]:
val p1f2 = p1f1.select("CallType")
                .where($"Anio"===2018)
                .distinct()
p1f2.show(5,false)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|HazMat                       |
+-----------------------------+
only showing top 5 rows



p1f2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [CallType: string]


In [13]:
p1f2.count()

res7: Long = 20


## Pregunta 2: ¿Qué meses del 2018 tuvieron un mayor número de llamadaS?

In [14]:
val p2f1 = df.select("CallType","CallDate")
            .withColumn("Fecha",to_timestamp($"CallDate","MM/dd/yyyy")).drop("CallDate")
            .withColumn("Anio",year($"Fecha"))
            .withColumn("Mes",month($"Fecha"))
p2f1.show(3)

+----------------+-------------------+----+---+
|        CallType|              Fecha|Anio|Mes|
+----------------+-------------------+----+---+
|  Structure Fire|2002-01-11 00:00:00|2002|  1|
|Medical Incident|2002-01-11 00:00:00|2002|  1|
|Medical Incident|2002-01-11 00:00:00|2002|  1|
+----------------+-------------------+----+---+
only showing top 3 rows



p2f1: org.apache.spark.sql.DataFrame = [CallType: string, Fecha: timestamp ... 2 more fields]


In [15]:
val p2f2 = p2f1.select("Mes")
            .groupBy("Mes")
            .count()
            .withColumnRenamed("count","Cantidad")
            .orderBy($"Cantidad" desc)
p2f2.show(3)

+---+--------+
|Mes|Cantidad|
+---+--------+
| 10|   15410|
|  8|   15126|
|  5|   15099|
+---+--------+
only showing top 3 rows



p2f2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Mes: int, Cantidad: bigint]


## Pregunta 3: ¿Qué barrio de San Francisco genera más llamadas en 2018?

In [16]:
df.printSchema

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: integer (nullable = true)
 |-- Box: integer (nullable = true)
 |-- OriginalPriority: integer (nullable = true)
 |-- Priority: integer (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: integer (nullable = tr

In [17]:
val p3f1 = df.select("Neighborhood")
            .groupBy("Neighborhood")
            .count()
            .withColumnRenamed("count","Cantidad")
            .orderBy($"Cantidad" desc)
            .show(3)

+---------------+--------+
|   Neighborhood|Cantidad|
+---------------+--------+
|     Tenderloin|   22785|
|South of Market|   16623|
|        Mission|   15912|
+---------------+--------+
only showing top 3 rows



p3f1: Unit = ()


## Pregunta 4: ¿Qué barrio de San Francisco tiene el peor tiempo de respuesta?

In [18]:
val p4 = df.select("Neighborhood","CallDate","Delay")
            .withColumn("Fecha",to_timestamp($"CallDate","MM/dd/yyyy"))
            .withColumn("Anio",year($"Fecha"))
p4.show(2)

+--------------------+----------+-----+-------------------+----+
|        Neighborhood|  CallDate|Delay|              Fecha|Anio|
+--------------------+----------+-----+-------------------+----+
|     Pacific Heights|01/11/2002| 2.95|2002-01-11 00:00:00|2002|
|Bayview Hunters P...|01/11/2002|  4.7|2002-01-11 00:00:00|2002|
+--------------------+----------+-----+-------------------+----+
only showing top 2 rows



p4: org.apache.spark.sql.DataFrame = [Neighborhood: string, CallDate: string ... 3 more fields]


In [19]:
val p4f2 = p4.select("Neighborhood","Delay")
            .where($"Anio"===2018)
            .groupBy("Neighborhood")
            .avg("Delay")
            .withColumnRenamed("avg(Delay)","Media")
            .orderBy($"Media" desc)
p4f2.show(2)

+------------+-----------------+
|Neighborhood|            Media|
+------------+-----------------+
|   Chinatown|6.190314101143033|
|    Presidio|5.829227011272873|
+------------+-----------------+
only showing top 2 rows



p4f2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Neighborhood: string, Media: double]


In [20]:
val p5 = df.select("CallDate")
            .withColumn("Fecha",to_timestamp($"CallDate","MM/dd/yyyy"))
            .withColumn("Anio",year($"Fecha"))

p5: org.apache.spark.sql.DataFrame = [CallDate: string, Fecha: timestamp ... 1 more field]


In [21]:
val p5f2 = p5.select("Fecha")
            .where($"Anio"===2018)
            .withColumn("week_of_year", weekofyear($"Fecha"))
            .withColumnRenamed("week_of_year","Semana")
            .groupBy("Semana")
            .count()
            .withColumnRenamed("count","Cantidad")
            .orderBy($"Cantidad" desc)
p5f2.show(3)

+------+--------+
|Semana|Cantidad|
+------+--------+
|    22|     259|
|    40|     255|
|    43|     250|
+------+--------+
only showing top 3 rows



p5f2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Semana: int, Cantidad: bigint]


In [22]:
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: integer (nullable = true)
 |-- Box: integer (nullable = true)
 |-- OriginalPriority: integer (nullable = true)
 |-- Priority: integer (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: integer (nullable = tr

In [26]:
val p6 = df.select("Neighborhood","ZipCode")
            .groupBy("Neighborhood","ZipCode")
            .count()
p6.show()

+--------------------+-------+-----+
|        Neighborhood|ZipCode|count|
+--------------------+-------+-----+
|        Inner Sunset|  94122| 2161|
|Bayview Hunters P...|  94124| 9150|
|        Inner Sunset|  94114|   20|
|  West of Twin Peaks|  94112|  760|
|      Haight Ashbury|  94114|   21|
|           Glen Park|  94110|   25|
|           Excelsior|  94112| 3237|
|        Russian Hill|  94109| 2261|
|                None|  94124|    7|
|           Chinatown|  94133| 1861|
|     Pacific Heights|  94115| 2100|
|Oceanview/Merced/...|  94127|   12|
|        Potrero Hill|  94103|    5|
|        Inner Sunset|  94117|  224|
|    Golden Gate Park|  94117|  107|
|                None|   null|  141|
|          Noe Valley|  94131|  763|
|    Western Addition|  94117|  315|
|        McLaren Park|  94112|   36|
|      Outer Richmond|  94121| 4121|
+--------------------+-------+-----+
only showing top 20 rows



p6: org.apache.spark.sql.DataFrame = [Neighborhood: string, ZipCode: int ... 1 more field]


In [87]:
df.write.format("parquet").save("EndToEndDF")

In [27]:
p6.stat.corr("ZipCode","count")

res19: Double = 0.06771510346381836
