In [1]:
# This file will generate insightful plots and data related to sidewalk cafe application data
!pip -q install pyspark
!pip -q install altair
!pip -q install pandas

from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import weekofyear
from pyspark.sql import functions as f
from pyspark.sql.functions import concat_ws
from pyspark.sql.types import StringType
import altair as alt
import pandas as pd

# Get (or create) the Spark session that the cells below read data through.
# NOTE(review): the app name does not mention sidewalk cafes — confirm it is intentional.
spark = (
    SparkSession.builder
    .appName("Agg_Hotness-Max")
    .getOrCreate()
)

In [2]:
# Load the application CSV (first row is the header, column types inferred)
# and keep only the columns used by the analysis below.
cafe = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load("New_Sidewalk_Cafe_Applications.csv")
    .select("LICENSE_NBR", "APP_STATUS_DATE", "INTAKE_DD", "ISSUANCE", "SWC_SQ_FT")
)
cafe.show()

+-----------+---------------+----------+--------------+---------+
|LICENSE_NBR|APP_STATUS_DATE| INTAKE_DD|      ISSUANCE|SWC_SQ_FT|
+-----------+---------------+----------+--------------+---------+
|2090193-DCA|     01/07/2020|06/06/2019|        Issued|      335|
|1385278-DCA|     12/23/2019|12/20/2019|Pending Review|       52|
|1206030-DCA|     03/05/2020|12/11/2019|Pending Review|      285|
|2023941-DCA|     02/27/2019|02/25/2019|        Issued|      148|
|2021840-DCA|     06/21/2019|02/12/2019|        Issued|      294|
|2082893-DCA|     05/28/2019|12/27/2018|        Issued|      111|
|2026141-DCA|     04/09/2019|04/09/2019|        Issued|       66|
|1282959-DCA|     06/26/2019|09/04/2018|        Issued|      143|
|1145512-DCA|     01/13/2020|12/18/2019|Pending Review|      405|
|1420350-DCA|     01/09/2020|12/26/2018|        Issued|      326|
|2034028-DCA|     05/02/2019|12/31/2018|        Issued|      446|
|0959237-DCA|     12/30/2019|04/16/2019|Pending Review|      289|
|1252802-D

In [3]:
# INTAKE_DD is shown as MM/DD/YYYY text in the preview above, so the date parts
# are cut out by position: month = chars 1-2, day = chars 4-5, year = chars 7-10.
cafe = (
    cafe
    .withColumn("intake_year", f.substring("INTAKE_DD", 7, 4))
    .withColumn("intake_day", f.substring("INTAKE_DD", 4, 2))
    .withColumn("intake_month", f.substring("INTAKE_DD", 1, 2))
)

# Keep only 2019 and 2020 intakes, sorted newest first.
year_is_2019 = cafe["intake_year"] == '2019'
year_is_2020 = cafe["intake_year"] == '2020'
newest_first = [
    cafe["intake_year"].desc(),
    cafe["intake_month"].desc(),
    cafe["intake_day"].desc(),
]
cafe = cafe.where(year_is_2019 | year_is_2020).orderBy(*newest_first)
cafe.show()

+-----------+---------------+----------+--------------+---------+-----------+----------+------------+
|LICENSE_NBR|APP_STATUS_DATE| INTAKE_DD|      ISSUANCE|SWC_SQ_FT|intake_year|intake_day|intake_month|
+-----------+---------------+----------+--------------+---------+-----------+----------+------------+
|0949696-DCA|     06/19/2020|06/19/2020|Pending Review|      414|       2020|        19|          06|
|1305734-DCA|     06/18/2020|06/18/2020|Pending Review|      412|       2020|        18|          06|
|1026947-DCA|     06/18/2020|06/18/2020|Pending Review|      294|       2020|        18|          06|
|1347915-DCA|     06/16/2020|06/16/2020|Pending Review|      438|       2020|        16|          06|
|1190138-DCA|     06/12/2020|06/12/2020|        Issued|      336|       2020|        12|          06|
|1139335-DCA|     06/09/2020|06/09/2020|Pending Review|      575|       2020|        09|          06|
|2079555-DCA|     06/17/2020|05/13/2020|Pending Review|      381|       2020|     

The latest intake date we have for 2020 is 06/19/2020, whereas for 2019 we have the entire year.

In [4]:
# Count how many applications have an intake date in each year.
is_2019 = cafe["intake_year"] == '2019'
is_2020 = cafe["intake_year"] == '2020'
num_2019_apps = cafe.filter(is_2019).count()
num_2020_apps = cafe.filter(is_2020).count()
print("The number of applications in 2019 was: %d" % num_2019_apps)
print("The number of applications in 2020 was: %d" % num_2020_apps)

The number of applications in 2019 was: 583
The number of applications in 2020 was: 114


In [5]:
# Application counts per intake month for each year. intake_month is a
# zero-padded string, so sorting on it gives calendar order.
apps_2019 = cafe.filter(cafe["intake_year"] == '2019')
apps_2020 = cafe.filter(cafe["intake_year"] == '2020')
num_2019_apps_per_month = apps_2019.groupBy("intake_month").count().orderBy("intake_month")
num_2020_apps_per_month = apps_2020.groupBy("intake_month").count().orderBy("intake_month")
num_2019_apps_per_month.show()
num_2020_apps_per_month.show()

+------------+-----+
|intake_month|count|
+------------+-----+
|          01|   43|
|          02|   41|
|          03|   65|
|          04|  110|
|          05|   60|
|          06|   15|
|          07|   16|
|          08|   39|
|          09|   80|
|          10|   21|
|          11|   31|
|          12|   62|
+------------+-----+

+------------+-----+
|intake_month|count|
+------------+-----+
|          01|   33|
|          02|   58|
|          03|   13|
|          04|    3|
|          05|    1|
|          06|    6|
+------------+-----+



The number of applications plummeted at the beginning of the pandemic, and we don't have data that goes further.

In [6]:
# Average SWC_SQ_FT per intake month, one table per year.
sqft_2019_apps_per_month = (
    cafe.filter(cafe["intake_year"] == '2019')
    .groupBy("intake_month")
    .mean("SWC_SQ_FT")
    .orderBy("intake_month")
)
sqft_2020_apps_per_month = (
    cafe.filter(cafe["intake_year"] == '2020')
    .groupBy("intake_month")
    .mean("SWC_SQ_FT")
    .orderBy("intake_month")
)
sqft_2019_apps_per_month.show()
sqft_2020_apps_per_month.show()

# pandas copies of the two tables, used as chart data further down.
pd_sqft_2019_apps_per_month = sqft_2019_apps_per_month.toPandas()
pd_sqft_2020_apps_per_month = sqft_2020_apps_per_month.toPandas()

+------------+------------------+
|intake_month|    avg(SWC_SQ_FT)|
+------------+------------------+
|          01| 284.6046511627907|
|          02|             277.0|
|          03|             242.6|
|          04|254.29090909090908|
|          05|271.51666666666665|
|          06|321.73333333333335|
|          07|          319.9375|
|          08| 289.7435897435897|
|          09|          247.8625|
|          10| 319.9047619047619|
|          11|262.48387096774195|
|          12| 248.2258064516129|
+------------+------------------+

+------------+------------------+
|intake_month|    avg(SWC_SQ_FT)|
+------------+------------------+
|          01| 249.3030303030303|
|          02|229.06896551724137|
|          03| 340.0769230769231|
|          04|             260.0|
|          05|             381.0|
|          06|             411.5|
+------------+------------------+



In [7]:
# Both panels use the same y-axis range so the side-by-side lines can be
# compared directly; previously only the 2019 panel had a fixed range.
SQFT_Y_DOMAIN = (0, 450)


def _monthly_sqft_chart(monthly_avg, year):
    """Return a line chart of average SWC_SQ_FT per intake month.

    monthly_avg: pandas DataFrame with 'intake_month' and 'avg(SWC_SQ_FT)' columns.
    year: year to show in the chart title.
    """
    return alt.Chart(monthly_avg).mark_line().encode(
        alt.X('intake_month', title='Intake month'),
        alt.Y(
            'avg(SWC_SQ_FT)',
            title='Average square footage',
            scale=alt.Scale(domain=SQFT_Y_DOMAIN),
        ),
    ).properties(title='Average Square Footage of Cafe Applications per Month in %s' % year)


sqft_2019 = _monthly_sqft_chart(pd_sqft_2019_apps_per_month, 2019)
sqft_2020 = _monthly_sqft_chart(pd_sqft_2020_apps_per_month, 2020)

# Last expression of the cell: renders the two charts next to each other.
alt.hconcat(
   sqft_2019,
   sqft_2020,
)

Since there are so few data points between March and June of 2020, the averages are skewed and it is unfair to compare them to the same months in 2019.

In [8]:
# Average SWC_SQ_FT across all applications in each intake year.
sqft_apps_by_year = (
    cafe
    .groupBy("intake_year")
    .mean("SWC_SQ_FT")
)
sqft_apps_by_year.show()

+-----------+------------------+
|intake_year|    avg(SWC_SQ_FT)|
+-----------+------------------+
|       2020| 259.3333333333333|
|       2019|265.77358490566036|
+-----------+------------------+



In [9]:
# Read the yearly averages from sqft_apps_by_year instead of retyping them from
# its printed output, so the result stays correct if the underlying data changes.
avg_sqft_by_year = {
    row["intake_year"]: row["avg(SWC_SQ_FT)"]
    for row in sqft_apps_by_year.collect()
}
new = avg_sqft_by_year['2020']  # intake_year values are strings (cut from INTAKE_DD)
old = avg_sqft_by_year['2019']
pct_change = ((new - old) / (old)) * 100
# NOTE(review): the message calls the change negligible regardless of its size —
# revisit the wording if the data (and therefore the value) changes.
print("The percent change is: %f and is therefore negligible." % pct_change)

The percent change is: -2.423210 and is therefore negligible.


In [10]:
# Stop the Spark session now that the analysis is finished.
spark.stop()