In [10]:
# This file will generate insightful plots by month
!pip install altair

from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import weekofyear
from pyspark.sql import functions as f
from pyspark.sql.functions import concat_ws
from pyspark.sql.types import StringType

# Obtain the notebook's Spark session: getOrCreate() hands back the active
# session if there is one, otherwise it builds a new one under this app name.
spark = (
    SparkSession.builder
    .appName("Agg_Hotness-Max")
    .getOrCreate()
)

In [11]:
# Load the hotness CSV: the first row is the header and column types are inferred.
hotness = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .load("NYC_Hotness.csv")
)

# Preview the score columns, with the yyyymm month key cast to a string.
hotness.select(
    hotness["month_date_yyyymm"].cast(StringType()),
    "postal_code",
    "zip_name",
    "Area",
    "hotness_score",
    "supply_score",
    "demand_score",
).show()

+-----------------+-----------+-------------------+----+-------------+------------+------------+
|month_date_yyyymm|postal_code|           zip_name|Area|hotness_score|supply_score|demand_score|
+-----------------+-----------+-------------------+----+-------------+------------+------------+
|           202102|      10307|  staten island, ny| NYC|    53.155427| 63.02905059| 43.28180265|
|           202102|      11105|        astoria, ny| NYC|    17.994675| 30.52895124| 5.460398204|
|           202102|      10457|          bronx, ny| NYC|     16.56599| 27.69939991| 5.432579581|
|           202102|      11429| queens village, ny| NYC|    38.540715| 44.16405039| 32.91737869|
|           202102|      11364|oakland gardens, ny| NYC|    27.506657| 52.66065255| 2.352660653|
|           202102|      11040|  new hyde park, ny| NYC|    34.958073| 58.37539244| 11.54075428|
|           202102|      10473|          bronx, ny| NYC|    10.187577| 13.97289671| 6.402257283|
|           202102|      10468

In [12]:
# Mean of the three scores for every (Area, month) pair, sorted by the month key.
avg_hotness = (
    hotness
    .groupBy('Area', 'month_date_yyyymm')
    .mean('hotness_score', 'supply_score', 'demand_score')
    .orderBy('month_date_yyyymm')
)
avg_hotness.show()

+-----+-----------------+------------------+------------------+------------------+
| Area|month_date_yyyymm|avg(hotness_score)| avg(supply_score)| avg(demand_score)|
+-----+-----------------+------------------+------------------+------------------+
|Metro|           201708| 51.20158204840765| 53.12881659246367|49.274347492642036|
|  NYC|           201708| 51.12845599421963|52.079069812525965| 50.17784213138727|
|  NYC|           201709| 51.27350682080924| 53.19612019471672|49.350893398589584|
|Metro|           201709| 51.65980989936303|52.465291113184804|50.854328656802444|
|Metro|           201710| 53.25264441783435| 55.16319186329174| 51.34209697494271|
|  NYC|           201710| 52.79212968208094| 58.18168083513298|47.402578525682095|
|  NYC|           201711| 54.38690575722545|  61.9055129951272|46.868298509710975|
|Metro|           201711| 53.97210641528668|  56.2131140334447| 51.73109881603564|
|  NYC|           201712| 53.83396394797687| 60.54651406894802| 47.12141379257225|
|Met

In [18]:
def _area_monthly_scores(avg_df, area):
    """Return the monthly average scores for one area with `year`/`month` columns.

    Args:
        avg_df: Spark DataFrame holding `Area`, `month_date_yyyymm` and the
            `avg(hotness_score)`, `avg(supply_score)`, `avg(demand_score)`
            columns (i.e. `avg_hotness`).
        area: Value of the `Area` column to keep, e.g. "NYC" or "Metro".

    Returns:
        Spark DataFrame with columns month, year, avg(hotness_score),
        avg(supply_score), avg(demand_score).

    Side effect:
        Prints the filtered frame (before the column selection) with `.show()`.
    """
    # Cast explicitly so the substring calls operate on text rather than
    # relying on an implicit conversion of the month key.
    yyyymm = f.col("month_date_yyyymm").cast(StringType())
    with_date_parts = (
        avg_df
        .where(f.col("Area") == area)
        # Spark's substring is 1-based: characters 1-4 are the year, 5-6 the month.
        .withColumn("year", f.substring(yyyymm, 1, 4))
        .withColumn("month", f.substring(yyyymm, 5, 2))
    )
    with_date_parts.show()
    return with_date_parts.select(
        'month', 'year', 'avg(hotness_score)', 'avg(supply_score)', 'avg(demand_score)'
    )


# NYC: per-year Spark frames for the CSV export, plus a pandas copy for plotting.
nyc_monthly = _area_monthly_scores(avg_hotness, "NYC")
avg_hotness_nyc_2020 = nyc_monthly.where(nyc_monthly['year'] == '2020')
avg_hotness_nyc_2019 = nyc_monthly.where(nyc_monthly['year'] == '2019')
avg_hotness_nyc = nyc_monthly.toPandas()

# Metro: same outputs as for NYC.
metro_monthly = _area_monthly_scores(avg_hotness, "Metro")
avg_hotness_metro_2020 = metro_monthly.where(metro_monthly['year'] == '2020')
avg_hotness_metro_2019 = metro_monthly.where(metro_monthly['year'] == '2019')
avg_hotness_metro = metro_monthly.toPandas()


+----+-----------------+------------------+------------------+------------------+----+-----+
|Area|month_date_yyyymm|avg(hotness_score)| avg(supply_score)| avg(demand_score)|year|month|
+----+-----------------+------------------+------------------+------------------+----+-----+
| NYC|           201708| 51.12845599421963|52.079069812525965| 50.17784213138727|2017|   08|
| NYC|           201709| 51.27350682080924| 53.19612019471672|49.350893398589584|2017|   09|
| NYC|           201710| 52.79212968208094| 58.18168083513298|47.402578525682095|2017|   10|
| NYC|           201711| 54.38690575722545|  61.9055129951272|46.868298509710975|2017|   11|
| NYC|           201712| 53.83396394797687| 60.54651406894802| 47.12141379257225|2017|   12|
| NYC|           201801|49.682897040462414| 57.59762171144508|41.768172372670534|2018|   01|
| NYC|           201802| 46.65718349132948| 57.18678042383236|36.127586604647405|2018|   02|
| NYC|           201803|45.185328369942226| 55.36405794601158|35.00659

In [19]:
# Generate data files to be used to plot realtor data against AQI.
# Each output path is paired with the frame for the SAME area and year
# (the 2019 Metro file previously received the 2020 Metro data).
yearly_exports = {
    "avg_hotness_nyc_2020.csv": avg_hotness_nyc_2020,
    "avg_hotness_nyc_2019.csv": avg_hotness_nyc_2019,
    "avg_hotness_metro_2020.csv": avg_hotness_metro_2020,
    "avg_hotness_metro_2019.csv": avg_hotness_metro_2019,
}

for out_path, yearly_df in yearly_exports.items():
    (
        yearly_df
        .coalesce(1)  # single partition, so a single part file per output
        .write
        .mode("overwrite")  # the default mode errors if the path exists, breaking re-runs
        .option("header", True)
        .csv(out_path)
    )

In [20]:
import altair as alt
import pandas as pd

In [21]:
# Hotness over the months, one line per year (colour and dash both keyed on year).
nyc = (
    alt.Chart(avg_hotness_nyc)
    .mark_line()
    .encode(
        alt.X('month'),
        alt.Y('avg(hotness_score)'),
        alt.Color('year'),
        alt.StrokeDash('year'),
    )
    .properties(title='NYC Hotness')
)

metro = (
    alt.Chart(avg_hotness_metro)
    .mark_line()
    .encode(
        alt.X('month'),
        alt.Y('avg(hotness_score)'),
        alt.Color('year'),
        alt.StrokeDash('year'),
    )
    .properties(title='NYC Metro Area Hotness')
)

# Show the two charts next to each other.
alt.hconcat(nyc, metro)

In [16]:
# Encodings common to both NYC charts: months on x, one styled line per year.
nyc_year_encoding = {'x': 'month', 'color': 'year', 'strokeDash': 'year'}

sup_nyc = (
    alt.Chart(avg_hotness_nyc)
    .mark_line()
    .encode(y='avg(supply_score)', **nyc_year_encoding)
    .properties(title='NYC Supply Score')
)

dem_nyc = (
    alt.Chart(avg_hotness_nyc)
    .mark_line()
    .encode(y='avg(demand_score)', **nyc_year_encoding)
    .properties(title='NYC Demand Score')
)

# Supply on the left, demand on the right.
alt.hconcat(sup_nyc, dem_nyc)

In [17]:
# Base line chart for the Metro frame; each chart below only adds its y channel and title.
metro_lines = alt.Chart(avg_hotness_metro).mark_line().encode(
    x='month',
    color='year',
    strokeDash='year',
)

sup_metro = metro_lines.encode(y='avg(supply_score)').properties(title='Metro Supply Score')
dem_metro = metro_lines.encode(y='avg(demand_score)').properties(title='Metro Demand Score')

# Supply on the left, demand on the right.
alt.hconcat(sup_metro, dem_metro)

In [25]:
# Plain-text dump of the NYC monthly averages; at this point avg_hotness_nyc is
# the pandas DataFrame produced by the earlier toPandas() call.
print(avg_hotness_nyc)

   month  year  avg(hotness_score)  avg(supply_score)  avg(demand_score)
0     08  2017           51.128456          52.079070          50.177842
1     09  2017           51.273507          53.196120          49.350893
2     10  2017           52.792130          58.181681          47.402579
3     11  2017           54.386906          61.905513          46.868299
4     12  2017           53.833964          60.546514          47.121414
5     01  2018           49.682897          57.597622          41.768172
6     02  2018           46.657183          57.186780          36.127587
7     03  2018           45.185328          55.364058          35.006599
8     04  2018           42.819265          53.851781          31.786750
9     05  2018           41.818101          51.984897          31.651304
10    06  2018           39.505206          48.840333          30.170079
11    07  2018           38.253999          46.880666          29.627332
12    08  2018           38.101489          45.7418

In [9]:
# Stop the SparkSession created at the top of the notebook. Run this last:
# the cells above that work with Spark DataFrames need a live session.
spark.stop()