In [1]:
# This file will generate insightful plots for listing data by month
!pip -q install pyspark
!pip -q install altair
!pip -q install pandas

from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import weekofyear
from pyspark.sql import functions as f
from pyspark.sql.functions import concat_ws
from pyspark.sql.types import StringType
import altair as alt
import pandas as pd

# Create the Spark session used by the cells below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Agg_Listings-Max")
    .getOrCreate()
)

In [2]:
# Load the listings CSV: the first row is the header and column types are inferred.
hotness = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load("NYC_Listings.csv")
)

# Preview the columns used downstream, with the yyyymm month key cast to a string.
hotness.select(
    hotness["month_date_yyyymm"].cast(StringType()),
    "postal_code",
    "zip_name",
    "Area",
    "median_listing_price",
    "active_listing_count",
    "median_days_on_market",
    "new_listing_count",
    "total_listing_count",
).show()

+-----------------+-----------+-----------------+----+--------------------+--------------------+---------------------+-----------------+-------------------+
|month_date_yyyymm|postal_code|         zip_name|Area|median_listing_price|active_listing_count|median_days_on_market|new_listing_count|total_listing_count|
+-----------------+-----------+-----------------+----+--------------------+--------------------+---------------------+-----------------+-------------------+
|           202102|      10467|        bronx, ny| NYC|            214000.0|                  78|                157.5|               16|                 91|
|           202102|      10310|staten island, ny| NYC|            574499.5|                  47|                 44.0|               20|                 68|
|           202102|      10303|staten island, ny| NYC|            444450.0|                  56|                 65.0|               20|                 85|
|           202102|      10314|staten island, ny| NYC|    

In [3]:
# Collapse the listing rows to one row per (Area, month): the median columns are
# averaged, the listing counts are summed, and the result is ordered by month.
avg_hotness = (
    hotness
    .groupBy('Area', 'month_date_yyyymm')
    .agg(
        f.avg("median_days_on_market").alias("avg_days_on_market"),
        f.avg("median_listing_price").alias("avg_listing_price"),
        f.sum("active_listing_count").alias("sum_active_listing_count"),
        f.sum("new_listing_count").alias("sum_new_listing_count"),
        f.sum("total_listing_count").alias("sum_total_listing_count"),
    )
    .orderBy('month_date_yyyymm')
)
avg_hotness.show()

+-----+-----------------+------------------+------------------+------------------------+---------------------+-----------------------+
| Area|month_date_yyyymm|avg_days_on_market| avg_listing_price|sum_active_listing_count|sum_new_listing_count|sum_total_listing_count|
+-----+-----------------+------------------+------------------+------------------------+---------------------+-----------------------+
|Metro|           201607| 77.83412322274881| 551566.1208530805|                   94568|                20624|                 102763|
|  NYC|           201607|  69.2012987012987| 973732.7207792208|                   12368|                 2700|                  17443|
|  NYC|           201608|  78.5576923076923|  965631.467948718|                   11986|                 2292|                  16966|
|Metro|           201608|  85.0574644549763| 549976.2132701422|                   91865|                17756|                  99651|
|  NYC|           201609| 81.05900621118012| 948859.860

In [4]:
def _monthly_frame_for_area(area_name):
    """Return the monthly aggregates of one area as a pandas DataFrame.

    Filters ``avg_hotness`` to rows whose ``Area`` equals ``area_name``, splits
    the ``month_date_yyyymm`` key into string ``year`` (first four characters)
    and ``month`` (next two characters) columns, prints the Spark frame with
    ``show()``, and converts the plotting columns to pandas.

    Args:
        area_name: Value of the ``Area`` column to keep (e.g. ``"NYC"``).

    Returns:
        pandas.DataFrame with columns ``month``, ``year``, ``avg_listing_price``,
        ``sum_active_listing_count``, ``avg_days_on_market``,
        ``sum_new_listing_count`` and ``sum_total_listing_count``.
    """
    # Cast explicitly instead of relying on Spark's implicit int -> string
    # coercion when slicing the yyyymm key.
    yyyymm = f.col("month_date_yyyymm").cast(StringType())

    area_frame = (
        avg_hotness
        .where(avg_hotness["Area"] == area_name)
        # Spark's substring position is 1-based: characters 1-4 are the year,
        # characters 5-6 are the month.
        .withColumn("year", f.substring(yyyymm, 1, 4))
        .withColumn("month", f.substring(yyyymm, 5, 2))
    )
    area_frame.show()

    return area_frame.select(
        'month',
        'year',
        'avg_listing_price',
        'sum_active_listing_count',
        'avg_days_on_market',
        'sum_new_listing_count',
        'sum_total_listing_count',
    ).toPandas()


avg_hotness_nyc = _monthly_frame_for_area("NYC")

avg_hotness_metro = _monthly_frame_for_area("Metro")


+----+-----------------+------------------+------------------+------------------------+---------------------+-----------------------+----+-----+
|Area|month_date_yyyymm|avg_days_on_market| avg_listing_price|sum_active_listing_count|sum_new_listing_count|sum_total_listing_count|year|month|
+----+-----------------+------------------+------------------+------------------------+---------------------+-----------------------+----+-----+
| NYC|           201607|  69.2012987012987| 973732.7207792208|                   12368|                 2700|                  17443|2016|   07|
| NYC|           201608|  78.5576923076923|  965631.467948718|                   11986|                 2292|                  16966|2016|   08|
| NYC|           201609| 81.05900621118012| 948859.8602484472|                   12370|                 3556|                  17356|2016|   09|
| NYC|           201610| 77.16564417177914| 936934.2515337423|                   13007|                 3260|                  180

In [5]:
# Average listing price by month, coloured and dashed by year:
# NYC first, then the metro area, concatenated horizontally.
nyc, metro = (
    alt.Chart(frame)
    .mark_line()
    .encode(x='month', y='avg_listing_price', color='year', strokeDash='year')
    .properties(title=chart_title)
    for frame, chart_title in (
        (avg_hotness_nyc, 'NYC: Avg Listing Price'),
        (avg_hotness_metro, 'NYC Metro Area: Avg Listing Price'),
    )
)

alt.hconcat(nyc, metro)

In [6]:
# Active listing count by month, coloured and dashed by year:
# NYC first, then the metro area, concatenated horizontally.
# The charts are named after the data they plot (the metro chart was
# previously stored under a name ending in `_nyc`).
nyc_active, metro_active = (
    alt.Chart(frame)
    .mark_line()
    .encode(x='month', y='sum_active_listing_count', color='year', strokeDash='year')
    .properties(title=chart_title)
    for frame, chart_title in (
        (avg_hotness_nyc, 'NYC: Active Listing Count'),
        (avg_hotness_metro, 'NYC Metro Area: Active Listing Count'),
    )
)

alt.hconcat(nyc_active, metro_active)

In [7]:
# New listing count by month, coloured and dashed by year:
# NYC first, then the metro area, concatenated horizontally.
# The charts are named after the data they plot (the metro chart was
# previously stored under a name ending in `_nyc`).
nyc_new, metro_new = (
    alt.Chart(frame)
    .mark_line()
    .encode(x='month', y='sum_new_listing_count', color='year', strokeDash='year')
    .properties(title=chart_title)
    for frame, chart_title in (
        (avg_hotness_nyc, 'NYC: New Listing Count'),
        (avg_hotness_metro, 'NYC Metro Area: New Listing Count'),
    )
)

alt.hconcat(nyc_new, metro_new)

In [8]:
# Total listing count by month, coloured and dashed by year:
# NYC first, then the metro area, concatenated horizontally.
# The charts are named after the data they plot (the metro chart was
# previously stored under a name ending in `_nyc`).
nyc_total, metro_total = (
    alt.Chart(frame)
    .mark_line()
    .encode(x='month', y='sum_total_listing_count', color='year', strokeDash='year')
    .properties(title=chart_title)
    for frame, chart_title in (
        (avg_hotness_nyc, 'NYC: Total Listing Count'),
        (avg_hotness_metro, 'NYC Metro Area: Total Listing Count'),
    )
)

alt.hconcat(nyc_total, metro_total)

In [9]:
# Average days on market by month, coloured and dashed by year:
# NYC first, then the metro area, concatenated horizontally.
# The charts are named after the data they plot (the metro chart was
# previously stored under a name ending in `_nyc`).
nyc_days, metro_days = (
    alt.Chart(frame)
    .mark_line()
    .encode(x='month', y='avg_days_on_market', color='year', strokeDash='year')
    .properties(title=chart_title)
    for frame, chart_title in (
        (avg_hotness_nyc, 'NYC: Avg Days on Market'),
        (avg_hotness_metro, 'NYC Metro Area: Avg Days on Market'),
    )
)

alt.hconcat(nyc_days, metro_days)

In [None]:
# Stop the Spark session; the charts above read from the pandas frames
# produced by toPandas(), not from Spark.
spark.stop()