In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,substring,desc,lag
from pyspark.sql.window import Window

In [35]:
# Start (or reuse, if one already exists) a SparkSession in local mode
# for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Growth of Airbnb')
    .getOrCreate()
)

In [36]:
# Load the Airbnb search details CSV. The first line is treated as the
# header row and column types are inferred from the data.
df_input = (
    spark.read
    .option('header', 'True')
    .option('InferSchema', 'True')
    .csv('airbnb_search_details.csv')
)
# Uncomment to preview the first rows:
#df_input.show(5)

In [42]:
## Count hosts per sign-up year.
## `period` is the first four characters of `host_since` (the year), and
## `current_year_count` is the number of rows falling in that year.
## NOTE: despite its name, df_monthly_count holds *yearly* counts; the name
## is kept because later cells reference it.
df_monthly_count = (
    df_input
    # Rows without a host_since date would form a null `period` group that
    # sorts first and would be picked up as the "previous year" of the
    # earliest real year, so they are excluded up front.
    .filter(col('host_since').isNotNull())
    .withColumn("period", substring('host_since', 1, 4))
    .groupBy('period')
    .count()
    .withColumnRenamed("count", "current_year_count")
    .orderBy('period')
)
#df_monthly_count.show()

In [43]:
## Attach the count of the preceding period to each row.
## lag(..., 1) reads the row immediately before the current one in `period`
## order, so if a year is absent from the data the value comes from the
## nearest earlier year rather than the calendar year before. The earliest
## period has no preceding row and gets null here.
## The window has no partitionBy, so it is evaluated over all rows together;
## that is fine for a frame holding one row per period.
windowSpec = Window.orderBy("period")
df_monthly_count = df_monthly_count.withColumn(
    "previous_year_count",
    lag("current_year_count", 1).over(windowSpec),
)
#df_monthly_count.show()

In [49]:
## Year-over-year growth, in percent:
##     (current_year_count - previous_year_count) / previous_year_count * 100
## The earliest period has no previous count, so both previous_year_count and
## growth_percentage are null for it; they are replaced with 0. A 0 in that
## first row therefore means "no baseline available", not "no growth".
df_output = (
    df_monthly_count
    .withColumn(
        "growth_percentage",
        ((col('current_year_count') - col('previous_year_count'))
         / col('previous_year_count')) * 100,
    )
    # Only these two columns can be null; name them so the fill is explicit.
    .na.fill(value=0, subset=["previous_year_count", "growth_percentage"])
    # Sort explicitly: without it the displayed row order depends on how the
    # preceding window operation happened to arrange the rows.
    .orderBy('period')
)
df_output.show()

+------+------------------+-------------------+-------------------+
|period|current_year_count|previous_year_count|  growth_percentage|
+------+------------------+-------------------+-------------------+
|  2009|                 2|                  0|                0.0|
|  2010|                 4|                  2|              100.0|
|  2011|                 9|                  4|              125.0|
|  2012|                10|                  9|  11.11111111111111|
|  2013|                30|                 10|              200.0|
|  2014|                33|                 30|               10.0|
|  2015|                33|                 33|                0.0|
|  2016|                28|                 33|-15.151515151515152|
|  2017|                11|                 28| -60.71428571428571|
+------+------------------+-------------------+-------------------+

