<a href="https://colab.research.google.com/github/crneubert/best-music/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SETUP**


In [1]:
!pip install pyspark
!pip install -U -q PyDrive

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-17-openjdk-amd64/bin"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import types as sparktypes

# import PySpark aggregate functions with underscores to avoid collision with Python sum, etc
from pyspark.sql.functions import sum as _sum, avg as _avg, count as _count
from pyspark.sql.functions import col, lit, round, month, to_date, when, expr

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number, lag, lead

In [3]:
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/calendar.csv
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/listings.csv
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/reviews.csv

In [4]:
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)



In [5]:
listings = sqlContext.read.csv("listings.csv", header = True)
reviews = sqlContext.read.csv("reviews.csv", header = True)
calendar = sqlContext.read.csv("calendar.csv", header = True)

# **ANALYSIS**

In [6]:
listings.show()

+-----+--------------------+---------+---------+--------------------+--------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|   id|                name|  host_id|host_name| neighbourhood_group| neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|
+-----+--------------------+---------+---------+--------------------+--------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
| 6369|Rooftop terrace r...|    13660|    Simon|           Chamartín|Hispanoamérica|40.45724| -3.67688|   Private room|   60|             1|               78| 2020-09-20|             0.58|                             1|             180|
|21853|Bright and airy room|    83531|    Abdel|    

In [7]:
reviews.show()

+----------+----------+
|listing_id|      date|
+----------+----------+
|      6369|2010-03-14|
|      6369|2010-03-23|
|      6369|2010-04-10|
|      6369|2010-04-21|
|      6369|2010-04-26|
|      6369|2010-05-10|
|      6369|2010-05-15|
|      6369|2010-05-23|
|      6369|2010-05-24|
|      6369|2010-06-25|
|      6369|2010-06-28|
|      6369|2010-09-09|
|      6369|2010-09-12|
|      6369|2010-09-15|
|      6369|2010-09-27|
|      6369|2010-10-10|
|      6369|2010-10-14|
|      6369|2010-11-02|
|      6369|2010-11-14|
|      6369|2010-11-28|
+----------+----------+
only showing top 20 rows



In [8]:
calendar.show()

+----------+----------+---------+------+--------------+--------------+--------------+
|listing_id|      date|available| price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+------+--------------+--------------+--------------+
|    167183|2021-04-15|        f|$45.00|        $45.00|             1|             5|
|      6369|2021-04-15|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-16|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-17|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-18|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-19|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-20|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-21|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-22|        t|$60.00|        $60.00

In [13]:
listings_clean = (listings.withColumnRenamed("id", "listing_id")
                         .filter(col("room_type").isin("Shared room", "Private room", "Entire home/apt", "Hotel room")))



calendar_clean = (calendar.withColumn("available_boolean", when(col("available") == "t", 0).otherwise(1))
                          .groupBy("listing_id")
                          .agg(_avg("available_boolean").alias("occupancy_rate")))

combo_pizza = (listings_clean.join(calendar_clean, on="listing_id")
                             .withColumn("number_of_reviews", col("number_of_reviews").cast("int"))
                             .withColumn("minimum_nights", col("minimum_nights").cast("int"))
                             .withColumn("price", col("price").cast("int"))
                             .withColumn("reviews_per_month", col("reviews_per_month").cast("int")))

combo_pizza.show()


+----------+--------------------+---------+--------------------+--------------------+---------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+--------------------+
|listing_id|                name|  host_id|           host_name| neighbourhood_group|  neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|      occupancy_rate|
+----------+--------------------+---------+--------------------+--------------------+---------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+--------------------+
|    167183|private house B &...|   796746|            Consuelo|           Hortaleza|        Piovera|40.45575| -3.64912|   Private room|   45|             1|               23|

Room Type

In [10]:
room_type = (combo_pizza.groupBy("room_type")
                        .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"))
                        .orderBy("avg_occupancy_rate", ascending = False))
room_type.show()

+---------------+------------------+
|      room_type|avg_occupancy_rate|
+---------------+------------------+
|   Private room| 0.555126992439271|
|Entire home/apt| 0.529229739202593|
|    Shared room|0.5122921401780076|
|     Hotel room|0.3468134414697336|
+---------------+------------------+



Next Part

In [15]:
print(combo_pizza.corr("occupancy_rate", "price"))
print(combo_pizza.corr("occupancy_rate", "reviews_per_month"))

0.016470467622479442
-0.03716518313219832


In [18]:
neighborhood = (combo_pizza.groupBy("neighbourhood")
                           .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"))
                           .orderBy("avg_occupancy_rate", ascending = False))
neighborhood.show()

+-----------------+------------------+
|    neighbourhood|avg_occupancy_rate|
+-----------------+------------------+
|       Valdemarín|0.8005479452054794|
|          Pavones| 0.789041095890411|
|         El Pardo|0.7547845397609602|
|          Atalaya|0.7095890410958904|
|      Media Legua| 0.684236906455074|
|          Acacias| 0.680184409717177|
|        El Goloso|0.6726940639269408|
|         Estrella|0.6471569728273074|
|         Delicias|0.6424726632002626|
|      Castillejos|0.6308780597350102|
|            Timón|0.6230322730438821|
|           Ventas|0.6197088401437404|
|          Chopera|0.6168632005905188|
| Alameda de Osuna|0.6132420091324198|
|San Juan Bautista|0.6119004510068119|
|      Los Angeles| 0.608573722856229|
|           Lucero|0.6072408366703085|
|          Legazpi|0.6058346017250126|
|        Fontarrón|0.6047322540473226|
|         Cármenes|0.6035481697731866|
+-----------------+------------------+
only showing top 20 rows



In [19]:
neighborhood_group = (combo_pizza.groupBy("neighbourhood_group")
                           .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"))
                           .orderBy("avg_occupancy_rate", ascending = False))
neighborhood_group.show()

+--------------------+-------------------+
| neighbourhood_group| avg_occupancy_rate|
+--------------------+-------------------+
|           Moratalaz| 0.6149968725457734|
|          Arganzuela| 0.6066468780809275|
|          Villaverde| 0.5807613564227848|
|             Barajas| 0.5736790606653622|
|       Ciudad Lineal| 0.5548000453518356|
|   Moncloa - Aravaca| 0.5541672944692754|
|              Retiro| 0.5468042183449434|
|              Latina| 0.5463370408657556|
|           Salamanca|  0.545655321094543|
|              Tetuán| 0.5414233896046414|
|            Chamberí| 0.5405170286927271|
|              Centro| 0.5381833396301434|
|Fuencarral - El P...| 0.5360639902802491|
|           Hortaleza| 0.5182060659603888|
|           Chamartín| 0.5152878845138193|
|               Usera| 0.5081418626287258|
|         Carabanchel| 0.5011714950887431|
|   Villa de Vallecas| 0.4999107076021516|
|San Blas - Canill...|0.48199241969413037|
|           Vicálvaro|0.46943675677841284|
+----------

What time is best to have AirBnb avaliable to rent?

In [20]:
best_time = (calendar.withColumn("available_boolean", when(col("available") == "t", 0).otherwise(1))
                     .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
                     .withColumn("month", month(col("date")))
                     .groupBy("month")
                     .agg(_avg("available_boolean").alias("month_occ_rate"))
                     .orderBy("month_occ_rate", ascending = False))
best_time.show()

+-----+-------------------+
|month|     month_occ_rate|
+-----+-------------------+
|    4| 0.6066581744721921|
|    3| 0.6048736530362901|
|    2| 0.6044101690224951|
|    1| 0.5929353572526791|
|   12| 0.5762184543689065|
|   11| 0.5760377903519056|
|   10| 0.5415431316156001|
|    5|0.49732868298891514|
|    9|0.48412261473891693|
|    8|0.47935875236587094|
|    7| 0.4533674709394584|
|    6| 0.4395830147321201|
+-----+-------------------+



In [12]:
listings.groupBy("room_type").count().collect()

[Row(room_type='15', count=2),
 Row(room_type='42', count=1),
 Row(room_type='Shared room', count=328),
 Row(room_type='59', count=1),
 Row(room_type='-3.70739', count=1),
 Row(room_type='22', count=3),
 Row(room_type='35', count=2),
 Row(room_type='16', count=2),
 Row(room_type='410', count=1),
 Row(room_type=None, count=58),
 Row(room_type='43', count=1),
 Row(room_type='100', count=1),
 Row(room_type='18', count=1),
 Row(room_type='61', count=1),
 Row(room_type='Hotel room', count=166),
 Row(room_type='40.42756', count=1),
 Row(room_type='78', count=1),
 Row(room_type='1200', count=1),
 Row(room_type='90', count=2),
 Row(room_type='19', count=1),
 Row(room_type='23', count=1),
 Row(room_type='55', count=1),
 Row(room_type='Entire home/apt', count=11286),
 Row(room_type='-3.67316', count=1),
 Row(room_type='38', count=1),
 Row(room_type='40', count=2),
 Row(room_type='376', count=1),
 Row(room_type='40.40902', count=1),
 Row(room_type='115', count=1),
 Row(room_type='-3.70747', count