<a href="https://colab.research.google.com/github/crneubert/best-music/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SETUP**


In [None]:
!pip install pyspark
!pip install -U -q PyDrive

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-17-openjdk-amd64/bin"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import types as sparktypes

# import PySpark aggregate functions with underscores to avoid collision with Python sum, etc
from pyspark.sql.functions import sum as _sum, avg as _avg, count as _count
from pyspark.sql.functions import col, lit, round, month, to_date, when, expr

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number, lag, lead

In [None]:
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/calendar.csv
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/listings.csv
!wget -q https://media.githubusercontent.com/media/crneubert/best-music/refs/heads/main/data/reviews.csv


In [None]:
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)



In [None]:
listings = sqlContext.read.csv("listings.csv", header = True)
reviews = sqlContext.read.csv("reviews.csv", header = True)
calendar = sqlContext.read.csv("calendar.csv", header = True)

# **ANALYSIS**

In [None]:
listings.show()

+-----+--------------------+---------+---------+--------------------+--------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|   id|                name|  host_id|host_name| neighbourhood_group| neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|
+-----+--------------------+---------+---------+--------------------+--------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
| 6369|Rooftop terrace r...|    13660|    Simon|           Chamartín|Hispanoamérica|40.45724| -3.67688|   Private room|   60|             1|               78| 2020-09-20|             0.58|                             1|             180|
|21853|Bright and airy room|    83531|    Abdel|    

In [None]:
reviews.show()

+----------+----------+
|listing_id|      date|
+----------+----------+
|      6369|2010-03-14|
|      6369|2010-03-23|
|      6369|2010-04-10|
|      6369|2010-04-21|
|      6369|2010-04-26|
|      6369|2010-05-10|
|      6369|2010-05-15|
|      6369|2010-05-23|
|      6369|2010-05-24|
|      6369|2010-06-25|
|      6369|2010-06-28|
|      6369|2010-09-09|
|      6369|2010-09-12|
|      6369|2010-09-15|
|      6369|2010-09-27|
|      6369|2010-10-10|
|      6369|2010-10-14|
|      6369|2010-11-02|
|      6369|2010-11-14|
|      6369|2010-11-28|
+----------+----------+
only showing top 20 rows



In [None]:
calendar.show()

+----------+----------+---------+------+--------------+--------------+--------------+
|listing_id|      date|available| price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+------+--------------+--------------+--------------+
|    167183|2021-04-15|        f|$45.00|        $45.00|             1|             5|
|      6369|2021-04-15|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-16|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-17|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-18|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-19|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-20|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-21|        t|$60.00|        $60.00|             1|          1125|
|      6369|2021-04-22|        t|$60.00|        $60.00

In [None]:
listings_clean = (listings.withColumnRenamed("id", "listing_id")
                         .filter(col("room_type").isin("Shared room", "Private room", "Entire home/apt", "Hotel room")))



calendar_clean = (calendar.withColumn("available_boolean", when(col("available") == "t", 0).otherwise(1))
                          .groupBy("listing_id")
                          .agg(_avg("available_boolean").alias("occupancy_rate")))

combo_pizza = (listings_clean.join(calendar_clean, on="listing_id")
                             .withColumn("number_of_reviews", col("number_of_reviews").cast("int"))
                             .withColumn("minimum_nights", col("minimum_nights").cast("int"))
                             .withColumn("price", col("price").cast("int"))
                             .withColumn("reviews_per_month", col("reviews_per_month").cast("int")))

combo_pizza.show()


+----------+--------------------+---------+--------------------+--------------------+---------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+--------------------+
|listing_id|                name|  host_id|           host_name| neighbourhood_group|  neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|      occupancy_rate|
+----------+--------------------+---------+--------------------+--------------------+---------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+--------------------+
|    167183|private house B &...|   796746|            Consuelo|           Hortaleza|        Piovera|40.45575| -3.64912|   Private room|   45|             1|               23|

Room Type

In [None]:
room_type = (combo_pizza.groupBy("room_type")
                        .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"))
                        .orderBy("avg_occupancy_rate", ascending = False))
room_type.show()

+---------------+------------------+
|      room_type|avg_occupancy_rate|
+---------------+------------------+
|   Private room| 0.555126992439271|
|Entire home/apt| 0.529229739202593|
|    Shared room|0.5122921401780076|
|     Hotel room|0.3468134414697336|
+---------------+------------------+



**Is there a correlation between Occupancy Rate and Listing Prices?**

In [None]:
print(combo_pizza.corr("occupancy_rate", "price"))


0.016470467622479442
-0.03716518313219832


**Correlation between Occupancy Rate and Reviews Per Month?**

In [None]:
print(combo_pizza.corr("occupancy_rate", "reviews_per_month"))

**Have a look at Occupancy Rate and Neighborhood/Neighborhood Groups?**

In [None]:
neighborhood = (combo_pizza.groupBy("neighbourhood")
                           .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"),
                                _count("*").alias("listing_per_neighborhood"))
                           .orderBy("listing_per_neighborhood", ascending = False)
                           .filter(col("listing_per_neighborhood") > 200))
neighborhood.show()

+----------------+-------------------+------------------------+
|   neighbourhood| avg_occupancy_rate|listing_per_neighborhood|
+----------------+-------------------+------------------------+
|     Embajadores|  0.571808401452593|                    2311|
|     Universidad| 0.5327504496648052|                    1867|
|         Palacio| 0.5442147728034978|                    1499|
|             Sol| 0.5236208206826426|                    1120|
|        Justicia| 0.5319452300725414|                     948|
|          Cortes| 0.4763859742308417|                     880|
|       Trafalgar| 0.5256238632524282|                     370|
| Palos de Moguer| 0.5667180884512449|                     337|
|            Goya| 0.5623816214597712|                     296|
|       Argüelles| 0.5772685215982591|                     280|
|       Recoletos| 0.5443681588125344|                     274|
|Puerta del Angel| 0.5729166096960768|                     271|
|      Guindalera| 0.5424394773107185|  

In [None]:
neighborhood_group = (combo_pizza.groupBy("neighbourhood_group")
                           .agg(_avg("occupancy_rate").alias("avg_occupancy_rate"),
                                _count("*").alias("listing_per_group"))
                           .orderBy("listing_per_group", ascending = False)
                           .filter(col("listing_per_group") > 100))
neighborhood_group.show()

#Centro looks like it has soooo many more than the others - could be good to look into Centro

+--------------------+-------------------+-----------------+
| neighbourhood_group| avg_occupancy_rate|listing_per_group|
+--------------------+-------------------+-----------------+
|              Centro| 0.5381833396301434|             8625|
|           Salamanca|  0.545655321094543|             1324|
|            Chamberí| 0.5405170286927271|             1248|
|          Arganzuela| 0.6066468780809275|             1102|
|              Tetuán| 0.5414233896046414|              810|
|         Carabanchel| 0.5011714950887431|              707|
|              Retiro| 0.5468042183449434|              662|
|       Ciudad Lineal| 0.5548000453518356|              649|
|  Puente de Vallecas|0.43851322767191103|              614|
|              Latina| 0.5463370408657556|              605|
|           Chamartín| 0.5152878845138193|              577|
|   Moncloa - Aravaca| 0.5541672944692754|              553|
|San Blas - Canill...|0.48199241969413037|              490|
|           Hortaleza| 0

In [None]:
centro = (combo_pizza.filter(col("neighbourhood_group") == "Centro")
                     .groupBy("neighbourhood") #Do I also need to group by Listing ID or was that done when globalizing occ rate?
                     .agg(round(_avg("price"), 2).alias("Average Price Per Neighborhood"),
                          _count("*").alias("Listings Per Neighborhood"))
)
centro.show() #Check out https://www.kaggle.com/datasets/kanchana1990/madrid-idealista-property-listings for combining with price and making metrics!

+-------------+------------------------------+-------------------------+
|neighbourhood|Average Price Per Neighborhood|Listings Per Neighborhood|
+-------------+------------------------------+-------------------------+
|  Universidad|                        108.45|                     1867|
|          Sol|                         128.2|                     1120|
|      Palacio|                        101.17|                     1499|
|     Justicia|                        108.44|                      948|
|       Cortes|                        196.07|                      880|
|  Embajadores|                        139.53|                     2311|
+-------------+------------------------------+-------------------------+



In [20]:
import glob
import kagglehub

local_dir = kagglehub.dataset_download("kanchana1990/madrid-idealista-property-listings")
csv_path = os.path.join(local_dir, "idealista_madrid.csv")

df = (spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(csv_path))

df.show()

Downloading from https://www.kaggle.com/api/v1/datasets/download/kanchana1990/madrid-idealista-property-listings?dataset_version_number=1...


100%|██████████| 580k/580k [00:00<00:00, 85.9MB/s]

Extracting files...





+--------------------+--------------------+--------------------+---------+-------+-----+-----+----+--------------------+--------------------+--------------+--------------------------+--------------------+
|                 url|          listingUrl|               title|       id|  price|baths|rooms|sqft|         description|             address|      typology|advertiserProfessionalName|      advertiserName|
+--------------------+--------------------+--------------------+---------+-------+-----+-----+----+--------------------+--------------------+--------------+--------------------------+--------------------+
|https://www.ideal...|https://www.ideal...|Piso en venta en ...|104027174|1920000|    3|    3| 183|Residencia única ...|   Recoletos, Madrid|         Pisos|           Promora Madrid |      Promora Madrid|
|https://www.ideal...|https://www.ideal...|Piso en venta en ...|102321942|1995000|    3|    3| 170|Preciosa reforma ...|  Castellana, Madrid|         Pisos|               Madrid MM

**What time is best to have AirBnb avaliable to rent?**

In [None]:
best_time = (calendar.withColumn("available_boolean", when(col("available") == "t", 0).otherwise(1))
                     .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
                     .withColumn("month", month(col("date")))
                     .groupBy("month")
                     .agg(round(_avg("available_boolean"),2).alias("month_occ_rate"),
                          _count("*").alias("listing_count")) #adding listing count check just in case there was some imbalance - not sure why half are identical
                     .orderBy("month", ascending = False)
)
best_time.show()

+-----+--------------+-------------+
|month|month_occ_rate|listing_count|
+-----+--------------+-------------+
|   12|          0.58|       608127|
|   11|          0.58|       588510|
|   10|          0.54|       608127|
|    9|          0.48|       588510|
|    8|          0.48|       608127|
|    7|          0.45|       608127|
|    6|          0.44|       588510|
|    5|           0.5|       608127|
|    4|          0.61|       589561|
|    3|           0.6|       608127|
|    2|           0.6|       549276|
|    1|          0.59|       608127|
+-----+--------------+-------------+



In [None]:
listings.groupBy("room_type").count().show()

+-----------+-----+
|  room_type|count|
+-----------+-----+
|         15|    2|
|         42|    1|
|Shared room|  328|
|         59|    1|
|   -3.70739|    1|
|         22|    3|
|         35|    2|
|         16|    2|
|        410|    1|
|       NULL|   58|
|         43|    1|
|        100|    1|
|         18|    1|
|         61|    1|
| Hotel room|  166|
|   40.42756|    1|
|         78|    1|
|       1200|    1|
|         90|    2|
|         19|    1|
+-----------+-----+
only showing top 20 rows

