In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (getOrCreate reuses an existing one if present)
spark = (
    SparkSession.builder
    .appName("Read Inside Airbnb data")
    .getOrCreate()
)

25/08/25 15:24:55 WARN Utils: Your hostname, ubuntu-de resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/08/25 15:24:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/25 15:24:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Reader settings for the listings CSV:
#   - first line is a header, and column types are inferred from the data
#   - '"' serves as both the quote and the escape character
#   - multiLine=True lets a quoted field span several physical lines
#   - PERMISSIVE mode keeps malformed records instead of failing the read
csv_options = {
    "header": True,
    "inferSchema": True,
    "sep": ",",
    "quote": '"',
    "escape": '"',
    "multiLine": True,
    "mode": "PERMISSIVE",
}
listings = spark.read.csv("data/listings.csv", **csv_options)

# count() triggers a full pass over the file
row_count = listings.count()
print("Rows:", row_count)

[Stage 2:>                                                          (0 + 1) / 1]

Rows: 96651


                                                                                

In [3]:
# Print the column names and the types Spark inferred while reading the CSV
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [4]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
from pyspark.sql.functions import col

# Keep only rows that have a picture URL, and only that one column
has_picture = col("picture_url").isNotNull()
non_null_pictures = listings.where(has_picture).select("picture_url")

# truncate=False prints each URL in full instead of cutting it off
non_null_pictures.show(10, truncate=False)

+----------------------------------------------------------------------------------------------------------+
|picture_url                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
|https://a0.muscache.com/pictures/hosting/Hosting-264777/original/228dd7a0-3db7-48ce-9e1d-55e10911b2b4.jpeg|
|https://a0.muscache.com/pictures/50662093/af12eefc_original.jpg                                           |
|https://a0.muscache.com/pictures/50660860/e4406bea_original.jpg                                           |
|https://a0.muscache.com/pictures/airflow/Hosting-264780/original/7485e09c-4ff7-47c2-adb8-02de476c547c.jpg |
|https://a0.muscache.com/pictures/29250432/607b3eec_original.jpg                                           |
|https://a0.muscach

In [16]:
# 2. Get number of properties that get more than 10 reviews per month
busy_listings = listings.where(col("reviews_per_month") > 10)
num_props = busy_listings.count()
print("Number of properties with > 10 reviews/month:", num_props)

[Stage 13:>                                                         (0 + 1) / 1]

Number of properties with > 10 reviews/month: 57


                                                                                

In [17]:
# 3. Get properties that have more bathrooms than bedrooms
bath_exceeds_bed = col("bathrooms") > col("bedrooms")
more_bath_than_bed = listings.where(bath_exceeds_bed)

print("Properties with more bathrooms than bedrooms:", more_bath_than_bed.count())

# Show a handful of matches with just the columns needed to check the result
preview_cols = ["id", "name", "bathrooms", "bedrooms"]
more_bath_than_bed.select(*preview_cols).show(10, truncate=False)

[Stage 16:>                                                         (0 + 1) / 1]

Properties with more bathrooms than bedrooms: 9143
+------+--------------------------------------------------+---------+--------+
|id    |name                                              |bathrooms|bedrooms|
+------+--------------------------------------------------+---------+--------+
|266037|Central London with Stunning Views!               |1.5      |1       |
|268398|Also five minutes to South Bank                   |1.5      |1       |
|24328 |Battersea live/work artist house                  |1.5      |1       |
|432841|Large double bedroom in Shoreditch w/garden       |1.5      |1       |
|433867|Bedroom In Great Location Stratford               |1.5      |1       |
|283569|Spacious luxury 2 bedroom apartment               |1.5      |1       |
|437722|Very Central! Bayswater Apartment                 |2.0      |1       |
|442457|Room in London with a family                      |1.5      |1       |
|445346|Stunning large room (double sofa bed), Hackney, E9|1.5      |1       |
|

                                                                                

In [20]:
# Peek at the raw price column; values are text such as "$297.00", not numbers
raw_prices = listings.select("price")
raw_prices.where(col("price").isNotNull()).show(10, truncate=False)

+-------+
|price  |
+-------+
|$297.00|
|$98.00 |
|$148.00|
|$144.00|
|$157.00|
|$148.00|
|$120.00|
|$216.00|
|$238.00|
|$62.00 |
+-------+
only showing top 10 rows



In [21]:
# 4. Get properties where the price is greater than 5,000. Collect the result as a Python list
# Remember to convert a price into a number first!
from pyspark.sql.functions import regexp_replace

# Strip "$" and "," from the price text, then cast what remains to double
price_digits = regexp_replace(col("price"), "[$,]", "")
listings_num = listings.withColumn("price_num", price_digits.cast("double"))

# Keep only the rows priced above 5000
expensive_props = listings_num.where(col("price_num") > 5000)

# collect() brings the selected rows back to the driver as a Python list of Row objects
result = expensive_props.select("id", "name", "price_num").collect()

print("Number of properties with price > 5000:", len(result))
print(result[:5])

[Stage 23:>                                                         (0 + 1) / 1]

Number of properties with price > 5000: 82
[Row(id=9470827, name='Room in a cosy flat. Central, clean', price_num=8000.0), Row(id=10475894, name='Spacious Private Ground Floor Room', price_num=6308.0), Row(id=13254774, name='No Longer Available', price_num=53588.0), Row(id=13841484, name='Bright & airy DoubleBed with EnSuite in Zone 2!', price_num=74100.0), Row(id=17709189, name='Stunning home overlook canary wharf', price_num=7360.0)]


                                                                                

In [8]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator


In [9]:
# 6. Get a list of properties with the following characteristics:
# * price < 150 OR more than one bathroom
# Use the "|" operator to implement the OR operator


In [10]:
# 7. Get the highest listing price in this dataset
# Consider using the "max" function from "pyspark.sql.functions"


In [11]:
# 8. Get the name and price of the property with the highest price
# Try to use "collect" method to get the highest price first, and then use it in a "filter" call 


In [12]:
# 9. Get the number of hosts in the dataset


In [13]:
# 10. Get listings with a first review in 2024
# Consider using the "year" function from "pyspark.sql.functions"
