# Processing Airbnb Data

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Airbnb data processing").getOrCreate()

25/07/13 21:48:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
listings = spark.read.csv(
    "data/listings.csv.gz", 
    header=True, 
    inferSchema=True, 
    sep=",", 
    quote='"', 
    escape='"', 
    multiLine=True, 
    mode="PERMISSIVE"
)

                                                                                

In [3]:
review_locations = listings.select(listings.review_scores_location)
review_locations.show()

+----------------------+
|review_scores_location|
+----------------------+
|                  4.62|
|                  4.54|
|                  4.36|
|                   4.5|
|                  4.37|
|                   4.5|
|                  4.53|
|                  4.52|
|                  4.49|
|                  4.94|
|                  4.82|
|                  4.92|
|                  4.14|
|                  4.78|
|                  4.43|
|                  4.73|
|                  NULL|
|                  NULL|
|                  4.48|
|                  4.75|
+----------------------+
only showing top 20 rows


In [4]:
high_score_listings = listings \
    .filter(listings.review_scores_location > 4.5) \
    .select('id', 'price', 'name', 'review_scores_location')
high_score_listings.show(20, truncate=False)

+------+-------+------------------------------------------------+----------------------+
|id    |price  |name                                            |review_scores_location|
+------+-------+------------------------------------------------+----------------------+
|264776|$297.00|Huge Four Bedroom Apartment                     |4.62                  |
|264777|$98.00 |One Bedroom Apartment                           |4.54                  |
|264782|$120.00|One Bedroom Garden Apartment                    |4.53                  |
|264783|$216.00|Four Bedroom Garden Apartment                   |4.52                  |
|266037|$62.00 |Central London with Stunning Views!             |4.94                  |
|268398|$66.00 |Also five minutes to South Bank                 |4.82                  |
|270600|$73.00 |Patio Apartment in London (Twickenham)          |4.92                  |
|425143|NULL   |luxury 1bed in Chelsea Bridge Wharf             |4.78                  |
|426354|$200.00|1 bed

In [5]:
# Remove Rows with Null
high_score_listings.dropna().show(20, truncate=False)

+------+-------+------------------------------------------------+----------------------+
|id    |price  |name                                            |review_scores_location|
+------+-------+------------------------------------------------+----------------------+
|264776|$297.00|Huge Four Bedroom Apartment                     |4.62                  |
|264777|$98.00 |One Bedroom Apartment                           |4.54                  |
|264782|$120.00|One Bedroom Garden Apartment                    |4.53                  |
|264783|$216.00|Four Bedroom Garden Apartment                   |4.52                  |
|266037|$62.00 |Central London with Stunning Views!             |4.94                  |
|268398|$66.00 |Also five minutes to South Bank                 |4.82                  |
|270600|$73.00 |Patio Apartment in London (Twickenham)          |4.92                  |
|426354|$200.00|1 bedroom flat with big balcony!                |4.73                  |
|427584|$129.00|Hackn

In [7]:
# Remove the dollar sign from Price and convert to numerical
from pyspark.sql.functions import regexp_replace

price_num_df = listings.withColumn('price_num', regexp_replace('price', '[$,]', '').cast('float'))

price_num_df.schema['price_num']

StructField('price_num', FloatType(), True)

In [8]:
price_num_df.select('price_num', 'name').show(20, truncate=False)

+---------+--------------------------------------------------+
|price_num|name                                              |
+---------+--------------------------------------------------+
|297.0    |Huge Four Bedroom Apartment                       |
|98.0     |One Bedroom Apartment                             |
|148.0    |Two Bedroom Newly Refurbished Apartment           |
|144.0    |Refurbished Two Bedroom Apartment                 |
|157.0    |Spacious refurbished 2 bedroom apt with balcony   |
|148.0    |Two Bedrooms Garden Maisonette                    |
|120.0    |One Bedroom Garden Apartment                      |
|216.0    |Four Bedroom Garden Apartment                     |
|238.0    |Huge Three Bedroom Flat with parking and terrace  |
|62.0     |Central London with Stunning Views!               |
|66.0     |Also five minutes to South Bank                   |
|73.0     |Patio Apartment in London (Twickenham)            |
|NULL     |Heathrow BNB - Home Away From Home!         

In [9]:
# Finding hidden gems
# Keep lower price and higher ratings
price_num_df.filter( (price_num_df.price_num < 100) & (price_num_df.review_scores_location > 4.5) ) \
    .select('name', 'price', 'review_scores_location') \
    .show(truncate=False)

+-------------------------------------------------+------+----------------------+
|name                                             |price |review_scores_location|
+-------------------------------------------------+------+----------------------+
|One Bedroom Apartment                            |$98.00|4.54                  |
|Central London with Stunning Views!              |$62.00|4.94                  |
|Also five minutes to South Bank                  |$66.00|4.82                  |
|Patio Apartment in London (Twickenham)           |$73.00|4.92                  |
|Boutique Room w/ Private Bath, Balcony           |$70.00|4.88                  |
|Holiday London DB Room Let-on going              |$72.00|4.77                  |
|You are GUARANTEED to love this                  |$74.00|4.77                  |
|Bedroom In Great Location Stratford              |$52.00|4.57                  |
|Bright Double + workspace in spacious Garden Flat|$51.00|4.77                  |
|SPACIOUS ROOM I

In [10]:
# Filtering conditions can also be defined as string
price_num_df.filter('price_num < 100 AND review_scores_location > 4.5') \
    .select('name', 'price', 'review_scores_location') \
    .show(truncate=False)

+-------------------------------------------------+------+----------------------+
|name                                             |price |review_scores_location|
+-------------------------------------------------+------+----------------------+
|One Bedroom Apartment                            |$98.00|4.54                  |
|Central London with Stunning Views!              |$62.00|4.94                  |
|Also five minutes to South Bank                  |$66.00|4.82                  |
|Patio Apartment in London (Twickenham)           |$73.00|4.92                  |
|Boutique Room w/ Private Bath, Balcony           |$70.00|4.88                  |
|Holiday London DB Room Let-on going              |$72.00|4.77                  |
|You are GUARANTEED to love this                  |$74.00|4.77                  |
|Bedroom In Great Location Stratford              |$52.00|4.57                  |
|Bright Double + workspace in spacious Garden Flat|$51.00|4.77                  |
|SPACIOUS ROOM I

In [11]:
# Find unique property type
listings.select(listings.property_type).distinct().show(truncate=False)

[Stage 8:>                                                          (0 + 1) / 1]

+----------------------------------+
|property_type                     |
+----------------------------------+
|Private room in lighthouse        |
|Private room in loft              |
|Private room in earthen home      |
|Entire chalet                     |
|Earthen home                      |
|Farm stay                         |
|Entire rental unit                |
|Shared room in hostel             |
|Shared room                       |
|Private room in condo             |
|Room in boutique hotel            |
|Private room in religious building|
|Room in bed and breakfast         |
|Private room in casa particular   |
|Private room in bungalow          |
|Entire cabin                      |
|Entire guesthouse                 |
|Hut                               |
|Private room in nature lodge      |
|Entire guest suite                |
+----------------------------------+
only showing top 20 rows


                                                                                

In [12]:
# Get all distinct combinations of property type and room type
listings \
    .select(listings.property_type, listings.room_type) \
    .distinct() \
    .show(truncate=False)

[Stage 11:>                                                         (0 + 1) / 1]

+----------------------------------+---------------+
|property_type                     |room_type      |
+----------------------------------+---------------+
|Room in hostel                    |Hotel room     |
|Private room in casa particular   |Private room   |
|Dome                              |Entire home/apt|
|Entire serviced apartment         |Entire home/apt|
|Private room in loft              |Private room   |
|Shipping container                |Entire home/apt|
|Private room in villa             |Private room   |
|Farm stay                         |Entire home/apt|
|Room in hotel                     |Hotel room     |
|Shared room in rental unit        |Shared room    |
|Private room in guest suite       |Private room   |
|Room in rental unit               |Hotel room     |
|Room in serviced apartment        |Hotel room     |
|Private room in serviced apartment|Private room   |
|Private room in hostel            |Private room   |
|Shared room                       |Shared roo

                                                                                

In [13]:
# Write data to file
listings \
    .select(listings.property_type) \
    .distinct() \
    .write \
    .csv('data/property_types')

                                                                                

In [14]:
# 1. Get a non-null picture URL for any property
listings.filter(
    listings.picture_url.isNotNull()
) \
    .select('picture_url') \
    .limit(1) \
    .show(truncate=False)

+----------------------------------------------------------------------------------------------------------+
|picture_url                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
+----------------------------------------------------------------------------------------------------------+



In [16]:
# 2. Get number of properties that get more than 10 reviews per month
listings.filter(listings.reviews_per_month > 10).count()

                                                                                

57

In [17]:
# 3. Get properties that have more bathrooms than bedrooms
listings.filter(
    listings.bathrooms > listings.bedrooms
) \
    .select('name', 'bathrooms', 'bedrooms') \
    .show(10, truncate=False)

+--------------------------------------------------+---------+--------+
|name                                              |bathrooms|bedrooms|
+--------------------------------------------------+---------+--------+
|Central London with Stunning Views!               |1.5      |1       |
|Also five minutes to South Bank                   |1.5      |1       |
|Battersea live/work artist house                  |1.5      |1       |
|Large double bedroom in Shoreditch w/garden       |1.5      |1       |
|Bedroom In Great Location Stratford               |1.5      |1       |
|Spacious luxury 2 bedroom apartment               |1.5      |1       |
|Very Central! Bayswater Apartment                 |2.0      |1       |
|Room in London with a family                      |1.5      |1       |
|Stunning large room (double sofa bed), Hackney, E9|1.5      |1       |
|Cosy Double studio in Zone 2 Hammersmith (1)      |1.5      |1       |
+--------------------------------------------------+---------+--

In [18]:
# 5. Get properties where the price is greater than 5000.
# Collect result as python list.
# First convert price from string to number

listings_with_price = listings.withColumn('price_numeric', regexp_replace('price', '[$,]', '').cast('float'))

res = listings_with_price.filter((listings_with_price.price_numeric > 5000)).select('name', 'price').collect()

res

                                                                                

[Row(name='Room in a cosy flat. Central, clean', price='$8,000.00'),
 Row(name='Spacious Private Ground Floor Room', price='$6,308.00'),
 Row(name='No Longer Available', price='$53,588.00'),
 Row(name='Bright & airy DoubleBed with EnSuite in Zone 2!', price='$74,100.00'),
 Row(name='Stunning home overlook canary wharf', price='$7,360.00'),
 Row(name='The Apartments by The Sloane Club, L 2 Bedroom Apt', price='$7,377.00'),
 Row(name='Kensington- Luxury 2 bedroom ground floor flat', price='$7,796.00'),
 Row(name='Spacious London Flat', price='$5,034.00'),
 Row(name='Single room. 7ft x 9ft - Over looking garden', price='$5,700.00'),
 Row(name='Luxury modern apartment in Dulwich Village', price='$5,372.00'),
 Row(name='Beautiful 2 BR flat in Kilburn with free parking', price='$6,000.00'),
 Row(name='Semi-detached mews house in Knightsbridge.', price='$7,007.00'),
 Row(name='Bright & Comfortable Angel Apartment', price='$9,999.00'),
 Row(name='Affordable Spacious  Room on the edge of the ci

In [19]:
# 5. Get a list of properties with following characteristics
# price < 150
# more than 20 reviews
# review_score_ratings > 4.5

listings_with_price.filter(
    (listings_with_price.price_numeric < 150) &
    (listings_with_price.number_of_reviews > 20) &
    (listings_with_price.review_scores_rating > 4.5)
) \
    .select('name', 'price_numeric', 'number_of_reviews', 'review_scores_rating') \
    .show(truncate=False)

+--------------------------------------------------+-------------+-----------------+--------------------+
|name                                              |price_numeric|number_of_reviews|review_scores_rating|
+--------------------------------------------------+-------------+-----------------+--------------------+
|One Bedroom Apartment                             |98.0         |24               |4.58                |
|Refurbished Two Bedroom Apartment                 |144.0        |36               |4.64                |
|Central London with Stunning Views!               |62.0         |532              |4.9                 |
|Also five minutes to South Bank                   |66.0         |563              |4.63                |
|Patio Apartment in London (Twickenham)            |73.0         |88               |4.64                |
|Lovely 2 bedroom flat near Brixton, zone 2, London|135.0        |23               |4.77                |
|Hackney Stylish & light 1 bedroom Victorian f

In [20]:
# 6. Get a list of properties with following characteristics
# price < 150 or more than one bathroom

listings_with_price.filter(
    (listings_with_price.price_numeric < 150) |
    (listings_with_price.bathrooms > 1)
).select("name", "price_numeric", "bathrooms").show(truncate=False)

+--------------------------------------------------+-------------+---------+
|name                                              |price_numeric|bathrooms|
+--------------------------------------------------+-------------+---------+
|Huge Four Bedroom Apartment                       |297.0        |2.0      |
|One Bedroom Apartment                             |98.0         |1.0      |
|Two Bedroom Newly Refurbished Apartment           |148.0        |1.0      |
|Refurbished Two Bedroom Apartment                 |144.0        |1.0      |
|Spacious refurbished 2 bedroom apt with balcony   |157.0        |2.0      |
|Two Bedrooms Garden Maisonette                    |148.0        |2.0      |
|One Bedroom Garden Apartment                      |120.0        |1.0      |
|Four Bedroom Garden Apartment                     |216.0        |2.0      |
|Huge Three Bedroom Flat with parking and terrace  |238.0        |2.0      |
|Central London with Stunning Views!               |62.0         |1.5      |

In [21]:
# 7. Get the highest listing price in the dataset
from pyspark.sql.functions import max

listings_with_price.select(max('price_numeric')).show()

[Stage 31:>                                                         (0 + 1) / 1]

+------------------+
|max(price_numeric)|
+------------------+
|           74100.0|
+------------------+



                                                                                

In [22]:
# 8. Get the name and price of a property with highest number of reviews per month
res = listings_with_price.select(max('price_numeric').alias('max_price')).collect()
max_price = res[0]['max_price']
listings_with_price \
    .filter(listings_with_price.price_numeric == max_price) \
    .select('name', 'price') \
    .show()

[Stage 37:>                                                         (0 + 1) / 1]

+--------------------+----------+
|                name|     price|
+--------------------+----------+
|Bright & airy Dou...|$74,100.00|
+--------------------+----------+



                                                                                

In [23]:
# 9. Get the number of hosts in the dataset
listings.select('host_name').distinct().count()

                                                                                

16659

In [24]:
# 10. Get listings with a first review in 2024

from pyspark.sql.functions import year

listings \
    .filter(year(listings.first_review) == 2024) \
    .select('name', 'first_review') \
    .show(10, truncate=False)

+--------------------------------------------------+------------+
|name                                              |first_review|
+--------------------------------------------------+------------+
|Close to Wimbledon All England Tennis -huge double|2024-08-11  |
|Bridgerton inspired cottage core apartment        |2024-09-14  |
|one Double bed room with en-suite facilities      |2024-03-21  |
|Sm double room  with own bathroom                 |2024-06-04  |
|Superlux flat in Knightsbridge                    |2024-01-01  |
|Central, modern pied-a-terre                      |2024-11-29  |
|Stunning Bright Chelsea 2BR flat                  |2024-09-21  |
|Victorian 2-bedroom upstairs flat sleeps 4        |2024-12-09  |
|The Pink House, Notting Hill                      |2024-07-14  |
|Stylish garden flat in Hackney                    |2024-09-15  |
+--------------------------------------------------+------------+
only showing top 10 rows
