### Check Environment

In [1]:
import os
databricks = "DATABRICKS_RUNTIME_VERSION" in os.environ

### Init Local

In [2]:
if not databricks:
    
    from pyspark.sql import SparkSession
    
    # Set up environment variables (optional, but sometimes needed)
    os.environ["PYSPARK_PYTHON"] = "python3"
    os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
    
    # Initialize a Spark session
    spark = SparkSession.builder \
        .appName("LocalPySpark") \
        .master("local[*]") \
        .config("spark.driver.memory", "4g") \
        .config("spark.sql.execution.arrow.enabled", "true") \
        .config("spark.driver.host", "127.0.0.1") \
        .config("spark.driver.bindAddress", "127.0.0.1") \
        .getOrCreate()
    
    url = "https://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2024-09-05/data/listings.csv.gz"
    city = "amsterdam"
    download_path = ".//listings.csv.gz"
    raw_data_path = f".//{city}.csv"

### Init Databricks 

In [3]:
if databricks:

    # access url and city name
    dbutils.widgets.text("city", "amsterdam") 
    city = dbutils.widgets.get("city")
    dbutils.widgets.text("url", "https://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2024-09-05/data/listings.csv.gz") 
    url = dbutils.widgets.get("url")
    download_path = "/Volumes/workspace/airbnb/airbnb//listings.csv.gz"
    raw_data_path = f"/Volumes/workspace/airbnb/airbnb/{city}.csv"

### Download dataset 

In [4]:
import requests
import gzip
import shutil
import os
from pathlib import Path

# Download the zipped raw data csv
print("Downloading file...")
response = requests.get(url, stream=True)
with open(download_path, "wb") as file:
    for chunk in response.iter_content(chunk_size=1024):
        file.write(chunk)
print("Download completed.")

# Extract the GZ file
print("Extracting file...")
with gzip.open(download_path, 'rb') as f_in:
    with open(raw_data_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
print("Extraction completed.")

if os.path.exists(download_path):
    os.remove(download_path)  # Delete the downloaded .gz file
    print(f"Deleted: {download_path}")

Downloading file...
Download completed.
Extracting file...
Extraction completed.
Deleted: .//listings.csv.gz


### View raw data

In [5]:
#%sql
#SELECT * FROM default.listings LIMIT 10

### Load listings.csv dataset and select useful feature columns

The following columns could be of use in the price prediction: 

| Column Name              | Example Value | Description |
|--------------------------|--------------|-------------|
| name                     | "Private, quiet studio in the centre with terrace"            | title of the airbnb page, should be transformed to embedding           |
| description              | "All guests agree: the apartment  is perfect and the location even better. A real home away from home. Two bedrooms, a fully equipped kitchen, a living with a comfortable couch. Quiet area, next to the Museumplein with the 3 major Museums."            | description on the airbnb page, should be transformed to embedding       |
| neighborhood_cleansed    |    Centrum-West       | label for the neighborhood, needs to be one-hot-encoded           |
| property_type           | Private room in guest suite            | label for the property type, needs to be one-hot-encoded           |
| room_type                | Entire home/apt            | label for the room type, needs to be one-hot-encoded           |
| accommodates            | 4            | the number of guests           |
| bathrooms               | 1            | the number of bathrooms           |
| bedrooms                | 2            | the number of bedrooms           |
| beds                    | 1            | the number of beds          |
| amenities               | ["Central heating", "Shower gel", "Lake access"]            | array of categorical variables, needs to be multi-hot-encoded           |
| availability_365        |   247          | number of days the airbnb is available per year           |
| review_scores_value     | 4.75            | review score for the value of the airbnb           |



In [6]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import col, regexp_extract

# Load the data into dataframe
df = spark.read.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.option("sep", ",") \
.option("escape", '"') \
.option("encoding", "UTF-8") \
.option("quote", '"') \
.option("multiLine", "true") \
.load(raw_data_path)

# Select specific columns
selected_columns = [
    "name", "description", "neighbourhood_cleansed",
    "property_type", "room_type", "accommodates", "bathrooms", "bathrooms_text", "bedrooms",
    "beds", "amenities", "availability_365", "review_scores_value", "price"
]

# Selecting the specified columns
df = df.select(*selected_columns)

# Filter out records without price
df = df.filter(col("price").isNotNull())
df = df.withColumn("price", regexp_extract(col("price"), r"(\d+\.?\d*)", 1))
df = df.withColumn("price", col("price").cast("double"))
df = df.toPandas()
# Display the first few rows
display(df)


Unnamed: 0,name,description,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,availability_365,review_scores_value,price
0,Triple Room With City View,This tastefully decorated room has an elegant ...,Centrum-West,Room in hotel,Private room,3,1.0,1 bath,1.0,3.0,"[""Paid parking on premises"", ""Body soap"", ""Cof...",323,4.75,269.0
1,Private loft next @Vondelpark with PRIVATE par...,Vintage & private loft centrally situated in t...,De Baarsjes - Oud-West,Private room in bed and breakfast,Private room,4,1.0,1 private bath,2.0,5.0,"[""Dishes and silverware"", ""Fast wifi \u2013 98...",275,4.62,254.0
2,Ecoluxe accommodation 'The Green Tunnel',"Spacious, ground level private ecoluxe two-roo...",IJburg - Zeeburgereiland,Private room in guesthouse,Private room,4,1.5,1.5 baths,2.0,2.0,"[""Dishes and silverware"", ""Toaster"", ""Shower g...",10,4.74,203.0
3,Appartment in centre with jacuzzi,Ideal for a family and is equipped with all th...,De Baarsjes - Oud-West,Entire loft,Entire home/apt,4,2.0,2 baths,2.0,2.0,"[""Paid parking on premises"", ""Smoke alarm"", ""L...",60,,375.0
4,Amazing apartment canal view and private terrace!,Great place to stay in the middle of the city ...,Centrum-West,Entire condo,Entire home/apt,2,1.0,1 bath,1.0,1.0,"[""Dishes and silverware"", ""Toaster"", ""Shower g...",247,4.71,599.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5864,Luxe appartement Amsterdam,This stylish accommodation is perfect for grou...,Osdorp,Entire rental unit,Entire home/apt,3,1.0,1 bath,2.0,2.0,"[""Smoke alarm"", ""Washer"", ""TV"", ""Free parking ...",221,,125.0
5865,Vrolik,"In this centrally located accommodation, every...",Oud-Oost,Entire rental unit,Entire home/apt,3,1.0,1 bath,2.0,2.0,"[""Paid parking on premises"", ""Smoke alarm"", ""S...",163,,196.0
5866,Kids friendly home next to park,"Welcome to our family home, nestled in the hea...",Westerpark,Entire serviced apartment,Entire home/apt,4,2.0,2 baths,2.0,3.0,"[""Dishes and silverware"", ""Toaster"", ""Shower g...",189,4.83,409.0
5867,Cosy home in city center,Cosy home of two floors in the city centre of ...,De Pijp - Rivierenbuurt,Entire rental unit,Entire home/apt,4,1.0,1 bath,2.0,1.0,"[""Smoke alarm"", ""Smoking allowed"", ""Washer"", ""...",331,,349.0


### Clean columns one by one 

In [7]:
df['name'] = df['name'].fillna("No name")

In [8]:
df['description'] = df['description'].fillna("No description")

In [9]:
df['accommodates'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accommodates'].fillna(1, inplace=True)


In [10]:
df['bathrooms_text_num'] = df['bathrooms_text'].str.extract(r'(\d+\.?\d*)')
df['bathrooms_text_num'] = pd.to_numeric(df['bathrooms_text_num'], errors='coerce')
df['bathrooms'].fillna(df['bathrooms_text_num'], inplace=True)
df['bathrooms'].fillna(1, inplace=True)
df = df.drop(columns=['bathrooms_text', 'bathrooms_text_num'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bathrooms'].fillna(df['bathrooms_text_num'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bathrooms'].fillna(1, inplace=True)


In [11]:
df['bedrooms'].fillna(df['accommodates'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bedrooms'].fillna(df['accommodates'], inplace=True)


In [12]:
df['beds'].fillna(df['bedrooms'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['beds'].fillna(df['bedrooms'], inplace=True)


Here we select some amenities that are associated with higher prices and that occur reasonalby frequent. The others are removed

In [13]:
from collections import Counter
import pandas as pd
import numpy as np
import ast
from collections import defaultdict

# Flatten the list column and count occurrences
df['amenities'].fillna("[]", inplace=True)
df['amenities'] = df['amenities'].apply(ast.literal_eval) # only need to do this once, so if df state is preserved don't repeat this step
category_counts = Counter([item for sublist in df['amenities'] for item in sublist]).most_common()

# Look for amenities that indicate a high price
total_records = len(df)
amenity_price_sums = defaultdict(lambda: [0, 0])  # Format: {amenity: [total_price, count]}
for amenities, price in zip(df['amenities'], df['price']):
    for amenity in amenities:
        amenity_price_sums[amenity][0] += price  # Sum prices
        amenity_price_sums[amenity][1] += 1      # Count occurrences

# Filter amenities that appear in more than 5% of total records
threshold = 0.05 * total_records
filtered_amenities = {amenity: values for amenity, values in amenity_price_sums.items() if values[1] > threshold}

# Calculate average price for each filtered amenity
amenity_avg_prices = {amenity: total / count for amenity, (total, count) in filtered_amenities.items()}

# Sort by average price (descending)
sorted_amenity_avg_prices = sorted(amenity_avg_prices.items(), key=lambda x: x[1], reverse=True)

# ftake the first n items
n = 20
interesting_amenities = [label for label, avg_price in sorted_amenity_avg_prices][:n]

# Remove amenities that are not interesting
df['amenities'] = df['amenities'].apply(lambda x: [a for a in x if a in interesting_amenities])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['amenities'].fillna("[]", inplace=True)


In [14]:
df['availability_365'].fillna(df['availability_365'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['availability_365'].fillna(df['availability_365'].mean(), inplace=True)


In [15]:
df['review_scores_value'].fillna(df['review_scores_value'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['review_scores_value'].fillna(df['review_scores_value'].mean(), inplace=True)


### Save cleaned dataset

In [16]:
from pyspark.sql.functions import col, sum

spark_df = spark.createDataFrame(df)

# Print any NaN values in the df
nan_counts = spark_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in spark_df.columns])
nan_counts.show()

if databricks:

    # Save as Delta Table
    table_name = f"{city}_airbnb_dataset"
    spark_df.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)

else:
    df.to_csv(f"{city}_airbnb_dataset.csv", index=False)

# delete the loaded csv
if os.path.exists(raw_data_path):
    os.remove(raw_data_path)  # Delete the raw csv
    print(f"Deleted: {raw_data_path}")

+----+-----------+----------------------+-------------+---------+------------+---------+--------+----+---------+----------------+-------------------+-----+
|name|description|neighbourhood_cleansed|property_type|room_type|accommodates|bathrooms|bedrooms|beds|amenities|availability_365|review_scores_value|price|
+----+-----------+----------------------+-------------+---------+------------+---------+--------+----+---------+----------------+-------------------+-----+
|   0|          0|                     0|            0|        0|           0|        0|       0|   0|        0|               0|                  0|    0|
+----+-----------+----------------------+-------------+---------+------------+---------+--------+----+---------+----------------+-------------------+-----+

Deleted: .//amsterdam.csv
