### Download dataset 

In [0]:
import requests
import gzip
import shutil
import os
from pathlib import Path

# Notebook parameters: the city name (used to name the extracted CSV) and the
# URL of the gzipped listings file. Both are exposed as Databricks widgets.
dbutils.widgets.text("city", "amsterdam") 
city = dbutils.widgets.get("city")
dbutils.widgets.text("url", "https://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2024-09-05/data/listings.csv.gz") 
url = dbutils.widgets.get("url")

# NOTE(review): the doubled slash in download_path looks unintended; it is
# kept as-is because POSIX path resolution treats it like a single slash.
download_path = "/Volumes/workspace/airbnb/airbnb//listings.csv.gz"
workspace_path = f"/Volumes/workspace/airbnb/airbnb/{city}.csv"

# Download the zipped raw data csv.
# - The context manager releases the streamed connection even on failure.
# - raise_for_status() stops here on an HTTP error instead of saving the error
#   page to disk and failing later with a confusing gzip error.
# - The timeout bounds each connect/read wait so the cell cannot hang forever.
print("Downloading file...")
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(download_path, "wb") as file:
        # 1 MiB chunks keep memory bounded without thousands of tiny writes.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)
print("Download completed.")

# Extract the GZ file into the city-specific CSV, streaming so the whole
# archive is never held in memory.
print("Extracting file...")
with gzip.open(download_path, 'rb') as f_in, open(workspace_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
print("Extraction completed.")

### View raw data

In [0]:
%sql
-- Preview up to 10 rows (all columns) of the default.listings table.
SELECT * FROM default.listings LIMIT 10

### Load listings.csv dataset and select useful feature columns

The following columns could be of use in the price prediction: 

| Column Name              | Example Value | Description |
|--------------------------|--------------|-------------|
| name                     | "Private, quiet studio in the centre with terrace"            | title of the airbnb page, should be transformed to embedding           |
| description              | "All guests agree: the apartment  is perfect and the location even better. A real home away from home. Two bedrooms, a fully equipped kitchen, a living with a comfortable couch. Quiet area, next to the Museumplein with the 3 major Museums."            | description on the airbnb page, should be transformed to embedding       |
| neighborhood_overview    | "Near beach, harbor and canal. From livingroom you can see boats passing by"          |  description of the neighborhood, should be transformed to embedding         |
| neighbourhood_cleansed   |    Centrum-West       | label for the neighbourhood, needs to be one-hot-encoded           |
| property_type           | Private room in guest suite            | label for the property type, needs to be one-hot-encoded           |
| room_type                | Entire home/apt            | label for the room type, needs to be one-hot-encoded           |
| accommodates            | 4            | the number of guests           |
| bathrooms               | 1            | the number of bathrooms           |
| bedrooms                | 2            | the number of bedrooms           |
| beds                    | 1            | the number of beds          |
| amenities               | ["Central heating", "Shower gel", "Lake access"]            | array of categorical variables, needs to be one-hot-encoded           |
| availability_365        |   247          | number of days the airbnb is available per year           |
| review_scores_value     | 4.75            | review score for the value of the airbnb           |



In [0]:
import pandas as pd
import numpy as np

# Load the extracted CSV into a pandas DataFrame via Spark.
# The path is the one the extraction cell wrote to (workspace_path), so the
# notebook reads the file it just downloaded rather than a fixed
# "listings.csv" that the download step never creates.
# multiLine + quote/escape handling is needed because free-text columns such as
# the description contain embedded newlines and quotes.
df = spark.read.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.option("sep", ",") \
.option("escape", '"') \
.option("encoding", "UTF-8") \
.option("quote", '"') \
.option("multiLine", "true") \
.load(workspace_path).toPandas()

# Select specific columns
selected_columns = [
    "name", "description", "neighbourhood_cleansed",
    "property_type", "room_type", "accommodates", "bathrooms", "bathrooms_text", "bedrooms",
    "beds", "amenities", "availability_365", "review_scores_value", "price"
]

# Work on an explicit copy so later column assignments modify this frame and
# not a view of the full listings table.
df = df[selected_columns].copy()

# Parse the price text into a number. Everything except digits and the decimal
# point is stripped first, so "$1,234.00" becomes 1234.0. (Extracting only the
# first run of digits would stop at the thousands separator and yield 1.)
# astype(str) also covers the case where Spark inferred a numeric column.
price_text = df['price'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
df['price'] = pd.to_numeric(price_text, errors='coerce')

# Filter out records without a usable price (missing or unparseable)
df = df[df['price'].notna()]

# Cutoff for cost/debug reasons
df = df.head(50).reset_index(drop=True)

# Display the first few rows
display(df)


### Pre-process columns one by one 

1. <b>Name</b>: create text embeddings to capture the semantics of the listing title

In [0]:
%pip install sentence-transformers

In [0]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Pre-trained sentence-embedding model; the description cell reuses it.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Missing titles are replaced by empty strings before encoding.
df['name'] = df['name'].fillna("")
names = df['name'].to_numpy()

# One embedding vector per listing title.
name_embeddings = model.encode(names)


2. <b>Description</b>: create text embeddings to capture the semantics of the listing description

In [0]:
# Encode the listing descriptions with the shared model; missing text is
# replaced by an empty string first.
description_text = df['description'].fillna("")
df['description'] = description_text
descriptions = description_text.to_numpy()
description_embeddings = model.encode(descriptions)

3. <b>Neighbourhood</b> (`neighbourhood_cleansed`): create one-hot-encodings

In [0]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the neighbourhood label of every listing.
encoder = OneHotEncoder()
categories = df['neighbourhood_cleansed'].to_numpy()
# The encoder expects a 2-D (n_samples, 1) input, hence the added axis.
neighborhood_ohe = encoder.fit_transform(categories[:, np.newaxis])


5. <b>Property type</b>: create one-hot-encodings

In [0]:
# One-hot encode the property type of every listing.
encoder = OneHotEncoder()
categories = df['property_type'].to_numpy()
# The encoder expects a 2-D (n_samples, 1) input, hence the added axis.
property_types_ohe = encoder.fit_transform(categories[:, np.newaxis])

6. <b>Room type</b>: create one-hot-encodings

In [0]:
# One-hot encode the room type of every listing.
encoder = OneHotEncoder()
categories = df['room_type'].to_numpy()
# The encoder expects a 2-D (n_samples, 1) input, hence the added axis.
room_types_ohe = encoder.fit_transform(categories[:, np.newaxis])

7. <b>Accommodates</b>: normalize using Min-Max scaling

In [0]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaler instance; the numeric-column cells below re-fit it per column.
scaler = MinMaxScaler()

# Fill missing guest counts with 1, then scale to [0, 1].
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on df['accommodates'] acts on an intermediate
# Series and is not guaranteed to update df (pandas warns about this chained
# pattern, and it has no effect under Copy-on-Write).
df['accommodates'] = df['accommodates'].fillna(1)
accomodates_normalized = scaler.fit_transform(df[['accommodates']])

8. <b>Number of **bathrooms**</b>: normalize using Min-Max scaling

In [0]:
# Recover a numeric bathroom count from the free-text column
# (first number found in bathrooms_text; NaN when the text has no number).
df['bathrooms_text_num'] = pd.to_numeric(
    df['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand=False),
    errors='coerce',
)

# Fill gaps in the numeric column from the parsed text, then default to 1.
# Assigned back explicitly: chained fillna(..., inplace=True) on df['bathrooms']
# is not guaranteed to update df and has no effect under Copy-on-Write.
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms_text_num']).fillna(1)
bathrooms_normalized = scaler.fit_transform(df[['bathrooms']])

9. <b>Number of **bedrooms**</b>: normalize using Min-Max scaling

In [0]:
# Missing bedroom counts fall back to the number of guests the listing
# accommodates. Assigned back explicitly because chained
# fillna(..., inplace=True) on df['bedrooms'] is not guaranteed to update df.
df['bedrooms'] = df['bedrooms'].fillna(df['accommodates'])
bedrooms_normalized = scaler.fit_transform(df[['bedrooms']])

> 10. <b>Number of **beds**</b>: normalize using Min-Max scaling

In [0]:
# Missing bed counts fall back to the bedroom count. Assigned back explicitly
# because chained fillna(..., inplace=True) on df['beds'] is not guaranteed to
# update df.
df['beds'] = df['beds'].fillna(df['bedrooms'])
beds_normalized = scaler.fit_transform(df[['beds']])

> 11. <b>Amenities</b>: Find the distribution of amenities, then one-hot-encode the top n amenities ranked by average listing price (among amenities occurring in more than 5% of listings)

In [0]:
from collections import Counter
import pandas as pd
import numpy as np
import ast
from collections import defaultdict

# Parse the amenities column (a list literal stored as text) into Python lists.
# - Missing values become an empty list.
# - Only strings are parsed, so re-running this cell on an already-parsed
#   column is a no-op instead of an ast.literal_eval error.
# - The result is assigned back explicitly; chained fillna(..., inplace=True)
#   is not guaranteed to update df.
df['amenities'] = df['amenities'].fillna("[]").apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Occurrence count of every amenity, most common first.
category_counts = Counter(item for sublist in df['amenities'] for item in sublist).most_common()

# Look for amenities that indicate a high price
total_records = len(df)
amenity_price_sums = defaultdict(lambda: [0, 0])  # Format: {amenity: [total_price, count]}
for amenities, price in zip(df['amenities'], df['price']):
    # A missing price would turn the running sum (and so the average and the
    # sort order) into NaN for every amenity of that listing, so skip it.
    if pd.isna(price):
        continue
    for amenity in amenities:
        amenity_price_sums[amenity][0] += price  # Sum prices
        amenity_price_sums[amenity][1] += 1      # Count occurrences

# Keep amenities that appear in more than 5% of total records
threshold = 0.05 * total_records
frequent_amenities = {amenity: values for amenity, values in amenity_price_sums.items() if values[1] > threshold}

# Calculate average price for each frequent amenity
amenity_avg_prices = {amenity: total / count for amenity, (total, count) in frequent_amenities.items()}

# Sort by average price (descending)
sorted_amenity_avg_prices = sorted(amenity_avg_prices.items(), key=lambda x: x[1], reverse=True)

# Take the first n items (fewer if fewer amenities passed the frequency filter)
n = 20
filtered_amenities = [label for label, avg_price in sorted_amenity_avg_prices[:n]]

# Create one-hot encoded matrix: one row per listing, one column per selected
# amenity, 1 where the listing offers that amenity.
amenity_ohe = np.zeros((len(df), len(filtered_amenities)))
amenity_indices = [np.nonzero(np.isin(filtered_amenities, amenity_array))[0] for amenity_array in df['amenities']]
for i, indices in enumerate(amenity_indices):
    amenity_ohe[i, indices] = 1




12. <b>Availability</b>: normalize using division by 365

In [0]:
# Fill missing availability with the column mean, then scale by the number of
# days in a year. Assigned back explicitly because chained
# fillna(..., inplace=True) on df['availability_365'] is not guaranteed to
# update df.
df['availability_365'] = df['availability_365'].fillna(df['availability_365'].mean())
availability = np.array(df['availability_365'] / 365)

13. <b>Review score for value</b>: normalize using division by 5

In [0]:
# Fill missing review scores with the column mean, then divide by 5 (the
# divisor stated in the step heading). Assigned back explicitly because chained
# fillna(..., inplace=True) on df['review_scores_value'] is not guaranteed to
# update df.
df['review_scores_value'] = df['review_scores_value'].fillna(df['review_scores_value'].mean())
review_score_value = np.array(df['review_scores_value'] / 5)

### Concatenate feature columns and save cleaned dataset

In [0]:
from sklearn.model_selection import train_test_split

# Construct dataset features: every entry is a 2-D array with one row per
# listing; sparse one-hot matrices are densified and 1-D vectors are reshaped
# into single-column arrays so they can be concatenated column-wise.
feature_blocks = [name_embeddings, description_embeddings, neighborhood_ohe.toarray(), property_types_ohe.toarray(), room_types_ohe.toarray(), accomodates_normalized, bathrooms_normalized, bedrooms_normalized, beds_normalized, amenity_ohe, availability.reshape(-1,1), review_score_value.reshape(-1,1)]
feature_blocks = [np.asarray(block, dtype=np.float32) for block in feature_blocks]
x = np.concatenate(feature_blocks, axis=1)
y = np.array(df['price'])

# Column counts are read from the feature arrays themselves rather than
# hard-coded, so the generated names always line up with x. For example, fewer
# than 20 amenities may pass the frequency filter, and a different embedding
# model would change the embedding width.
embedding_dim = name_embeddings.shape[1]
description_embedding_dim = description_embeddings.shape[1]
neighbourhood_labels = neighborhood_ohe.shape[1]
property_type_labels = property_types_ohe.shape[1]
room_type_labels = room_types_ohe.shape[1]
amenity_ohe_dim = amenity_ohe.shape[1]

# Generating column names, in the same order as feature_blocks
columns = (
    [f"name_embedding_{i}" for i in range(embedding_dim)] +
    [f"description_embedding_{i}" for i in range(description_embedding_dim)] +
    [f"neighbourhood_label_{i}" for i in range(neighbourhood_labels)] +
    [f"property_type_label_{i}" for i in range(property_type_labels)] +
    [f"room_type_label_{i}" for i in range(room_type_labels)] +
    ["accommodates_normalized"] +
    ["bathrooms_normalized"] +
    ["bedrooms_normalized"] +
    ["beds_normalized"] +
    [f"amenity_ohe_{i}" for i in range(amenity_ohe_dim)] +
    ["availability"] +
    ["review_scores"]
)
assert x.shape[1] == len(columns), (
    f"feature matrix has {x.shape[1]} columns but {len(columns)} names were generated"
)

# Convert NumPy arrays to Pandas DataFrame, with price as the target column
dataset = pd.DataFrame(x, columns=columns)
dataset["price"] = y

# Convert Pandas DataFrame to Spark DataFrame
spark_df = spark.createDataFrame(dataset)

# Save as Delta Table in Databricks, replacing any previous table and schema
# for this city.
table_name = f"{city}_airbnb_dataset"
spark_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(table_name)
