### View raw data

In [0]:
%sql
-- Preview 10 rows (all columns) of the default.listings table.
SELECT * FROM default.listings LIMIT 10

### Load listings.csv dataset and select useful feature columns

The following columns could be of use in the price prediction: 

| Column Name              | Example Value | Description |
|--------------------------|--------------|-------------|
| name                     | "Private, quiet studio in the centre with terrace"            | title of the airbnb page, should be transformed to embedding           |
| description              | "All guests agree: the apartment  is perfect and the location even better. A real home away from home. Two bedrooms, a fully equipped kitchen, a living with a comfortable couch. Quiet area, next to the Museumplein with the 3 major Museums."            | description on the airbnb page, should be transformed to embedding       |
| neighborhood_overview    | "Near beach, harbor and canal. From livingroom you can see boats passing by"          |  description of the neighborhood, should be transformed to embedding         |
| neighbourhood_cleansed   |    Centrum-West       | label for the neighbourhood, needs to be one-hot-encoded           |
| property_type           | Private room in guest suite            | label for the property type, needs to be one-hot-encoded           |
| room_type                | Entire home/apt            | label for the room type, needs to be one-hot-encoded           |
| accommodates            | 4            | the number of guests           |
| bathrooms               | 1            | the number of bathrooms           |
| bedrooms                | 2            | the number of bedrooms           |
| beds                    | 1            | the number of beds          |
| amenities               | ["Central heating", "Shower gel", "Lake access"]            | array of categorical variables, needs to be one-hot-encoded           |
| availability_365        |   247          | number of days the airbnb is available per year           |
| review_scores_value     | 4.75            | review score for the value of the airbnb           |



In [0]:
# Read the listings CSV with Spark. The quote/escape/multiLine options are set so that
# quoted fields containing commas, doubled quotes or line breaks are parsed as one value.
listings_sdf = spark.read.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.option("sep", ",") \
.option("escape", '"') \
.option("encoding", "UTF-8") \
.option("quote", '"') \
.option("multiLine", "true") \
.load("/Volumes/workspace/airbnb/airbnb/listings.csv")

# Feature columns plus the prediction target ("price")
selected_columns = [
    "name", "description", "neighborhood_overview", "neighbourhood_cleansed",
    "property_type", "room_type", "accommodates", "bathrooms", "bedrooms",
    "beds", "amenities", "availability_365", "review_scores_value", "price"
]

# Project to the selected columns in Spark, so that only those columns are
# collected into the pandas dataframe.
df = listings_sdf.select(*selected_columns).toPandas()

# Filter out records without price. copy() makes df an independent frame (later column
# assignments no longer target a slice and do not raise SettingWithCopyWarning);
# reset_index keeps the index labels equal to the row positions used by the feature arrays.
df = df[df['price'].notna()].copy().reset_index(drop=True)

# Display the first few rows
display(df.head(10))


### Pre-process columns one by one 

1. <b>Name</b>: create text embeddings to capture semantics

In [0]:
%pip install sentence-transformers

In [0]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model (optimized for sentence embeddings)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Missing titles are replaced by "" (the same treatment the other text columns get),
# so that every element handed to the encoder is a string rather than NaN.
names = np.array(df['name'].fillna(""))

# Generate embeddings
name_embeddings = model.encode(names)


2. <b>Description</b>: create text embeddings to capture semantics

In [0]:
# Replace missing descriptions with "" so every value passed to the encoder is a string.
# assign() rebinds df to a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
df = df.assign(description=df['description'].fillna(""))
descriptions = np.array(df['description'])
description_embeddings = model.encode(descriptions)

3. <b>Neighbourhood overview</b>: create text embeddings to capture semantics

In [0]:
# Replace missing overviews with "" so every value passed to the encoder is a string.
# assign() rebinds df to a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
df = df.assign(neighborhood_overview=df['neighborhood_overview'].fillna(""))
neighborhood_overviews = np.array(df['neighborhood_overview'])
neighborhood_overview_embeddings = model.encode(neighborhood_overviews)

4. <b>Neighbourhood cleansed</b>: create one-hot-encodings

In [0]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the neighbourhood label. The encoder is fitted on a single-column
# 2-D array, so the 1-D label array gets a trailing axis.
encoder = OneHotEncoder()
categories = np.array(df['neighbourhood_cleansed'])
neighborhood_ohe = encoder.fit_transform(categories[:, np.newaxis])


5. <b>Property type</b>: create one-hot-encodings

In [0]:
# One-hot encode the property type with a fresh encoder, fitted on the labels
# arranged as a single column.
categories = np.array(df['property_type'])
property_type_column = categories.reshape(-1, 1)
encoder = OneHotEncoder()
property_types_ohe = encoder.fit_transform(property_type_column)

6. <b>Room type</b>: create one-hot-encodings

In [0]:
# One-hot encode the room type with a fresh encoder, fitted on the labels
# arranged as a single column.
categories = np.array(df['room_type'])
room_type_column = np.expand_dims(categories, axis=1)
encoder = OneHotEncoder()
room_types_ohe = encoder.fit_transform(room_type_column)

7. <b>Accommodates</b>: normalize using Min-Max scaling

In [0]:
from sklearn.preprocessing import MinMaxScaler

# Scaler instance used for the Min-Max normalization
scaler = MinMaxScaler()

# Fit on the guest-capacity column (kept 2-D via the double brackets) and scale it
accommodates_column = df[['accommodates']]
accomodates_normalized = scaler.fit_transform(accommodates_column)

8. <b>Number of bathrooms</b>: normalize using Min-Max scaling

In [0]:
# Re-fit the scaler on the bathrooms column (kept 2-D) and scale it
bathrooms_column = df[['bathrooms']]
bathrooms_normalized = scaler.fit_transform(bathrooms_column)

9. <b>Number of bedrooms</b>: normalize using Min-Max scaling

In [0]:
# Re-fit the scaler on the bedrooms column (kept 2-D) and scale it
bedrooms_column = df[['bedrooms']]
bedrooms_normalized = scaler.fit_transform(bedrooms_column)

10. <b>Number of beds</b>: normalize using Min-Max scaling

In [0]:
# Re-fit the scaler on the beds column (kept 2-D) and scale it
beds_column = df[['beds']]
beds_normalized = scaler.fit_transform(beds_column)

11. <b>Amenities</b>: find the frequency distribution and one-hot-encode the amenities that occur more than n times

In [0]:
from collections import Counter
import pandas as pd
import numpy as np
import ast


def _parse_amenities(value):
    """Return the amenities of one listing as a list of labels.

    Strings are parsed with ast.literal_eval; values that are already
    list-like are returned as a list; anything else (e.g. None / NaN for a
    missing value) yields an empty list. Because already-parsed values pass
    through unchanged, the cell can be re-run without a separate one-off
    parsing step.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value) if value.strip() else []
    if isinstance(value, (list, tuple, set, np.ndarray)):
        return list(value)
    return []


# Parse into a local list instead of mutating df. Iterating the raw strings
# directly would count single characters rather than amenity labels.
amenity_lists = [_parse_amenities(value) for value in df['amenities']]

# Flatten the per-listing lists and count occurrences (most frequent first)
category_counts = Counter(item for items in amenity_lists for item in items).most_common()

# filter on frequency threshold n: keep amenities occurring more than n times
n = 100
filtered_amenities = [label for label, count in category_counts if count > n]

# create one-hot encoded matrix: one row per listing, one column per kept amenity
amenity_ohe = np.zeros((len(df), len(filtered_amenities)))
amenity_column = {label: j for j, label in enumerate(filtered_amenities)}
# Sorted, de-duplicated column indices of the kept amenities present in each listing
amenity_indices = [
    np.array(sorted({amenity_column[item] for item in items if item in amenity_column}), dtype=np.intp)
    for items in amenity_lists
]
for i, indices in enumerate(amenity_indices):
    amenity_ohe[i, indices] = 1




12. <b>Availability</b>: normalize using division by 365

In [0]:
# Available days per year expressed as a fraction of 365
availability_fraction = df['availability_365'].div(365)
availability = np.array(availability_fraction)

13. <b>Review score for value</b>: normalize using division by 5

In [0]:
# Review score divided by 5
review_score_fraction = df['review_scores_value'].div(5)
review_score_value = np.array(review_score_fraction)

### Concatenate feature columns and construct cleaned dataset

In [0]:
# Feature blocks in their final column order. The sparse one-hot matrices are
# densified and the 1-D ratios are reshaped to single-column 2-D arrays.
columns = [
    name_embeddings,
    description_embeddings,
    neighborhood_overview_embeddings,
    neighborhood_ohe.toarray(),
    property_types_ohe.toarray(),
    room_types_ohe.toarray(),
    accomodates_normalized,
    bathrooms_normalized,
    bedrooms_normalized,
    beds_normalized,
    amenity_ohe,
    availability.reshape(-1, 1),
    review_score_value.reshape(-1, 1),
]

# Cast every block to float32 before joining them side by side
columns = [np.array(block.astype(np.float32)) for block in columns]
dataset = np.concatenate(columns, axis=1)

print(dataset.shape)
  
