# Analysis of Airbnb data for Amsterdam, Athens, Berlin
##### Data sourced from:
Gyódi, K., & Nawaro, Ł. (2021). Determinants of Airbnb prices in European cities: A spatial econometrics approach (Supplementary Material) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4446043

In [2]:
library(tidyverse)
library(repr)
library(infer)
library(cowplot)
library(broom)
library(GGally)
library(AER)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘cowplot’


The following object is masked from ‘package:lubridate’:

    stamp


Registered S3 method overwritten by 'GGally':
  method from   
  +.

In [13]:
## Initial loading and wrangling. Ensure directory matches. 
amsterdam_weekdays <- read.csv("amsterdam_weekdays.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "amsterdam", day_type = "weekday")
amsterdam_weekends <- read.csv("amsterdam_weekends.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "amsterdam", day_type = "weekend")

athens_weekdays <- read.csv("athens_weekdays.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "athens", day_type = "weekday")
athens_weekends <- read.csv("athens_weekends.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "athens", day_type = "weekend")

berlin_weekdays <- read.csv("berlin_weekdays.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "berlin", day_type = "weekday")
berlin_weekends <- read.csv("berlin_weekends.csv") %>% as_tibble() %>% select(-X) %>% mutate(city = "berlin", day_type = "weekend")

airbnb <- bind_rows(amsterdam_weekdays, amsterdam_weekends, 
                   athens_weekdays, athens_weekends, 
                   berlin_weekdays, berlin_weekends) %>% 
                        mutate(room_type = as.factor(room_type), room_shared = as.factor(room_shared), 
                               multi = as.factor(multi), biz = as.factor(biz),
                               room_private = as.factor(room_private), host_is_superhost = as.factor(host_is_superhost), 
                                city = as.factor(city), day_type = as.factor(day_type))
head(airbnb)

realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,⋯,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,day_type
<dbl>,<fct>,<fct>,<fct>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>
194.0337,Private room,False,True,2,False,1,0,10,93,⋯,5.0229638,2.53938,78.69038,4.166708,98.2539,6.846473,4.90569,52.41772,amsterdam,weekday
344.2458,Private room,False,True,4,False,0,0,8,85,⋯,0.4883893,0.2394039,631.17638,33.421209,837.28076,58.342928,4.90005,52.37432,amsterdam,weekday
264.1014,Private room,False,True,2,False,0,1,9,87,⋯,5.7483119,3.6516213,75.27588,3.985908,95.38695,6.6467,4.97512,52.36103,amsterdam,weekday
433.5294,Private room,False,True,4,False,0,1,9,90,⋯,0.384862,0.4398761,493.27253,26.119108,875.0331,60.973565,4.89417,52.37663,amsterdam,weekday
485.5529,Private room,False,True,2,True,0,0,10,98,⋯,0.5447382,0.3186926,552.83032,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam,weekday
552.8086,Private room,False,True,3,False,0,0,8,100,⋯,2.1314201,1.9046682,174.78896,9.255191,225.20166,15.692376,4.87699,52.38966,amsterdam,weekday


In [12]:
# Main developer: Zhuo Liu

# Packages needed for LASSO

library(tidymodels)
library(glmnet)

In [31]:
# Main developer: Zhuo Liu

# Use LASSO

set.seed(5033)

# convert categorical variables to dummy variables manually
airbnb_clean <- airbnb |>
    na.omit() |>
    mutate(room_sharedYes = if_else(room_shared == "True",1,0),
          room_privateYes = if_else(room_private == "True",1,0),
          host_is_superhostYes = if_else(host_is_superhost == "True",1,0),
          cityathens = if_else(city == "athens",1,0),
          cityberlin = if_else(city == "berlin",1,0),
          day_typeweekend = if_else(day_type == "weekend",1,0)) |>
    select(-room_type, -room_shared, -room_private, -host_is_superhost, -city, -day_type)

head(airbnb_clean)

split <- initial_split(data = airbnb_clean, prop = 0.7)
training_df <- training(split)
testing_df <- testing(split)

X_train <- as.matrix(training_df[,-1])
Y_train <- as.matrix(training_df[,1])

X_test <- as.matrix(testing_df[,-1])
Y_test <- as.matrix(testing_df[,1])

cv_LASSO <- cv.glmnet(
  x = X_train, y = Y_train,
  alpha = 1,
  lambda = exp(seq(-21, 21, 0.1))
)

lambda <- cv_LASSO$lambda.min
lambda

coef(cv_LASSO, s = "lambda.min")

test_pred_LASSO_min <- 
            predict(cv_LASSO, 
            newx = X_test, 
            s = "lambda.min")
LASSO_prediction <- tibble(Y_test,LASSO_prediction = test_pred_LASSO_min) %>% head()
LASSO_prediction

realSum,person_capacity,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,⋯,rest_index,rest_index_norm,lng,lat,room_sharedYes,room_privateYes,host_is_superhostYes,cityathens,cityberlin,day_typeweekend
<dbl>,<dbl>,<fct>,<fct>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
194.0337,2,1,0,10,93,1,5.0229638,2.53938,78.69038,⋯,98.2539,6.846473,4.90569,52.41772,0,1,0,0,0,0
344.2458,4,0,0,8,85,1,0.4883893,0.2394039,631.17638,⋯,837.28076,58.342928,4.90005,52.37432,0,1,0,0,0,0
264.1014,2,0,1,9,87,1,5.7483119,3.6516213,75.27588,⋯,95.38695,6.6467,4.97512,52.36103,0,1,0,0,0,0
433.5294,4,0,1,9,90,2,0.384862,0.4398761,493.27253,⋯,875.0331,60.973565,4.89417,52.37663,0,1,0,0,0,0
485.5529,2,0,0,10,98,1,0.5447382,0.3186926,552.83032,⋯,815.30574,56.811677,4.90051,52.37508,0,1,1,0,0,0
552.8086,3,0,0,8,100,2,2.1314201,1.9046682,174.78896,⋯,225.20166,15.692376,4.87699,52.38966,0,1,0,0,0,0


“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”
“collapsing to unique 'x' values”


“collapsing to unique 'x' values”


21 x 1 sparse Matrix of class "dgCMatrix"
                                      s1
(Intercept)                 335.66575746
person_capacity              11.83860909
multi                         .         
biz                          31.41909926
cleanliness_rating            1.14352214
guest_satisfaction_overall    1.35849585
bedrooms                     87.22882995
dist                         -3.09208636
metro_dist                    0.31397141
attr_index                    0.15506052
attr_index_norm               1.06353469
rest_index                    0.05289341
rest_index_norm               1.92306653
lng                         -22.86284130
lat                           .         
room_sharedYes             -202.07342417
room_privateYes            -151.90097026
host_is_superhostYes          4.40886644
cityathens                    .         
cityberlin                  -47.92392051
day_typeweekend              18.05180618

“collapsing to unique 'x' values”


Y_test,LASSO_prediction
"<dbl[,1]>","<dbl[,1]>"
194.0337,340.5504
344.2458,619.4035
264.1014,357.9365
433.5294,724.4111
276.5215,392.8806
319.6401,406.774


In [33]:
# Main developer: Zhuo Liu

# Calculate test_RMSE of LASSO
test_RMSE_LASSO <- LASSO_prediction |> mutate(Y_test = as.numeric(Y_test),
                                       	LASSO_prediction = as.numeric(LASSO_prediction)) |>
    summarize(RMSE = sqrt(sum((Y_test - LASSO_prediction) ^ 2) / nrow(prediction)))
test_RMSE_LASSO

RMSE
<dbl>
187.8509
