#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 09
**CH09B How stable is the hotel price - distance to center relationship?**

using the hotels-europe dataset

version 1.0 2021-05-05

In [None]:
import os
import sys
import warnings
import pyfixest as pf
import numpy as np
import pandas as pd


# NOTE(review): this silences every warning category for the rest of the
# session, so any warning raised by later cells will not be displayed.
warnings.filterwarnings("ignore")

In [2]:
# Repository root: the part of the working directory path that precedes the
# "da_case_studies" folder (the whole path if that folder name is absent).
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Folder locations; each one ends with "/" so file names can be appended directly.
chapter_dir = f"{dirname}da_case_studies/ch09-hotels-europe-stability/"
data_in = f"{dirname}da_data_repo/hotels-europe/clean/"
data_out = chapter_dir
output = f"{chapter_dir}output/"
func = f"{dirname}da_case_studies/ch00-tech-prep/"

# Make the helper-function folder importable.
sys.path.append(func)


In [3]:
# Import the prewritten helper functions
import py_helper_functions as da

In [4]:
# Read the clean, tidy price and feature tables used to build the workfile.
hotels_europe_price = pd.read_csv(os.path.join(data_in, "hotels-europe_price.csv"))
hotels_europe_features = pd.read_csv(os.path.join(data_in, "hotels-europe_features.csv"))

# Alternative: read the same two tables from OSF instead of the local repository.
# hotels_europe_price = pd.read_csv("https://osf.io/download/p6tyr/")
# hotels_europe_features = pd.read_csv("https://osf.io/download/utwjs/")


In [5]:
# Attach hotel features to each price row; the left join keeps every price row.
data = hotels_europe_price.merge(hotels_europe_features, on="hotel_id", how="left")


In [6]:
# Keep only rows whose actual city is one of the three cities compared
cities_kept = ["Vienna", "Amsterdam", "Barcelona"]
in_cities_kept = data["city_actual"].isin(cities_kept)
data = data.loc[in_cities_kept]


In [7]:
# Keep hotels and apartments only
accommodation_kept = ["Hotel", "Apartment"]
is_kept_type = data["accommodation_type"].isin(accommodation_kept)
data = data.loc[is_kept_type]


In [8]:
# Drop 4-night stays and offers priced at 1000 or above
not_four_nights = data["nnights"] != 4
below_price_cap = data["price"] < 1000
data = data[not_four_nights & below_price_cap]


In [9]:
# Remove fully duplicated rows (this drops them, it does not merely check)
data = data.drop_duplicates()


In [10]:
# Label observations with the date category they belong to.
# Each rule is (month, indicator column, indicator value, label); rows that
# match none of the rules are left untouched.
date_rules = [
    (11, "weekend", 0, "2017-NOV-weekday"),
    (11, "weekend", 1, "2017-NOV-weekend"),
    (12, "holiday", 1, "2017-DEC-holiday"),
    (6, "weekend", 1, "2018-JUNE-weekend"),
]
for month_no, flag_col, flag_val, date_label in date_rules:
    in_period = (data["month"] == month_no) & (data[flag_col] == flag_val)
    data.loc[in_period, "date"] = date_label


In [11]:
# Keep only observations that received a date label
data = data.dropna(subset=["date"])


In [12]:
# Number of observations per value of "city"
data["city"].value_counts()


city
Barcelona    1564
Vienna       1326
Amsterdam     830
Name: count, dtype: int64

In [13]:
# Observation counts by accommodation type (rows) and city (columns)
pd.crosstab(index=data["accommodation_type"], columns=data["city"])


city,Amsterdam,Barcelona,Vienna
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,31,300,457
Hotel,799,1264,869


In [14]:
# Observation counts by date label (rows) and city (columns)
pd.crosstab(index=data["date"], columns=data["city"])


city,Amsterdam,Barcelona,Vienna
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-DEC-holiday,290,420,338
2017-NOV-weekday,315,452,377
2017-NOV-weekend,125,393,256
2018-JUNE-weekend,100,299,355


In [15]:
# Natural log of price, stored as a new column
data = data.assign(lnprice=np.log(data["price"]))

In [16]:
# Columns carried into the work file; DataFrame.filter keeps those that are
# present, in this order.
work_columns = [
    "hotel_id",
    "date",
    "city",
    "accommodation_type",
    "stars",
    "rating",
    "distance",
    "price",
    "lnprice",
]
data = data.filter(items=work_columns)


In [17]:
# Save the work file; the DataFrame index is not written
work_file = os.path.join(data_out, "hotels_work.csv")
data.to_csv(work_file, index=False)


In [18]:
# Sample for the date comparison: Vienna hotels with 3 to 4 stars
has_3_to_4_stars = data["stars"].between(3, 4)
is_hotel = data["accommodation_type"] == "Hotel"
in_vienna = data["city"] == "Vienna"
data = data.loc[has_3_to_4_stars & is_hotel & in_vienna]

In [19]:
# Number of observations per date label in the current sample
data["date"].value_counts()


date
2017-NOV-weekday     207
2017-DEC-holiday     189
2018-JUNE-weekend    181
2017-NOV-weekend     125
Name: count, dtype: int64

In [20]:
# Summary statistics of distance, price and log price, rounded to 2 decimals
data[["distance", "price", "lnprice"]].describe().round(2)


Unnamed: 0,distance,price,lnprice
count,702.0,702.0,702.0
mean,1.57,122.75,4.74
std,1.15,53.3,0.37
min,0.0,50.0,3.91
25%,0.8,86.0,4.45
50%,1.4,109.0,4.69
75%,1.9,144.0,4.97
max,6.6,491.0,6.2


In [21]:
# Summary statistics of distance within each date label
data.groupby("date")["distance"].describe().round(3)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,1.503,1.06,0.0,0.8,1.3,1.9,5.2
2017-NOV-weekday,207.0,1.53,1.162,0.0,0.8,1.3,1.9,6.6
2017-NOV-weekend,125.0,1.773,1.298,0.0,0.9,1.6,2.1,6.6
2018-JUNE-weekend,181.0,1.531,1.13,0.0,0.8,1.3,1.9,6.6


In [22]:
# Summary statistics of price within each date label
data.groupby("date")["price"].describe().round(3)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,116.492,46.308,57.0,85.0,103.0,138.0,386.0
2017-NOV-weekday,207.0,109.976,42.221,50.0,82.0,100.0,129.5,383.0
2017-NOV-weekend,125.0,149.144,76.531,60.0,92.0,132.0,180.0,491.0
2018-JUNE-weekend,181.0,125.674,45.054,59.0,94.0,111.0,154.0,297.0


In [23]:
# Summary statistics of log price within each date label
data.groupby("date")["lnprice"].describe().round(3)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,4.697,0.335,4.043,4.443,4.635,4.927,5.956
2017-NOV-weekday,207.0,4.64,0.337,3.912,4.407,4.605,4.864,5.948
2017-NOV-weekend,125.0,4.902,0.438,4.094,4.522,4.883,5.193,6.196
2018-JUNE-weekend,181.0,4.776,0.334,4.078,4.543,4.71,5.037,5.694


## Regression with splines


### Table 9.3 External validity – comparing dates

In [24]:
# Date labels to compare; this order is also the column order of the
# regression tables built from `dates`.
dates = [
    "2017-NOV-weekday",
    "2017-NOV-weekend",
    "2017-DEC-holiday",
    "2018-JUNE-weekend",
]


In [25]:
# `lspline` is called by name inside the regression formula strings below.
# NOTE(review): presumably it must be visible in this namespace so that
# `context=0` in pf.feols can resolve it — confirm against pyfixest docs.
from py_helper_functions import lspline

In [26]:
# One spline regression of log price on distance (knot at 2) per date label,
# with HC1 standard errors; models are stored in the order of `dates`.
models = []
for date in dates:
    date_sample = data.loc[data["date"] == date]
    fitted = pf.feols(
        "lnprice ~ lspline(distance,[2])",
        data=date_sample,
        vcov="HC1",
        context=0,
    )
    models.append(fitted)

In [27]:
# Readable row names for the regression table
spline_labels = {
    "Intercept": "Constant",
    "lspline(distance, [2])[0]": "Distance spline <2",
    "lspline(distance, [2])[1]": "Distance spline 2–7",
}
pf.etable(models, model_heads=dates, labels=spline_labels)

Unnamed: 0_level_0,lnprice,lnprice,lnprice,lnprice
Unnamed: 0_level_1,2017-NOV-weekday,2017-NOV-weekend,2017-DEC-holiday,2018-JUNE-weekend
Unnamed: 0_level_2,(1),(2),(3),(4)
coef,coef,coef,coef,coef
Distance spline <2,-0.309*** (0.038),-0.444*** (0.052),-0.362*** (0.041),-0.313*** (0.037)
Distance spline 2–7,0.024 (0.033),-0.005 (0.036),0.070 (0.050),0.037 (0.039)
Constant,5.015*** (0.042),5.507*** (0.067),5.133*** (0.048),5.159*** (0.050)
stats,stats,stats,stats,stats
Observations,207,125,189,181
S.E. type,hetero,hetero,hetero,hetero
R2,0.314,0.430,0.382,0.306
"Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)"


### Table 9.4 External validity – comparing dates 2

In [28]:
# Number of rows (with a non-missing city) each hotel_id has in the sample
rows_per_hotel = data.groupby("hotel_id")["city"].transform("count")
data = data.assign(hotelcount=rows_per_hotel)


In [29]:
# How many rows have each hotelcount value, ordered by hotelcount
data["hotelcount"].value_counts().sort_index()


hotelcount
1      9
2     64
3    237
4    392
Name: count, dtype: int64

In [30]:
# Same regression per date label, restricted to hotels with hotelcount == 4
# (presumably hotels observed on all four dates — confirm).
models = []
for date in dates:
    balanced_sample = data.loc[(data["date"] == date) & (data["hotelcount"] == 4)]
    fitted = pf.feols(
        "lnprice ~ lspline(distance,[2])",
        data=balanced_sample,
        vcov="HC1",
        context=0,
    )
    models.append(fitted)

In [31]:
# Readable row names for the regression table
spline_labels = {
    "Intercept": "Constant",
    "lspline(distance, [2])[0]": "Distance spline <2",
    "lspline(distance, [2])[1]": "Distance spline 2–7",
}
pf.etable(models, model_heads=dates, labels=spline_labels)

Unnamed: 0_level_0,lnprice,lnprice,lnprice,lnprice
Unnamed: 0_level_1,2017-NOV-weekday,2017-NOV-weekend,2017-DEC-holiday,2018-JUNE-weekend
Unnamed: 0_level_2,(1),(2),(3),(4)
coef,coef,coef,coef,coef
Distance spline <2,-0.284*** (0.058),-0.445*** (0.055),-0.399*** (0.045),-0.283*** (0.053)
Distance spline 2–7,-0.033 (0.049),-0.016 (0.041),-0.009 (0.031),-0.025 (0.039)
Constant,5.024*** (0.068),5.520*** (0.069),5.193*** (0.067),5.116*** (0.078)
stats,stats,stats,stats,stats
Observations,98,98,98,98
S.E. type,hetero,hetero,hetero,hetero
R2,0.291,0.434,0.609,0.332
"Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)"


### Table 9.5 External validity – comparing cities

In [32]:
# Reload the full work file for the city comparison.
# The file is written with index=False, so its first column is hotel_id, not a
# saved index; reading it without index_col keeps hotel_id as a regular column.
data = pd.read_csv(os.path.join(data_out, "hotels_work.csv"))


In [33]:
# Sample for the city comparison: 3 to 4 star hotels on the November weekday
has_3_to_4_stars = data["stars"].between(3, 4)
on_nov_weekday = data["date"] == "2017-NOV-weekday"
is_hotel = data["accommodation_type"] == "Hotel"
data = data.loc[has_3_to_4_stars & on_nov_weekday & is_hotel]


In [34]:
# Observation counts by city (rows) and star rating (columns)
pd.crosstab(index=data["city"], columns=data["stars"])


stars,3.0,3.5,4.0
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amsterdam,103,17,75
Barcelona,101,4,144
Vienna,82,14,111


In [35]:
# Summary statistics of distance within each star rating
data.groupby("stars")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,286.0,1.36014,1.12706,0.1,0.6,1.0,1.8,6.6
3.5,35.0,1.654286,1.152578,0.1,0.9,1.6,2.05,5.3
4.0,330.0,1.381818,1.058173,0.0,0.6,1.1,1.8,6.0


In [36]:
# Summary statistics of distance within each distinct price value.
# NOTE(review): grouping by price yields one row per price level, mostly with a
# single observation; a summary of price by city may have been intended — confirm.
data.groupby("price")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
51,1.0,0.50,,0.5,0.500,0.50,0.500,0.5
52,2.0,2.25,1.202082,1.4,1.825,2.25,2.675,3.1
54,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
56,1.0,3.70,,3.7,3.700,3.70,3.700,3.7
...,...,...,...,...,...,...,...,...
312,1.0,0.50,,0.5,0.500,0.50,0.500,0.5
324,1.0,0.30,,0.3,0.300,0.30,0.300,0.3
342,1.0,0.30,,0.3,0.300,0.30,0.300,0.3
383,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [37]:
# NOTE(review): this expression is identical to the one in the preceding cell;
# a different summary (e.g. of lnprice) may have been intended — confirm.
data.groupby("price")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
51,1.0,0.50,,0.5,0.500,0.50,0.500,0.5
52,2.0,2.25,1.202082,1.4,1.825,2.25,2.675,3.1
54,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
56,1.0,3.70,,3.7,3.700,3.70,3.700,3.7
...,...,...,...,...,...,...,...,...
312,1.0,0.50,,0.5,0.500,0.50,0.500,0.5
324,1.0,0.30,,0.3,0.300,0.30,0.300,0.3
342,1.0,0.30,,0.3,0.300,0.30,0.300,0.3
383,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [38]:
# Cities to compare; this order is also the column order of the regression table
cities = ["Vienna", "Amsterdam", "Barcelona"]


In [39]:
# One spline regression of log price on distance (knot at 2) per city,
# with HC1 standard errors; models are stored in the order of `cities`.
models = []
for city in cities:
    city_sample = data.loc[data["city"] == city]
    fitted = pf.feols(
        "lnprice ~ lspline(distance,[2])",
        data=city_sample,
        vcov="HC1",
        context=0,
    )
    models.append(fitted)

In [40]:
# Readable row names for the regression table
spline_labels = {
    "Intercept": "Constant",
    "lspline(distance, [2])[0]": "Distance spline <2",
    "lspline(distance, [2])[1]": "Distance spline 2–7",
}
pf.etable(models, model_heads=cities, labels=spline_labels)

Unnamed: 0_level_0,lnprice,lnprice,lnprice
Unnamed: 0_level_1,Vienna,Amsterdam,Barcelona
Unnamed: 0_level_2,(1),(2),(3)
coef,coef,coef,coef
Distance spline <2,-0.309*** (0.038),-0.274*** (0.040),-0.057 (0.034)
Distance spline 2–7,0.024 (0.033),0.026 (0.037),-0.045 (0.058)
Constant,5.015*** (0.042),5.243*** (0.041),4.667*** (0.041)
stats,stats,stats,stats
Observations,207,195,249
S.E. type,hetero,hetero,hetero
R2,0.314,0.236,0.023
"Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)"


### Table 9.6 External validity – accommodation types

In [41]:
# Reload the full work file for the accommodation-type comparison.
# The file is written with index=False, so its first column is hotel_id, not a
# saved index; reading it without index_col keeps hotel_id as a regular column.
data = pd.read_csv(os.path.join(data_out, "hotels_work.csv"))


In [42]:
# Sample for the accommodation-type comparison: 3 to 4 star Vienna
# accommodations on the November weekday
has_3_to_4_stars = data["stars"].between(3, 4)
on_nov_weekday = data["date"] == "2017-NOV-weekday"
in_vienna = data["city"] == "Vienna"
data = data.loc[has_3_to_4_stars & on_nov_weekday & in_vienna]


In [43]:
# Observation counts by accommodation type (rows) and star rating (columns)
pd.crosstab(index=data["accommodation_type"], columns=data["stars"])


stars,3.0,3.5,4.0
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,34,41,17
Hotel,82,14,111


In [44]:
# Summary statistics of distance within each star rating
data.groupby("stars")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,116.0,1.850862,1.300468,0.1,0.9,1.55,2.3,6.9
3.5,55.0,1.372727,1.122242,0.1,0.35,1.4,1.75,5.1
4.0,128.0,1.303125,1.033908,0.0,0.5,1.0,1.7,4.8


In [45]:
# Summary statistics of distance within each distinct price value.
# NOTE(review): grouping by price yields one row per price level; a summary of
# price by accommodation type may have been intended — confirm.
data.groupby("price")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
52,1.0,3.10,,3.1,3.100,3.10,3.100,3.1
54,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
56,2.0,3.10,0.848528,2.5,2.800,3.10,3.400,3.7
58,2.0,1.40,0.000000,1.4,1.400,1.40,1.400,1.4
...,...,...,...,...,...,...,...,...
355,1.0,1.60,,1.6,1.600,1.60,1.600,1.6
363,1.0,1.50,,1.5,1.500,1.50,1.500,1.5
364,2.0,0.40,0.282843,0.2,0.300,0.40,0.500,0.6
383,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [46]:
# Summary statistics of distance within each distinct log-price value.
# NOTE(review): grouping by lnprice yields one row per value; a summary of
# lnprice by accommodation type may have been intended — confirm.
data.groupby("lnprice")["distance"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
lnprice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.912023,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
3.951244,1.0,3.10,,3.1,3.100,3.10,3.100,3.1
3.988984,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
4.025352,2.0,3.10,0.848528,2.5,2.800,3.10,3.400,3.7
4.060443,2.0,1.40,0.000000,1.4,1.400,1.40,1.400,1.4
...,...,...,...,...,...,...,...,...
5.872118,1.0,1.60,,1.6,1.600,1.60,1.600,1.6
5.894403,1.0,1.50,,1.5,1.500,1.50,1.500,1.5
5.897154,2.0,0.40,0.282843,0.2,0.300,0.40,0.500,0.6
5.948035,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [47]:
# Accommodation types to compare; this order is also the column order of the
# regression table
accommodation_types = ["Hotel", "Apartment"]


In [48]:
# One spline regression of log price on distance (knot at 2) per accommodation
# type, with HC1 standard errors; models follow the order of `accommodation_types`.
models = []
for atype in accommodation_types:
    type_sample = data.loc[data["accommodation_type"] == atype]
    fitted = pf.feols(
        "lnprice ~ lspline(distance,[2])",
        data=type_sample,
        vcov="HC1",
        context=0,
    )
    models.append(fitted)

In [49]:
# Readable row names for the regression table
spline_labels = {
    "Intercept": "Constant",
    "lspline(distance, [2])[0]": "Distance spline <2",
    "lspline(distance, [2])[1]": "Distance spline 2–7",
}
pf.etable(models, model_heads=accommodation_types, labels=spline_labels)

Unnamed: 0_level_0,lnprice,lnprice
Unnamed: 0_level_1,Hotel,Apartment
Unnamed: 0_level_2,(1),(2)
coef,coef,coef
Distance spline <2,-0.309*** (0.038),-0.255*** (0.068)
Distance spline 2–7,0.024 (0.033),0.117* (0.047)
Constant,5.015*** (0.042),5.153*** (0.096)
stats,stats,stats
Observations,207,92
S.E. type,hetero,hetero
R2,0.314,0.134
"Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)","Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001. Format of coefficient cell: Coefficient (Std. Error)"
