#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 09
**CH09B How stable is the hotel price - distance to center relationship?**

using the hotels-europe dataset

version 1.0 2021-05-05

In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
from patsy import bs,dmatrix,dmatrices
import sys
import statsmodels.api as sm

In [2]:
path = Path(os.getcwd())

In [3]:
base_dir = path.parent.parent

In [4]:
data_in = os.path.join(str(base_dir) , "da_data_repo/hotels-europe/clean/")

In [5]:
data_out = os.path.join(str(base_dir), "da_case_studies/ch09-hotels-europe-stability/")

In [6]:
func = os.path.join(str(base_dir) ,   "da_case_studies/ch00-tech-prep/")

In [7]:
sys.path.append(func)

In [8]:
from py_helper_functions import *

In [9]:
hotels_europe_price = pd.read_csv(os.path.join(data_in,"hotels-europe_price.csv"))

In [10]:
hotels_europe_features = pd.read_csv(os.path.join(data_in,"hotels-europe_features.csv"))

In [11]:
data = pd.merge(hotels_europe_price,hotels_europe_features,on='hotel_id',how='left')

In [12]:
data = data[data['city_actual'].isin(['Vienna','Amsterdam','Barcelona'])]

In [13]:
data = data[data['accommodation_type'].isin(['Hotel','Apartment'])]

In [14]:
data = data[data['nnights']!=4]

In [15]:
data = data[data['price']<1000]

In [16]:
data = data.drop_duplicates()

In [17]:
data.loc[(data['month']==11) & (data['weekend']==0),'date']='2017-NOV-weekday'
data.loc[(data['month']==11) & (data['weekend']==1),'date']='2017-NOV-weekend'
data.loc[(data['month']==12) & (data['holiday']==1),'date']='2017-DEC-holiday'
data.loc[(data['month']==6) & (data['weekend']==1),'date']='2018-JUNE-weekend'

In [18]:
data = data[data['date'].notna()]

In [19]:
data['city'].value_counts()

Barcelona    1564
Vienna       1326
Amsterdam     830
Name: city, dtype: int64

In [20]:
pd.crosstab(index=data['accommodation_type'], columns=data['city'])

city,Amsterdam,Barcelona,Vienna
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,31,300,457
Hotel,799,1264,869


In [21]:
pd.crosstab(index=data['date'], columns=data['city'])

city,Amsterdam,Barcelona,Vienna
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-DEC-holiday,290,420,338
2017-NOV-weekday,315,452,377
2017-NOV-weekend,125,393,256
2018-JUNE-weekend,100,299,355


In [22]:
data['lnprice']=data['price'].map(lambda x:np.log(x))

In [23]:
data = data[["hotel_id", "date", "city", "accommodation_type", "stars", "rating", "distance", "price", "lnprice"]]

In [24]:
data.to_csv(os.path.join(data_out,"hotels_work.csv"),index=False)

In [25]:
data = data[(data['stars']>=3) & (data['stars']<=4)]

In [26]:
data = data[data['accommodation_type'] == 'Hotel'] 

In [27]:
data = data[data['city']=='Vienna']

In [28]:
data['date'].value_counts()

2017-NOV-weekday     207
2017-DEC-holiday     189
2018-JUNE-weekend    181
2017-NOV-weekend     125
Name: date, dtype: int64

In [29]:
data[['distance','price','lnprice']].describe()

Unnamed: 0,distance,price,lnprice
count,702.0,702.0,702.0
mean,1.566382,122.752137,4.737121
std,1.154614,53.30483,0.366648
min,0.0,50.0,3.912023
25%,0.8,86.0,4.454347
50%,1.4,109.0,4.691348
75%,1.9,144.0,4.969813
max,6.6,491.0,6.196444


In [30]:
data.groupby('date')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,1.503175,1.059903,0.0,0.8,1.3,1.9,5.2
2017-NOV-weekday,207.0,1.529952,1.161507,0.0,0.8,1.3,1.9,6.6
2017-NOV-weekend,125.0,1.7728,1.298161,0.0,0.9,1.6,2.1,6.6
2018-JUNE-weekend,181.0,1.531492,1.13007,0.0,0.8,1.3,1.9,6.6


In [31]:
data.groupby('date')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,116.492063,46.308358,57.0,85.0,103.0,138.0,386.0
2017-NOV-weekday,207.0,109.975845,42.221381,50.0,82.0,100.0,129.5,383.0
2017-NOV-weekend,125.0,149.144,76.530903,60.0,92.0,132.0,180.0,491.0
2018-JUNE-weekend,181.0,125.674033,45.053534,59.0,94.0,111.0,154.0,297.0


In [32]:
data.groupby('date')['lnprice'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,4.69671,0.334883,4.043051,4.442651,4.634729,4.927254,5.955837
2017-NOV-weekday,207.0,4.640219,0.336751,3.912023,4.406719,4.60517,4.863673,5.948035
2017-NOV-weekend,125.0,4.902204,0.437582,4.094345,4.521789,4.882802,5.192957,6.196444
2018-JUNE-weekend,181.0,4.776133,0.334283,4.077537,4.543295,4.70953,5.036953,5.693732


In [33]:
## Regression with splines

In [34]:
y,X = dmatrices("lnprice ~ lspline(distance,2)",data[data.date=='2017-NOV-weekday'])

In [35]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.314
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     46.79
Date:                Thu, 15 Apr 2021   Prob (F-statistic):           1.89e-17
Time:                        13:13:04   Log-Likelihood:                -28.843
No. Observations:                 207   AIC:                             63.69
Df Residuals:                     204   BIC:                             73.68
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [36]:
print(results.get_robustcov_results(cov_type='HC1').summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.314
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     46.04
Date:                Thu, 15 Apr 2021   Prob (F-statistic):           3.16e-17
Time:                        13:14:33   Log-Likelihood:                -28.843
No. Observations:                 207   AIC:                             63.69
Df Residuals:                     204   BIC:                             73.68
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [37]:
dates = data['date'].unique().tolist()

In [38]:
dates.remove('2017-NOV-weekday')

In [39]:
coefs = []
for date in dates:
    y,X = dmatrices("lnprice ~ lspline(distance,2)",data[data.date==date])
    model = sm.OLS(y, X).fit()
    coefs.append(model.get_robustcov_results(cov_type='HC1').params)
    

In [40]:
pd.DataFrame(np.stack(coefs),index=dates,columns=['intercept','dist_0_2','dist_2_7'])

Unnamed: 0,intercept,dist_0_2,dist_2_7
2017-DEC-holiday,5.133454,-0.362366,0.069913
2018-JUNE-weekend,5.159441,-0.312674,0.037415
2017-NOV-weekend,5.507119,-0.444363,-0.004813


In [41]:
data['hotelcount']=data.groupby('hotel_id')['city'].transform('count')

In [42]:
y,X = dmatrices("lnprice ~ lspline(distance,2)",
                data[(data['date']=='2017-NOV-weekday') & (data['hotelcount']==4)])
model = sm.OLS(y, X).fit()
print(model.get_robustcov_results(cov_type='HC1').summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.276
Method:                 Least Squares   F-statistic:                     21.57
Date:                Thu, 15 Apr 2021   Prob (F-statistic):           1.89e-08
Time:                        13:14:44   Log-Likelihood:                -17.360
No. Observations:                  98   AIC:                             40.72
Df Residuals:                      95   BIC:                             48.47
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [43]:
coefs = []
for date in dates:
    y,X = dmatrices("lnprice ~ lspline(distance,2)",
                    data[(data['date']==date)&(data['hotelcount']==4)])
    model = sm.OLS(y, X).fit()
    coefs.append(model.get_robustcov_results(cov_type='HC1').params)

In [44]:
pd.DataFrame(np.stack(coefs),index=dates,columns=['intercept','dist_0_2','dist_2_7'])

Unnamed: 0,intercept,dist_0_2,dist_2_7
2017-DEC-holiday,5.193115,-0.398839,-0.008673
2018-JUNE-weekend,5.116127,-0.282597,-0.025418
2017-NOV-weekend,5.519827,-0.444645,-0.016016


In [45]:
data = pd.read_csv(os.path.join(data_out,"hotels_work.csv"))

In [46]:
data

Unnamed: 0.1,Unnamed: 0,hotel_id,date,city,accommodation_type,stars,rating,distance,price,lnprice
0,1,1,2017-NOV-weekend,Amsterdam,Hotel,4.0,4.3,3.1,172,5.147494
1,2,1,2017-DEC-holiday,Amsterdam,Hotel,4.0,4.3,3.1,122,4.804021
2,3,1,2017-NOV-weekday,Amsterdam,Hotel,4.0,4.3,3.1,114,4.736198
3,4,3,2017-DEC-holiday,Amsterdam,Hotel,4.0,4.1,1.5,118,4.770685
4,5,3,2017-NOV-weekend,Amsterdam,Hotel,4.0,4.1,1.5,217,5.379897
...,...,...,...,...,...,...,...,...,...,...
3715,3716,22408,2017-DEC-holiday,Vienna,Hotel,3.0,3.2,1.4,68,4.219508
3716,3717,22408,2017-NOV-weekday,Vienna,Hotel,3.0,3.2,1.4,58,4.060443
3717,3718,22409,2018-JUNE-weekend,Vienna,Apartment,3.5,4.0,0.7,130,4.867534
3718,3719,22409,2017-DEC-holiday,Vienna,Apartment,3.5,4.0,0.7,161,5.081404


In [47]:
data = data.loc[(data['stars']>=3) &
(data['stars']<=4) &
(data['city']=="Vienna") &
(data['date']=="2017-NOV-weekday"),:]

In [48]:
pd.crosstab(index=data['accommodation_type'], columns=data['stars'])

stars,3.0,3.5,4.0
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,34,41,17
Hotel,82,14,111


In [49]:
data.groupby('stars')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,116.0,1.850862,1.300468,0.1,0.9,1.55,2.3,6.9
3.5,55.0,1.372727,1.122242,0.1,0.35,1.4,1.75,5.1
4.0,128.0,1.303125,1.033908,0.0,0.5,1.0,1.7,4.8


In [50]:
data.groupby('price')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
52,1.0,3.10,,3.1,3.100,3.10,3.100,3.1
54,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
56,2.0,3.10,0.848528,2.5,2.800,3.10,3.400,3.7
58,2.0,1.40,0.000000,1.4,1.400,1.40,1.400,1.4
...,...,...,...,...,...,...,...,...
355,1.0,1.60,,1.6,1.600,1.60,1.600,1.6
363,1.0,1.50,,1.5,1.500,1.50,1.500,1.5
364,2.0,0.40,0.282843,0.2,0.300,0.40,0.500,0.6
383,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [51]:
data.groupby('lnprice')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
lnprice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.912023,1.0,3.90,,3.9,3.900,3.90,3.900,3.9
3.951244,1.0,3.10,,3.1,3.100,3.10,3.100,3.1
3.988984,2.0,1.55,0.636396,1.1,1.325,1.55,1.775,2.0
4.025352,2.0,3.10,0.848528,2.5,2.800,3.10,3.400,3.7
4.060443,2.0,1.40,0.000000,1.4,1.400,1.40,1.400,1.4
...,...,...,...,...,...,...,...,...
5.872118,1.0,1.60,,1.6,1.600,1.60,1.600,1.6
5.894403,1.0,1.50,,1.5,1.500,1.50,1.500,1.5
5.897154,2.0,0.40,0.282843,0.2,0.300,0.40,0.500,0.6
5.948035,1.0,1.90,,1.9,1.900,1.90,1.900,1.9


In [52]:
data

Unnamed: 0.1,Unnamed: 0,hotel_id,date,city,accommodation_type,stars,rating,distance,price,lnprice
2394,2395,21894,2017-NOV-weekday,Vienna,Apartment,4.0,4.4,2.7,81,4.394449
2400,2401,21897,2017-NOV-weekday,Vienna,Hotel,4.0,3.9,1.7,81,4.394449
2408,2409,21901,2017-NOV-weekday,Vienna,Hotel,4.0,3.7,1.4,85,4.442651
2413,2414,21902,2017-NOV-weekday,Vienna,Hotel,3.0,4.0,1.7,83,4.418841
2417,2418,21903,2017-NOV-weekday,Vienna,Hotel,4.0,3.9,1.2,82,4.406719
...,...,...,...,...,...,...,...,...,...,...
3699,3700,22403,2017-NOV-weekday,Vienna,Hotel,3.0,3.4,1.5,73,4.290459
3705,3706,22404,2017-NOV-weekday,Vienna,Apartment,3.0,5.0,1.5,109,4.691348
3712,3713,22407,2017-NOV-weekday,Vienna,Hotel,4.0,4.4,1.0,100,4.605170
3716,3717,22408,2017-NOV-weekday,Vienna,Hotel,3.0,3.2,1.4,58,4.060443


In [53]:
y,X = dmatrices("lnprice ~ lspline(distance,2)",
                data[(data['accommodation_type']=='Hotel')])
model = sm.OLS(y, X).fit()
print(model.get_robustcov_results(cov_type='HC1').summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.314
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     46.04
Date:                Thu, 15 Apr 2021   Prob (F-statistic):           3.16e-17
Time:                        13:15:18   Log-Likelihood:                -28.843
No. Observations:                 207   AIC:                             63.69
Df Residuals:                     204   BIC:                             73.68
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [54]:
y,X = dmatrices("lnprice ~ lspline(distance,2)",
                data[(data['accommodation_type']=='Apartment')])
model = sm.OLS(y, X).fit()
print(model.get_robustcov_results(cov_type='HC1').summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.134
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     7.597
Date:                Thu, 15 Apr 2021   Prob (F-statistic):           0.000899
Time:                        13:15:32   Log-Likelihood:                -48.775
No. Observations:                  92   AIC:                             103.5
Df Residuals:                      89   BIC:                             111.1
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 