In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np

In [2]:
# Current working directory as a Path; the base directory is derived from it.
path = Path.cwd()

In [3]:
# Base directory: two levels above the current working directory.
# The input (data_in) and output (data_out) folders are built relative to it.
base_dir = path.parent.parent

In [4]:
# Folder with the cleaned hotels-europe input tables, located under the base directory.
data_in = os.path.join(os.fspath(base_dir), "da_data_repo/hotels-europe/clean/")

In [5]:
# Folder where this case study writes its work file, located under the base directory.
data_out = os.path.join(os.fspath(base_dir), "da_case_studies/ch09-hotels-europe-stability/")

In [6]:
# Price table, read from the clean hotels-europe input folder.
hotels_europe_price = pd.read_csv(os.path.join(data_in,"hotels-europe_price.csv"))

In [7]:
# Features table from the same input folder; it is joined to the prices on hotel_id.
hotels_europe_features = pd.read_csv(os.path.join(data_in,"hotels-europe_features.csv"))

In [8]:
# Left join on hotel_id: every row of the price table is kept and the matching
# feature columns are attached to it.
# NOTE(review): no `validate=` is passed, so repeated hotel_id values in the
# features table would multiply price rows -- confirm the key is unique there.
data = pd.merge(hotels_europe_price,hotels_europe_features,on='hotel_id',how='left')

In [9]:
# Keep only rows whose `city_actual` is Vienna, Amsterdam or Barcelona.
# NOTE(review): this filter uses `city_actual`, while later cells tabulate and
# keep the `city` column -- confirm the two columns agree for these rows.
data = data[data['city_actual'].isin(['Vienna','Amsterdam','Barcelona'])]

In [10]:
# Keep only rows whose accommodation type is Hotel or Apartment.
data = data[data['accommodation_type'].isin(['Hotel','Apartment'])]

In [11]:
# Drop rows where `nnights` equals 4 (presumably the number of nights booked -- confirm).
# NOTE(review): the reason for excluding exactly this value is not stated in the notebook.
data = data[data['nnights']!=4]

In [12]:
# Keep rows with a price strictly below 1000.
# NOTE(review): the cutoff is a magic number, presumably meant to trim extreme
# prices -- confirm, and consider a named constant.
data = data[data['price']<1000]

In [13]:
# Remove rows that are exact duplicates across all columns (no `subset` is given).
data = data.drop_duplicates()

In [14]:
# Label each observation with the date bucket it belongs to. Rows matching none
# of the four rules keep a missing `date` and can be filtered out afterwards.
#
# `data` comes from a chain of boolean filters, so take an explicit copy first:
# the .loc assignments below then write into an independent frame and cannot
# trigger pandas' SettingWithCopyWarning.
# NOTE(review): the year in each label is hard-coded and no year column is
# checked here -- confirm the extract only covers these periods.
data = data.copy()

data.loc[(data['month'] == 11) & (data['weekend'] == 0), 'date'] = '2017-NOV-weekday'
data.loc[(data['month'] == 11) & (data['weekend'] == 1), 'date'] = '2017-NOV-weekend'
data.loc[(data['month'] == 12) & (data['holiday'] == 1), 'date'] = '2017-DEC-holiday'
data.loc[(data['month'] == 6) & (data['weekend'] == 1), 'date'] = '2018-JUNE-weekend'

In [15]:
data['date']

0          2017-NOV-weekend
1                       NaN
2          2017-DEC-holiday
4                       NaN
5          2017-NOV-weekday
9          2017-DEC-holiday
10         2017-NOV-weekend
11                      NaN
13         2017-NOV-weekday
14         2017-DEC-holiday
15         2017-NOV-weekday
16                      NaN
17                      NaN
18         2017-NOV-weekend
19                      NaN
20                      NaN
21         2017-NOV-weekday
22                      NaN
24        2018-JUNE-weekend
25         2017-DEC-holiday
26                      NaN
27         2017-NOV-weekday
28         2017-DEC-holiday
29                      NaN
30                      NaN
31         2017-NOV-weekend
33                      NaN
34         2017-NOV-weekday
35         2017-DEC-holiday
36                      NaN
                ...        
144073     2017-NOV-weekend
144074                  NaN
144075                  NaN
144076     2017-NOV-weekday
144077              

In [16]:
# Keep only the rows that received a date label.
data = data.dropna(subset=['date'])

In [17]:
data['city'].value_counts()

Barcelona    1564
Vienna       1326
Amsterdam     830
Name: city, dtype: int64

In [18]:
pd.crosstab(index=data['accommodation_type'], columns=data['city'])

city,Amsterdam,Barcelona,Vienna
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,31,300,457
Hotel,799,1264,869


In [19]:
pd.crosstab(index=data['date'], columns=data['city'])

city,Amsterdam,Barcelona,Vienna
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-DEC-holiday,290,420,338
2017-NOV-weekday,315,452,377
2017-NOV-weekend,125,393,256
2018-JUNE-weekend,100,299,355


In [20]:
# Natural log of price, computed in one vectorized call rather than a per-row
# Python lambda. assign() returns a new frame, so the column is not written
# into a filtered slice of an earlier frame (avoids SettingWithCopyWarning).
data = data.assign(lnprice=np.log(data['price']))

In [21]:
# Restrict the frame to the columns saved in the work file.
data = data.loc[
    :,
    [
        "hotel_id",
        "date",
        "city",
        "accommodation_type",
        "stars",
        "rating",
        "distance",
        "price",
        "lnprice",
    ],
]

In [22]:
data.shape

(3720, 9)

In [23]:
# Save the multi-city work file; later cells reload it before applying other filters.
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs(data_out, exist_ok=True)
data.to_csv(os.path.join(data_out,"hotels_work.csv"),index=False)

In [24]:
# Keep listings rated from 3 to 4 stars, both ends included.
data = data[data['stars'].between(3, 4)]

In [25]:
data.shape

(2800, 9)

In [26]:
# Keep hotels only (apartments are dropped).
data = data[data['accommodation_type'].eq('Hotel')]

In [27]:
data.shape

(2153, 9)

In [28]:
# Keep Vienna only.
data = data[data['city'].eq('Vienna')]

In [29]:
data.shape

(702, 9)

In [30]:
data['date'].value_counts()

2017-NOV-weekday     207
2017-DEC-holiday     189
2018-JUNE-weekend    181
2017-NOV-weekend     125
Name: date, dtype: int64

In [31]:
data['distance'].describe()

count    702.000000
mean       1.566382
std        1.154614
min        0.000000
25%        0.800000
50%        1.400000
75%        1.900000
max        6.600000
Name: distance, dtype: float64

In [32]:
data['price'].describe()

count    702.000000
mean     122.752137
std       53.304830
min       50.000000
25%       86.000000
50%      109.000000
75%      144.000000
max      491.000000
Name: price, dtype: float64

In [33]:
data['lnprice'].describe()

count    702.000000
mean       4.737121
std        0.366648
min        3.912023
25%        4.454347
50%        4.691348
75%        4.969813
max        6.196444
Name: lnprice, dtype: float64

In [34]:
data.groupby('date')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,1.503175,1.059903,0.0,0.8,1.3,1.9,5.2
2017-NOV-weekday,207.0,1.529952,1.161507,0.0,0.8,1.3,1.9,6.6
2017-NOV-weekend,125.0,1.7728,1.298161,0.0,0.9,1.6,2.1,6.6
2018-JUNE-weekend,181.0,1.531492,1.13007,0.0,0.8,1.3,1.9,6.6


In [35]:
data.groupby('date')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,116.492063,46.308358,57.0,85.0,103.0,138.0,386.0
2017-NOV-weekday,207.0,109.975845,42.221381,50.0,82.0,100.0,129.5,383.0
2017-NOV-weekend,125.0,149.144,76.530903,60.0,92.0,132.0,180.0,491.0
2018-JUNE-weekend,181.0,125.674033,45.053534,59.0,94.0,111.0,154.0,297.0


In [36]:
data.groupby('date')['lnprice'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-DEC-holiday,189.0,4.69671,0.334883,4.043051,4.442651,4.634729,4.927254,5.955837
2017-NOV-weekday,207.0,4.640219,0.336751,3.912023,4.406719,4.60517,4.863673,5.948035
2017-NOV-weekend,125.0,4.902204,0.437582,4.094345,4.521789,4.882802,5.192957,6.196444
2018-JUNE-weekend,181.0,4.776133,0.334283,4.077537,4.543295,4.70953,5.036953,5.693732


In [37]:
## TODO: the median isn't reported under its own name (describe() shows it as the 50% column) -- is this a problem?

In [38]:
## Regression with splines should go here

In [39]:
# Reload the saved multi-city work file: `data` was narrowed to Vienna 3-4 star
# hotels above, and the comparison by city below needs all cities again.
data = pd.read_csv(os.path.join(data_out,"hotels_work.csv"))

In [40]:
data.head()

Unnamed: 0,hotel_id,date,city,accommodation_type,stars,rating,distance,price,lnprice
0,1,2017-NOV-weekend,Amsterdam,Hotel,4.0,4.3,3.1,172,5.147494
1,1,2017-DEC-holiday,Amsterdam,Hotel,4.0,4.3,3.1,122,4.804021
2,1,2017-NOV-weekday,Amsterdam,Hotel,4.0,4.3,3.1,114,4.736198
3,3,2017-DEC-holiday,Amsterdam,Hotel,4.0,4.1,1.5,118,4.770685
4,3,2017-NOV-weekend,Amsterdam,Hotel,4.0,4.1,1.5,217,5.379897


In [41]:
# Keep listings rated from 3 to 4 stars, both ends included.
data = data[data['stars'].between(3, 4)]

In [42]:
# Keep hotels only (apartments are dropped).
data = data[data['accommodation_type'].eq('Hotel')]

In [43]:
# Keep a single date bucket: the November 2017 weekday.
data = data[data['date'].eq("2017-NOV-weekday")]

In [44]:
data.groupby('city')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amsterdam,195.0,1.473846,1.286281,0.1,0.55,1.0,1.9,6.0
Barcelona,249.0,1.2,0.815673,0.1,0.6,1.0,1.7,4.6
Vienna,207.0,1.529952,1.161507,0.0,0.8,1.3,1.9,6.6


In [46]:
data.groupby('city')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amsterdam,195.0,148.317949,63.153237,63.0,110.0,134.0,172.0,690.0
Barcelona,249.0,103.97992,33.027902,51.0,81.0,98.0,118.0,264.0
Vienna,207.0,109.975845,42.221381,50.0,82.0,100.0,129.5,383.0


In [47]:
data.groupby('city')['lnprice'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amsterdam,195.0,4.935041,0.344122,4.143135,4.70048,4.89784,5.147494,6.536692
Barcelona,249.0,4.600394,0.290127,3.931826,4.394449,4.584967,4.770685,5.575949
Vienna,207.0,4.640219,0.336751,3.912023,4.406719,4.60517,4.863673,5.948035


In [48]:
## TODO: add the spline regression here

In [49]:
# Reload the saved work file again: the previous section kept hotels only, and
# the tables below compare accommodation types and star ratings.
data = pd.read_csv(os.path.join(data_out,"hotels_work.csv"))

In [50]:
# Keep listings rated from 3 to 4 stars, both ends included.
data = data[data['stars'].between(3, 4)]

In [51]:
# Keep Vienna only.
data = data[data['city'].eq('Vienna')]

In [52]:
# Keep a single date bucket: the November 2017 weekday.
data = data[data['date'].eq("2017-NOV-weekday")]

In [53]:
pd.crosstab(index=data['accommodation_type'], columns=data['stars'])

stars,3.0,3.5,4.0
accommodation_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apartment,34,41,17
Hotel,82,14,111


In [55]:
data.groupby('stars')['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,116.0,1.850862,1.300468,0.1,0.9,1.55,2.3,6.9
3.5,55.0,1.372727,1.122242,0.1,0.35,1.4,1.75,5.1
4.0,128.0,1.303125,1.033908,0.0,0.5,1.0,1.7,4.8


In [56]:
data.groupby('stars')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,116.0,101.508621,42.218581,50.0,77.0,88.0,113.0,383.0
3.5,55.0,143.072727,84.948973,56.0,91.5,116.0,153.0,511.0
4.0,128.0,128.929688,51.467318,60.0,94.5,117.0,151.25,364.0


In [57]:
data.groupby('stars')['lnprice'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3.0,116.0,4.557955,0.336048,3.912023,4.343805,4.477337,4.727388,5.948035
3.5,55.0,4.843521,0.459908,4.025352,4.516324,4.75359,5.030246,6.23637
4.0,128.0,4.79388,0.352248,4.094345,4.548558,4.762174,5.01893,5.897154


In [58]:
## TODO: regression with splines should go here