In [36]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
path = Path(os.getcwd())

In [3]:
base_dir = path.parent.parent

In [4]:
data_in = os.path.join(str(base_dir) , "da_data_repo/hotels-vienna/clean/")

In [5]:
data_out = os.path.join(str(base_dir), "da_case_studies/ch10-hotels-multiple-reg/")

In [7]:
hotels = pd.read_csv(os.path.join(data_in,"hotels-vienna.csv"))

In [14]:
hotels = hotels.query('accommodation_type=="Hotel"'). \
       query('city_actual=="Vienna"'). \
       query('stars>=3 & stars<=4'). \
       query('price<=600')

In [17]:
hotels = hotels[hotels['stars'].notnull()]

In [18]:
len(hotels)

207

In [20]:
hotels['lnprice']=hotels['price'].map(lambda x:np.log(x))

In [21]:
hotels['distance2']=hotels['distance']

In [24]:
hotels.loc[hotels['distance2']<0.05,'distance2']=0.05

In [26]:
hotels['lndistance']=hotels['distance2'].map(lambda x:np.log(x))

In [31]:
hotels['star35']=(hotels['stars']==3.5).map(int)

In [32]:
hotels['star4']=(hotels['stars']==4).map(int)

In [33]:
hotels['price'].describe()

count    207.000000
mean     109.975845
std       42.221381
min       50.000000
25%       82.000000
50%      100.000000
75%      129.500000
max      383.000000
Name: price, dtype: float64

In [34]:
hotels['distance'].describe()

count    207.000000
mean       1.529952
std        1.161507
min        0.000000
25%        0.800000
50%        1.300000
75%        1.900000
max        6.600000
Name: distance, dtype: float64

In [35]:
hotels['lnprice'].describe()

count    207.000000
mean       4.640219
std        0.336751
min        3.912023
25%        4.406719
50%        4.605170
75%        4.863673
max        5.948035
Name: lnprice, dtype: float64

In [44]:
reg0 = smf.ols('lnprice ~ rating', data=hotels).fit()
reg1 = smf.ols('lnprice ~ distance', data=hotels).fit()
reg2 = smf.ols('lnprice ~ distance + rating', data=hotels).fit()

In [45]:
results

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x12550e160>

In [46]:
reg0.get_robustcov_results(cov_type='HC1')
reg1.get_robustcov_results(cov_type='HC1')
reg2.get_robustcov_results(cov_type='HC1')

<statsmodels.regression.linear_model.OLSResults at 0x12579ceb8>

In [47]:
print(reg0.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.252
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     69.11
Date:                Fri, 19 Jun 2020   Prob (F-statistic):           1.28e-14
Time:                        14:42:40   Log-Likelihood:                -37.850
No. Observations:                 207   AIC:                             79.70
Df Residuals:                     205   BIC:                             86.37
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.8460      0.217     13.128      0.0

In [48]:
print(reg1.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.205
Model:                            OLS   Adj. R-squared:                  0.201
Method:                 Least Squares   F-statistic:                     52.90
Date:                Fri, 19 Jun 2020   Prob (F-statistic):           7.30e-12
Time:                        14:42:56   Log-Likelihood:                -44.160
No. Observations:                 207   AIC:                             92.32
Df Residuals:                     205   BIC:                             98.99
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.8411      0.035    139.720      0.0

In [49]:
print(reg2.summary())

                            OLS Regression Results                            
Dep. Variable:                lnprice   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.359
Method:                 Least Squares   F-statistic:                     58.69
Date:                Fri, 19 Jun 2020   Prob (F-statistic):           7.34e-21
Time:                        14:43:05   Log-Likelihood:                -20.875
No. Observations:                 207   AIC:                             47.75
Df Residuals:                     204   BIC:                             57.75
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.3160      0.215     15.435      0.0