In [27]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

%matplotlib inline


  from pandas.core import datetools


In [80]:
df = pd.read_csv('data/listing_formatted_hr.csv', index_col=0)
X_internal = ['wifi', 'AC', 'kitchen', 'free_parking', 'smoking_ok',
       'breakfast', 'heating', 'fam_friendly', 'laundry', 'essentials',
       'workspace', 'bedding',  'utensils',
       'backyard', 'long_stay', 'is_house']
Y = ['hit_rate']
X_data = df[X_internal]
Y_data = df[Y]

  interactivity=interactivity, compiler=compiler, result=result)


## Regression On all Variables

In [76]:
model = sm.OLS(Y_data,sm.add_constant(X_data))
olsres = model.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:               hit_rate   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     122.7
Date:                Sat, 09 Sep 2017   Prob (F-statistic):               0.00
Time:                        11:30:56   Log-Likelihood:                -25214.
No. Observations:               56417   AIC:                         5.046e+04
Df Residuals:                   56400   BIC:                         5.061e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.5662      0.008     69.231   

## New Features after removing the non-significant variables

In [77]:
X_internal_new = ['wifi', 'AC', 'kitchen', 'free_parking', 'smoking_ok',
       'heating', 'fam_friendly', 'laundry', 'essentials',
       'workspace', 'is_house']
X_data_new = df[X_internal_new]

In [79]:
model = sm.OLS(Y_data,sm.add_constant(X_data_new))
olsres = model.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:               hit_rate   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     177.8
Date:                Sat, 09 Sep 2017   Prob (F-statistic):               0.00
Time:                        11:31:22   Log-Likelihood:                -25218.
No. Observations:               56417   AIC:                         5.046e+04
Df Residuals:                   56405   BIC:                         5.057e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.5662      0.008     69.235   

## Regression on the variables that is not monotone for all listings

In [84]:
X_unimportant = ['wifi','bedding','utensils','backyard','long_stay']
X_important = ['AC', 'kitchen', 'free_parking', 'smoking_ok',
       'breakfast', 'heating', 'fam_friendly', 'laundry', 'essentials','workspace', 'is_house']
X_data_imp = df[X_important]
Y_data = df[Y]

In [85]:
model = sm.OLS(Y_data,sm.add_constant(X_data_imp))
olsres = model.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:               hit_rate   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     176.8
Date:                Sat, 09 Sep 2017   Prob (F-statistic):               0.00
Time:                        11:33:09   Log-Likelihood:                -25223.
No. Observations:               56417   AIC:                         5.047e+04
Df Residuals:                   56405   BIC:                         5.058e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.5481      0.006     84.623   

## Volatilities

In [86]:
df = pd.read_csv('data/listing_formatted_hr_vol.csv', index_col=0)
df_cleaned = df.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [87]:
X_important = ['AC', 'kitchen', 'free_parking', 'smoking_ok',
       'breakfast', 'heating', 'fam_friendly', 'laundry', 'essentials','workspace', 'is_house']

Y_vol = ['volatility']
X_data = df_cleaned[X_important]
Y_vol_data = df_cleaned[Y_vol]

In [88]:
model = sm.OLS(Y_vol_data, sm.add_constant(X_data))
olsres = model.fit()
print(olsres.summary())

                            OLS Regression Results                            
Dep. Variable:             volatility   R-squared:                       0.087
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     18.72
Date:                Sat, 09 Sep 2017   Prob (F-statistic):           3.10e-36
Time:                        11:34:04   Log-Likelihood:                -9164.1
No. Observations:                2161   AIC:                         1.835e+04
Df Residuals:                    2149   BIC:                         1.842e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.2401      2.283      0.105   

## Regression on Volatilities after removing the insignificant variables

In [92]:
X_important_cleaned = ['AC', 'kitchen', 
       'breakfast', 'fam_friendly', 'essentials','workspace']

Y_vol = ['volatility']
X_data = df_cleaned[X_important_cleaned]
Y_vol_data = df_cleaned[Y_vol]

model = sm.OLS(Y_vol_data, X_data)
olsres = model.fit()
print(olsres.summary())


                            OLS Regression Results                            
Dep. Variable:             volatility   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.323
Method:                 Least Squares   F-statistic:                     172.6
Date:                Sat, 09 Sep 2017   Prob (F-statistic):          1.54e-179
Time:                        11:36:26   Log-Likelihood:                -9166.9
No. Observations:                2161   AIC:                         1.835e+04
Df Residuals:                    2155   BIC:                         1.838e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
AC               6.5973      1.025      6.438   