In [1]:
# Libraries set-up

%load_ext lab_black
import os
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [2]:
# Load the 2021 listings table (with precomputed distance columns) from the
# project's data directory, relative to this notebook.

data_2021 = pd.read_csv(filepath_or_buffer="../data/data_2021_distance.csv")

In [3]:
# Restrict the raw table to the columns used by the regressions below, then
# discard every row that has a missing value in any of those columns.

data = data_2021.loc[
    :,
    [
        "price",
        "new_neighbourhood",
        "neighbourhood_group",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
        "station_dist",
        "station_dist2",
        "park_dist",
        "park_dist2",
    ],
].dropna()

In [4]:
# Dependent variable shared by every regression in this notebook: listing price

Y = data.loc[:, "price"]

In [5]:
# Set up data for simple linear regression of price on neighbourhood group.
# Bronx is the omitted baseline category (its dummy is dropped), so the model's
# intercept stands in for Bronx.

datang = pd.get_dummies(
    data[["price", "neighbourhood_group"]], columns=["neighbourhood_group"]
)
# Drop the outcome and the baseline dummy without mutating in place, so that
# re-running this cell always yields the same frame.
# astype returns a NEW frame; the result must be assigned for the cast to take
# effect. get_dummies may emit bool/uint8 columns depending on the pandas
# version, and statsmodels needs a purely numeric design matrix.
datang = datang.drop(columns=["price", "neighbourhood_group_Bronx"]).astype(float)

# Xng is kept as a second name for the regressor frame, as in the original cell
Xng = datang
XngC = sm.add_constant(Xng)

In [6]:
# Fit OLS of price on the neighbourhood-group dummies (plus intercept) and
# display the fitted model's summary table

estng = sm.OLS(endog=Y, exog=XngC)
est_ng = estng.fit()
est_ng.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,110.6
Date:,"Thu, 12 May 2022",Prob (F-statistic):,1.16e-93
Time:,20:29:08,Log-Likelihood:,-193480.0
No. Observations:,27627,AIC:,387000.0
Df Residuals:,27622,BIC:,387000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,94.8289,9.515,9.966,0.000,76.178,113.480
neighbourhood_group_Brooklyn,43.7167,9.841,4.442,0.000,24.427,63.006
neighbourhood_group_Manhattan,99.6544,9.832,10.135,0.000,80.382,118.926
neighbourhood_group_Queens,17.6429,10.458,1.687,0.092,-2.855,38.141
neighbourhood_group_Staten Island,8.3145,18.740,0.444,0.657,-28.417,45.046

0,1,2,3
Omnibus:,63756.943,Durbin-Watson:,1.847
Prob(Omnibus):,0.0,Jarque-Bera (JB):,641016693.598
Skew:,22.579,Prob(JB):,0.0
Kurtosis:,747.864,Cond. No.,17.0


In [7]:
# Set up data for simple linear regression of price on neighbourhood.
# Greenpoint is the omitted baseline neighbourhood (its dummy is dropped), so
# the model's intercept stands in for Greenpoint.

datan = pd.get_dummies(
    data[["price", "new_neighbourhood"]], columns=["new_neighbourhood"]
)
# Drop the outcome and the baseline dummy without mutating in place, so that
# re-running this cell always yields the same frame.
# astype returns a NEW frame; the result must be assigned for the cast to take
# effect. get_dummies may emit bool/uint8 columns depending on the pandas
# version, and statsmodels needs a purely numeric design matrix.
datan = datan.drop(columns=["price", "new_neighbourhood_Greenpoint"]).astype(float)

# Xn is kept as a second name for the regressor frame, as in the original cell
Xn = datan
XnC = sm.add_constant(Xn)

In [8]:
# Fit OLS of price on the individual neighbourhood dummies (plus intercept) and
# display the fitted model's summary table

estn = sm.OLS(endog=Y, exog=XnC)
est_n = estn.fit()
est_n.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.045
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,5.83
Date:,"Thu, 12 May 2022",Prob (F-statistic):,6.659999999999999e-147
Time:,20:29:09,Log-Likelihood:,-193060.0
No. Observations:,27627,AIC:,386600.0
Df Residuals:,27403,BIC:,388400.0
Df Model:,223,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,187.1439,11.309,16.548,0.000,164.978,209.310
new_neighbourhood_Allerton,-78.1439,84.022,-0.930,0.352,-242.832,86.544
new_neighbourhood_Annadale-Huguenot-Prince's Bay-Woodrow,-66.8106,152.427,-0.438,0.661,-365.575,231.954
new_neighbourhood_Arden Heights-Rossville,-87.6439,132.127,-0.663,0.507,-346.619,171.331
new_neighbourhood_Astoria (Central),-54.2719,19.177,-2.830,0.005,-91.859,-16.684
new_neighbourhood_Astoria (East)-Woodside (North),-103.0128,22.510,-4.576,0.000,-147.133,-58.893
new_neighbourhood_Astoria (North)-Ditmars-Steinway,-89.3967,22.745,-3.930,0.000,-133.978,-44.816
new_neighbourhood_Astoria Park,-86.1439,100.152,-0.860,0.390,-282.448,110.160
new_neighbourhood_Auburndale,-85.6439,57.260,-1.496,0.135,-197.877,26.589

0,1,2,3
Omnibus:,64464.762,Durbin-Watson:,1.851
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696646043.292
Skew:,23.183,Prob(JB):,0.0
Kurtosis:,779.555,Cond. No.,171.0


In [9]:
# Neighbourhoods seem to predict price better than boroughs (groups). We now
# repeat the process, adding room-type dummies, total reviews
# (number_of_reviews), average reviews per month (reviews_per_month), host total
# listings (calculated_host_listings_count) and number_of_reviews_ltm
# (presumably reviews over the last twelve months -- TODO confirm against the
# data dictionary).
# Baselines (dropped dummies): Greenpoint for neighbourhood, "Private room" for
# room type.

data3 = data[
    [
        "price",
        "new_neighbourhood",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
    ]
]
# Neighbourhood dummies are appended first, then room-type dummies, matching the
# column order of encoding them one after the other.
data3 = pd.get_dummies(data3, columns=["new_neighbourhood", "room_type"])
# Drop the outcome and both baseline dummies without mutating in place.
# astype returns a NEW frame; the result must be assigned for the cast to take
# effect (statsmodels needs a purely numeric design matrix).
data3 = data3.drop(
    columns=["price", "new_neighbourhood_Greenpoint", "room_type_Private room"]
).astype(float)

# X3 is kept as a second name for the regressor frame, as in the original cell
X3 = data3
X3C = sm.add_constant(X3)

est3 = sm.OLS(Y, X3C)
est_3 = est3.fit()
est_3.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.061
Method:,Least Squares,F-statistic:,8.87
Date:,"Thu, 12 May 2022",Prob (F-statistic):,9.88e-275
Time:,20:29:10,Log-Likelihood:,-192710.0
No. Observations:,27627,AIC:,385900.0
Df Residuals:,27396,BIC:,387800.0
Df Model:,230,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,133.2739,11.417,11.673,0.000,110.896,155.651
number_of_reviews,-0.1243,0.034,-3.701,0.000,-0.190,-0.058
reviews_per_month,0.2266,0.497,0.456,0.648,-0.747,1.200
calculated_host_listings_count,-0.1121,0.064,-1.739,0.082,-0.239,0.014
number_of_reviews_ltm,0.1883,0.140,1.340,0.180,-0.087,0.464
new_neighbourhood_Allerton,-72.9311,82.984,-0.879,0.379,-235.584,89.722
new_neighbourhood_Annadale-Huguenot-Prince's Bay-Woodrow,-63.6117,150.534,-0.423,0.673,-358.667,231.443
new_neighbourhood_Arden Heights-Rossville,-100.1848,130.494,-0.768,0.443,-355.960,155.590
new_neighbourhood_Astoria (Central),-32.3586,18.961,-1.707,0.088,-69.524,4.807

0,1,2,3
Omnibus:,65466.492,Durbin-Watson:,1.846
Prob(Omnibus):,0.0,Jarque-Bera (JB):,774902387.259
Skew:,24.068,Prob(JB):,0.0
Kurtosis:,822.056,Cond. No.,10800.0


In [10]:
# Finally, we add two regressors to the former regression: distance to the nearest station and distance to the nearest
# park. We first do this using linear distance and then using taxi distance, and choose whichever performs better

In [11]:
# Using linear distance (columns station_dist and park_dist).
# Baselines (dropped dummies): Greenpoint for neighbourhood, "Private room" for
# room type.

data4 = data[
    [
        "price",
        "new_neighbourhood",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
        "station_dist",
        "park_dist",
    ]
]
# Neighbourhood dummies are appended first, then room-type dummies, matching the
# column order of encoding them one after the other.
data4 = pd.get_dummies(data4, columns=["new_neighbourhood", "room_type"])
# Drop the outcome and both baseline dummies without mutating in place.
# astype returns a NEW frame; the result must be assigned for the cast to take
# effect (statsmodels needs a purely numeric design matrix).
data4 = data4.drop(
    columns=["price", "new_neighbourhood_Greenpoint", "room_type_Private room"]
).astype(float)

# X4 is kept as a second name for the regressor frame, as in the original cell
X4 = data4
X4C = sm.add_constant(X4)

est4 = sm.OLS(Y, X4C)
est_4 = est4.fit()
est_4.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,8.81
Date:,"Thu, 12 May 2022",Prob (F-statistic):,1.69e-274
Time:,20:29:11,Log-Likelihood:,-192710.0
No. Observations:,27627,AIC:,385900.0
Df Residuals:,27394,BIC:,387800.0
Df Model:,232,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,131.4362,12.193,10.780,0.000,107.538,155.335
number_of_reviews,-0.1252,0.034,-3.729,0.000,-0.191,-0.059
reviews_per_month,0.2258,0.497,0.455,0.649,-0.748,1.200
calculated_host_listings_count,-0.1126,0.064,-1.746,0.081,-0.239,0.014
number_of_reviews_ltm,0.1896,0.140,1.350,0.177,-0.086,0.465
station_dist,-9.4855,11.135,-0.852,0.394,-31.310,12.339
park_dist,15.3061,9.231,1.658,0.097,-2.787,33.399
new_neighbourhood_Allerton,-72.7040,82.982,-0.876,0.381,-235.352,89.944
new_neighbourhood_Annadale-Huguenot-Prince's Bay-Woodrow,26.8717,181.996,0.148,0.883,-329.851,383.594

0,1,2,3
Omnibus:,65463.809,Durbin-Watson:,1.846
Prob(Omnibus):,0.0,Jarque-Bera (JB):,774731123.497
Skew:,24.065,Prob(JB):,0.0
Kurtosis:,821.965,Cond. No.,15100.0


In [15]:
# Using taxi distance (columns station_dist2 and park_dist2).
# Baselines (dropped dummies): Greenpoint for neighbourhood, "Private room" for
# room type.

data5 = data[
    [
        "price",
        "new_neighbourhood",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
        "station_dist2",
        "park_dist2",
    ]
]
# Neighbourhood dummies are appended first, then room-type dummies, matching the
# column order of encoding them one after the other.
data5 = pd.get_dummies(data5, columns=["new_neighbourhood", "room_type"])
# Drop the outcome and both baseline dummies without mutating in place.
# astype returns a NEW frame; the result must be assigned for the cast to take
# effect (statsmodels needs a purely numeric design matrix).
data5 = data5.drop(
    columns=["price", "new_neighbourhood_Greenpoint", "room_type_Private room"]
).astype(float)

# X5 is kept as a second name for the regressor frame, as in the original cell
X5 = data5
X5C = sm.add_constant(X5)

est5 = sm.OLS(Y, X5C)
est_5 = est5.fit()
est_5.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.07
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,8.831
Date:,"Thu, 12 May 2022",Prob (F-statistic):,2.37e-275
Time:,20:34:32,Log-Likelihood:,-192700.0
No. Observations:,27627,AIC:,385900.0
Df Residuals:,27394,BIC:,387800.0
Df Model:,232,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,131.2326,12.139,10.811,0.000,107.440,155.025
number_of_reviews,-0.1257,0.034,-3.744,0.000,-0.192,-0.060
reviews_per_month,0.2268,0.497,0.456,0.648,-0.747,1.201
calculated_host_listings_count,-0.1122,0.064,-1.739,0.082,-0.239,0.014
number_of_reviews_ltm,0.1893,0.140,1.348,0.178,-0.086,0.465
station_dist2,-10.2989,7.609,-1.353,0.176,-25.213,4.615
park_dist2,15.8358,6.540,2.421,0.015,3.017,28.655
new_neighbourhood_Allerton,-71.3545,82.977,-0.860,0.390,-233.993,91.284
new_neighbourhood_Annadale-Huguenot-Prince's Bay-Woodrow,98.2383,190.014,0.517,0.605,-274.200,470.676

0,1,2,3
Omnibus:,65462.802,Durbin-Watson:,1.846
Prob(Omnibus):,0.0,Jarque-Bera (JB):,774713510.178
Skew:,24.064,Prob(JB):,0.0
Kurtosis:,821.956,Cond. No.,15900.0


In [None]:
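# Taxi distance seems to be a slightly better indicator than linear distance: park_dist2 is significant at the 5% level (p = 0.015) while park_dist is not (p = 0.097), although R-squared is essentially unchanged (0.070 vs 0.069).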