In [1]:
# Libraries set-up

%load_ext lab_black
import os
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
# Load the raw 2021 Airbnb listings from the CSV stored next to the notebook folder

data_2021 = pd.read_csv(filepath_or_buffer="../data/newnh_airbnb_2021.csv")

In [3]:
# Restrict the raw listings to the columns used by the analyses below.
# dropna() then removes every row that has a missing value in any of them.

data = data_2021.loc[
    :,
    [
        "price",
        "neighbourhood",
        "neighbourhood_group",
        "room_type",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
    ],
].dropna()

In [4]:
# Dependent variable shared by every regression in this notebook: the listing price

Y = data.loc[:, "price"]

In [5]:
# Set up data for simple linear regression of price on neighborhood group, Bronx used as baseline

# One 0/1 indicator column per neighbourhood group. dtype=float makes the
# indicators numeric up front (the previous `datang.astype(float)` call threw
# its result away, so the cast never took effect). Dropping the Bronx column
# makes Bronx the reference category absorbed by the intercept.
datang = pd.get_dummies(
    data[["neighbourhood_group"]],
    columns=["neighbourhood_group"],
    dtype=float,
).drop(columns=["neighbourhood_group_Bronx"])

# Regressors without and with the intercept column. Xng and datang refer to
# the same price-free frame, as before.
Xng = datang
XngC = sm.add_constant(Xng)

In [6]:
# OLS of price on the neighbourhood-group dummies plus intercept.
# The summary table is the cell's displayed output.

estng = sm.OLS(endog=Y, exog=XngC)
est_ng = estng.fit()
est_ng.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,110.6
Date:,"Thu, 12 May 2022",Prob (F-statistic):,1.16e-93
Time:,06:58:12,Log-Likelihood:,-193480.0
No. Observations:,27627,AIC:,387000.0
Df Residuals:,27622,BIC:,387000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,94.8289,9.515,9.966,0.000,76.178,113.480
neighbourhood_group_Brooklyn,43.7167,9.841,4.442,0.000,24.427,63.006
neighbourhood_group_Manhattan,99.6544,9.832,10.135,0.000,80.382,118.926
neighbourhood_group_Queens,17.6429,10.458,1.687,0.092,-2.855,38.141
neighbourhood_group_Staten Island,8.3145,18.740,0.444,0.657,-28.417,45.046

0,1,2,3
Omnibus:,63756.943,Durbin-Watson:,1.847
Prob(Omnibus):,0.0,Jarque-Bera (JB):,641016693.598
Skew:,22.579,Prob(JB):,0.0
Kurtosis:,747.864,Cond. No.,17.0


In [None]:
# Exporting image of the above results

# Size and resolution are set on this figure only, instead of through
# plt.rc, so the global matplotlib defaults used by any other plot in the
# notebook are left untouched.
fig, ax = plt.subplots(figsize=(16 / 1.5, 10 / 1.5), dpi=900)
# Render the plain-text summary table in a monospace font so columns line up.
ax.text(0.01, 0.05, str(est_ng.summary()), {"fontsize": 9}, fontproperties="monospace")
ax.axis("off")
fig.tight_layout()
os.makedirs("../artifacts", exist_ok=True)
fig.savefig("../artifacts/reg1.jpeg")
# Close the figure so it is not kept in memory or drawn on by later cells.
plt.close(fig)

In [7]:
# Set up data for simple linear regression of price on neighborhood, Midtown used as baseline

# One 0/1 indicator column per neighbourhood. dtype=float makes the
# indicators numeric up front (the previous `datan.astype(float)` call threw
# its result away, so the cast never took effect). Dropping the Midtown column
# makes Midtown the reference category absorbed by the intercept.
datan = pd.get_dummies(
    data[["neighbourhood"]],
    columns=["neighbourhood"],
    dtype=float,
).drop(columns=["neighbourhood_Midtown"])

# Regressors without and with the intercept column. Xn and datan refer to
# the same price-free frame, as before.
Xn = datan
XnC = sm.add_constant(Xn)

In [8]:
# OLS of price on the neighbourhood dummies plus intercept.
# The summary table is the cell's displayed output.

estn = sm.OLS(endog=Y, exog=XnC)
est_n = estn.fit()
est_n.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,5.461
Date:,"Thu, 12 May 2022",Prob (F-statistic):,4.16e-131
Time:,06:58:24,Log-Likelihood:,-193110.0
No. Observations:,27627,AIC:,386700.0
Df Residuals:,27406,BIC:,388500.0
Df Model:,220,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,262.7500,8.676,30.285,0.000,245.745,279.755
neighbourhood_Allerton,-142.4167,51.490,-2.766,0.006,-243.339,-41.494
neighbourhood_Arden Heights,-78.0000,132.147,-0.590,0.555,-337.015,181.015
neighbourhood_Arrochar,-149.1618,64.548,-2.311,0.021,-275.679,-22.644
neighbourhood_Arverne,7.8633,31.664,0.248,0.804,-54.200,69.926
neighbourhood_Astoria,-147.4613,14.858,-9.924,0.000,-176.585,-118.338
neighbourhood_Bath Beach,-152.3750,66.499,-2.291,0.022,-282.717,-22.033
neighbourhood_Battery Park City,-65.1603,43.112,-1.511,0.131,-149.661,19.341
neighbourhood_Bay Ridge,-170.7385,29.575,-5.773,0.000,-228.708,-112.769

0,1,2,3
Omnibus:,64468.599,Durbin-Watson:,1.85
Prob(Omnibus):,0.0,Jarque-Bera (JB):,695918171.181
Skew:,23.188,Prob(JB):,0.0
Kurtosis:,779.147,Cond. No.,170.0


In [None]:
# Exporting image of the above results

# Size and resolution are set on this figure only, instead of through
# plt.rc, so the global matplotlib defaults used by any other plot in the
# notebook are left untouched.
fig, ax = plt.subplots(figsize=(16 / 1.5, 10 / 1.5), dpi=900)
# Render the plain-text summary table in a monospace font so columns line up.
ax.text(0.01, 0.05, str(est_n.summary()), {"fontsize": 9}, fontproperties="monospace")
ax.axis("off")
fig.tight_layout()
os.makedirs("../artifacts", exist_ok=True)
fig.savefig("../artifacts/reg2.jpeg")
# Close the figure so it is not kept in memory or drawn on by later cells.
plt.close(fig)