# Airbnb Price Regressions for New York City

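* Regress Airbnb listing prices on location (borough or neighbourhood), review scores, capacity, and distances to the nearest park and subway station.
* Compare the specifications by fit and report the in-sample RMSE of the best one.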

In [1]:
# Libraries set-up

%load_ext lab_black
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error
from readtable import getairbnbdata

----------------
## Data Loading and Manipulation

In [2]:
# Load the cleaned listings file, keeping only the columns used in the analyses
# below, drop rows with a missing value in any of them, and rename the location
# columns ("neighbourhood" -> "borough", "new_neighbourhood" -> "neighbourhood").
# Rows with unexpected borough values are filtered out in a later cell.
analysis_columns = [
    "price",
    "new_neighbourhood",
    "neighbourhood",
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "station_dist",
    "station_dist2",
    "park_dist",
    "park_dist2",
    "beds",
    "accommodates",
]
data = (
    # usecols avoids parsing columns we never use; the explicit selection
    # afterwards keeps the column order defined above.
    pd.read_csv("../data/cleaned_data_updated.csv", usecols=analysis_columns)[
        analysis_columns
    ]
    .dropna()
    # drop=True: renumber the rows without keeping the old row labels as a
    # stray "index" column.
    .reset_index(drop=True)
    .rename(columns={"neighbourhood": "borough", "new_neighbourhood": "neighbourhood"})
)

In [3]:
# Tidy the borough column: strip the ", New York, United States" suffix, then
# keep only the rows whose borough is one of the five boroughs listed below.
data["borough"] = data["borough"].str.replace(
    ", New York, United States", "", regex=False  # literal text, not a pattern
)
data = data[
    data["borough"].isin(["Brooklyn", "Manhattan", "Queens", "Bronx", "Staten Island"])
    # drop=True keeps this cell re-runnable: a bare reset_index() inserts the old
    # labels as a new column each time and eventually fails on a name clash.
].reset_index(drop=True)

----------------
## Regression Analysis

In [4]:
# Dependent variable shared by every regression below: the listing price

Y = data.loc[:, "price"]

In [5]:
# Define function that outputs regressor matrix depending on what we want to do using subset of data taking as an input a list of regressors

# Module-level placeholder; the parameter of Xcreator below shadows it.
regressors = []


def Xcreator(regressors):
    """Build the regressor matrix for the given list of column names.

    The columns are taken from the module-level ``data`` frame. If "borough"
    and/or "neighbourhood" are among them, each is replaced by dummy columns
    with one level removed ("borough_Bronx" / "neighbourhood_Allerton"), which
    acts as the omitted reference category. A constant column is then added
    with ``sm.add_constant``.

    Parameters
    ----------
    regressors : list of str
        Names of columns of ``data`` to use as regressors.

    Returns
    -------
    pandas.DataFrame
        The selected (and dummy-encoded) columns plus the constant column.

    Raises
    ------
    KeyError
        If a requested column is missing from ``data`` or the reference dummy
        column to drop does not exist.
    """
    datax = data[regressors]
    # (categorical column, dummy column dropped as the reference level)
    categorical_references = (
        ("borough", "borough_Bronx"),
        ("neighbourhood", "neighbourhood_Allerton"),
    )
    for column, reference_dummy in categorical_references:
        if column in datax.columns:
            # dtype=float: newer pandas versions return boolean dummies by
            # default, which can turn the design matrix into object dtype once
            # it is mixed with numeric columns and handed to statsmodels.
            datax = pd.get_dummies(datax, columns=[column], dtype=float)
            datax = datax.drop(columns=reference_dummy)
    return sm.add_constant(datax)

In [6]:
# Define code that outputs regression


def regress(y, x):
    """Fit ``sm.OLS(y, x)`` and print the summary of the fitted model."""
    fitted = sm.OLS(y, x).fit()
    print(fitted.summary())

First, we run the regression on boroughs only.

In [7]:
# Specification 1: borough dummies only
X1 = Xcreator(regressors=["borough"])
regress(y=Y, x=X1)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     97.55
Date:                Fri, 13 May 2022   Prob (F-statistic):           6.15e-82
Time:                        00:21:59   Log-Likelihood:                -82737.
No. Observations:               13058   AIC:                         1.655e+05
Df Residuals:                   13053   BIC:                         1.655e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    98.52

  x = pd.concat(x[::order], 1)


Next, we run the regression on neighbourhoods.

In [8]:
# Specification 2: neighbourhood dummies only
X2 = Xcreator(regressors=["neighbourhood"])
regress(y=Y, x=X2)

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.136
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     11.46
Date:                Fri, 13 May 2022   Prob (F-statistic):          3.00e-285
Time:                        00:21:59   Log-Likelihood:                -81974.
No. Observations:               13058   AIC:                         1.643e+05
Df Residuals:                   12880   BIC:                         1.656e+05
Df Model:                         177                                         
Covariance Type:            nonrobust                                         
                                                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

Since neighbourhoods explain more of the price variation than boroughs (R² of 0.136 vs. 0.029; adjusted R² of 0.124 vs. 0.029), we keep neighbourhoods as a feature in the next regression, along with the number of reviews, all review scores, and other controls (beds and accommodates).

In [9]:
# Specification 3: neighbourhood dummies, number of reviews, all review
# scores, and the capacity controls
X3 = Xcreator(
    regressors=["neighbourhood", "number_of_reviews"]
    + [
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
    ]
    + ["beds", "accommodates"]
)
regress(y=Y, x=X3)

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.470
Model:                            OLS   Adj. R-squared:                  0.462
Method:                 Least Squares   F-statistic:                     60.99
Date:                Fri, 13 May 2022   Prob (F-statistic):               0.00
Time:                        00:21:59   Log-Likelihood:                -78786.
No. Observations:               13058   AIC:                         1.579e+05
Df Residuals:                   12870   BIC:                         1.594e+05
Df Model:                         187                                         
Covariance Type:            nonrobust                                         
                                                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

Finally, we add to the former regression two regressors: distance from nearest park and nearest subway station.

We do this first using linear distance and then using taxi distance, and keep whichever specification fits better.

In [10]:
# Linear Distance

# Specification 4: specification 3 plus the "station_dist" and "park_dist"
# distance columns
X4 = Xcreator(
    regressors=["neighbourhood", "number_of_reviews"]
    + [
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
    ]
    + ["beds", "accommodates"]
    + ["station_dist", "park_dist"]
)
regress(y=Y, x=X4)

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.471
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     60.55
Date:                Fri, 13 May 2022   Prob (F-statistic):               0.00
Time:                        00:22:00   Log-Likelihood:                -78775.
No. Observations:               13058   AIC:                         1.579e+05
Df Residuals:                   12868   BIC:                         1.594e+05
Df Model:                         189                                         
Covariance Type:            nonrobust                                         
                                                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [11]:
# Taxi Distance

# Specification 5: specification 3 plus the "station_dist2" and "park_dist2"
# distance columns
X5 = Xcreator(
    regressors=["neighbourhood", "number_of_reviews"]
    + [
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
    ]
    + ["beds", "accommodates"]
    + ["station_dist2", "park_dist2"]
)
regress(y=Y, x=X5)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.470
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     60.49
Date:                Fri, 13 May 2022   Prob (F-statistic):               0.00
Time:                        00:22:00   Log-Likelihood:                -78778.
No. Observations:               13058   AIC:                         1.579e+05
Df Residuals:                   12868   BIC:                         1.594e+05
Df Model:                         189                                         
Covariance Type:            nonrobust                                         
                                                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

  x = pd.concat(x[::order], 1)


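Linear distance seems to be a marginally better indicator than taxi distance (R² of 0.471 vs. 0.470; log-likelihood of -78775 vs. -78778), so we keep the linear-distance specification.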

Next, we find the in-sample RMSE of the best regression (price on neighbourhoods and controls, including linear distances).

In [12]:
# In-sample RMSE of the preferred specification (X4), refit with scikit-learn.
# The model is evaluated on the same rows it was fitted on, so this is a
# training error, not an out-of-sample estimate.
# NOTE: X4 already carries the constant column added in Xcreator, while
# LinearRegression also fits its own intercept by default; the extra column is
# redundant but left in place here.
model = LinearRegression()
model.fit(X4, Y)
X_predict = X4
Y_predict = model.predict(X_predict)
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(Y, Y_predict))
rmse

100.8644758561287