# Multiple Linear Regression for Airbnb Data

In the second part, we will illustrate the use of multiple linear regression for finding the effect of important factors affecting the sale of Airbnb hosting.
The data for the  is taken generously from Kaggle's [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data).
For the second part, we will focus on the task of predicting the popularity of airbnb listing, which is indicated by the number of reviews from the dataset:

<img src="airbnb.png" height="300" width="300">


The factors we will be taking into considerations for the second part are: neighbourhood_group, neighbourhood, latitude, longitude, room_type, price, minimum_nights, availability_365.

In [209]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_kernels
import scipy
from sklearn import svm, linear_model
from sklearn.model_selection import GridSearchCV
import time

In [210]:
df = pd.read_csv('AB_NYC_2019.csv',
                na_values='?',header=None)
df.columns = df.iloc[0]
df = df[1:]
df = df.dropna()
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
1,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
4,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
5,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
6,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [211]:
xnames = ['neighbourhood_group','latitude','longitude','room_type','price',
         'minimum_nights','availability_365']
X = np.array(df[xnames])
ID = np.array(df['id'])
ID = ID.astype(int)
y = np.array(df['number_of_reviews'])
y = y.astype(int)
# Convert neighborhood to Brooklyn-0, Manhattan-1, Others-2
neighbourhood_group = X[:,0]
for i in range(len(neighbourhood_group)):
    if neighbourhood_group[i] == "Brooklyn":
        neighbourhood_group[i] = 0
    elif neighbourhood_group[i] == "Manhattan":
        neighbourhood_group[i] = 1
    else:
        neighbourhood_group[i] = 2

# Convert room_type: Private room-0, Entire home/apt-1
room_type = X[:,3]
PRI_VAL = "Private room"
ENT_VAL = "Entire home/apt"
for i in range(len(room_type)):
    if room_type[i] == PRI_VAL:
        room_type[i] = 0
    elif room_type[i] == ENT_VAL:
        room_type[i] = 1
    else:
        room_type[i] = 2
X[:,1] = X[:,1].astype(float)
X[:,2] = X[:,2].astype(float)
X[:,4] = X[:,4].astype(int)
X[:,5] = X[:,5].astype(int)
X[:,6] = X[:,6].astype(int)
print("Converted Dataset X:\n",X)
print("\nwith dimension:\n", X.shape)

Converted Dataset X:
 [[0 40.64749 -73.97237 ... 149 1 365]
 [1 40.75362 -73.98377 ... 225 1 355]
 [0 40.68514 -73.95976 ... 89 1 194]
 ...
 [2 40.54179 -74.14275 ... 235 1 87]
 [2 40.80787 -73.92399999999999 ... 100 1 40]
 [0 40.69805 -73.92801 ... 30 1 1]]

with dimension:
 (38821, 7)


# Split data into a training and test set with a ratio of 2/3 - 1/3

In [212]:
nt = len(X)
ind_lst = np.random.permutation(nt)

ntr = len(ind_lst) * 2//3
ntr_lst = ind_lst[:ntr]
nts = len(ind_lst) - ntr
nts_lst = ind_lst[ntr:]

IDtr = ID[ntr_lst]
IDts = ID[nts_lst]

Xtr = np.matrix(X[[ntr_lst]], dtype='int')
Xts = np.matrix(X[[nts_lst]], dtype='int')
ytr = y[ntr_lst]
yts = y[nts_lst]
print("X training set:\n",Xtr)
print("\nX testing set:\n",Xts)
print("\ny training set:\n",ytr)
print("\ny testing set:\n",yts)

X training set:
 [[  0  40 -73 ...  70   3 364]
 [  0  40 -73 ... 100   3  89]
 [  1  40 -73 ...  71   6   0]
 ...
 [  1  40 -74 ... 130  30   0]
 [  1  40 -73 ... 159   4  31]
 [  1  40 -73 ... 100   3  13]]

X testing set:
 [[  2  40 -73 ...  80   3  36]
 [  1  40 -73 ... 165   2   0]
 [  1  40 -73 ... 300   3 182]
 ...
 [  0  40 -73 ... 105   3   0]
 [  2  40 -73 ...  40   7  10]
 [  0  40 -73 ... 150   4 114]]

y training set:
 [30 17  2 ...  1 12  8]

y testing set:
 [20 11 30 ... 11 53 69]


# Fit a Linear Model

In [213]:
def fit_mult_linear(X,y):
    """
    Given matrix of predictors X and target vector y fit for a multiple linear regression model under the squared loss.
    """
    # TODO complete the following code
    nsamp = len(X)
    ones = np.ones((nsamp,1))
    X_orig = X
    X = np.hstack((ones,X_orig))
    Xmat=np.matrix(X)
    ymat=np.matrix(y)
    ymat=np.transpose(ymat)
    Xmatt=np.transpose(Xmat)
    beta=np.linalg.inv(Xmatt*Xmat)*Xmatt*ymat
    return beta

In [217]:
beta_lst = fit_mult_linear(Xtr, ytr)
beta0 = beta_lst[0]
beta1 = beta_lst[1:]
print("beta_lst:\n",beta_lst)
print("beta0:\n",beta0)
print("beta1:\n",beta1)

beta_lst:
 [[-4.26966340e+03]
 [-5.51007054e-01]
 [ 1.20244442e+02]
 [ 6.32627134e+00]
 [-2.24643412e-01]
 [-1.37091273e-02]
 [-2.79052494e-01]
 [ 8.02060109e-02]]
beta0:
 [[-4269.66340085]]
beta1:
 [[-5.51007054e-01]
 [ 1.20244442e+02]
 [ 6.32627134e+00]
 [-2.24643412e-01]
 [-1.37091273e-02]
 [-2.79052494e-01]
 [ 8.02060109e-02]]


In [218]:
nsamp = len(Xtr)
ones = np.ones((nsamp,1))
X_orig = Xtr
X = np.hstack((ones,X_orig))
Xmat=np.matrix(X)
ymat=np.matrix(ytr)
ymat=np.transpose(ymat)

ytr_pred = Xmat*beta_lst
lossm = np.linalg.norm(np.array(ymat - ytr_pred))**2

print("multiple variable loss="+"{:.2e}".format(lossm))

multiple variable loss=1.32e+08


# Measure the Fit on the Testing Dataset

In [220]:
beta_ts = fit_mult_linear(Xts, yts)
beta0_ts = beta_ts[0]
beta1_ts = beta_ts[1:]
print("beta_test:\n",beta_ts)
print("beta0_test:\n",beta0_ts)
print("beta1_test:\n",beta1_ts)

beta_test:
 [[ 1.68447989e+17]
 [ 1.39068163e+03]
 [-2.61323996e+15]
 [-6.53763185e+03]
 [-1.82793993e+02]
 [-1.05358733e-02]
 [ 1.98532262e-01]
 [-6.77874689e-02]]
beta0_test:
 [[1.68447989e+17]]
beta1_test:
 [[ 1.39068163e+03]
 [-2.61323996e+15]
 [-6.53763185e+03]
 [-1.82793993e+02]
 [-1.05358733e-02]
 [ 1.98532262e-01]
 [-6.77874689e-02]]


In [222]:
nsamp_test = len(Xts)
ones_test = np.ones((nsamp_test,1))
X_orig_test = Xts
X_test = np.hstack((ones_test,X_orig_test))
Xmat_test=np.matrix(X_test)
ymat_test=np.matrix(yts)
ymat_test=np.transpose(ymat_test)

ytrain_pred_test = Xmat_test*beta_test
lossm_test = np.linalg.norm(np.array(ymat_test - ytrain_pred_test))**2
print("multiple variable loss="+"{:.2e}".format(lossm_test))

multiple variable loss=5.29e+37
