# Tutorial 2 - SVM Regression

We will predict the price (`price` column) of an AirBNB listing in Boston given a number of features about the listing.

**Therefore, our unit of analysis is an AIRBNB LISTING**

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

np.random.seed(42)


# Get the data

In [2]:
#We will predict the "price" value in the data set:

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_75-150


# Split the data into train and test

In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(airbnb, test_size=0.3)

### Be careful: we haven't seperated the target column yet

## Check the missing values

In [4]:
train_set.isna().sum()

host_is_superhost                       0
host_identity_verified                  0
neighbourhood_cleansed                  0
latitude                                0
longitude                               0
property_type                           8
room_type                               0
accommodates                            0
bathrooms                              19
bedrooms                               19
beds                                   16
bed_type                                0
Number of amenities                     0
guests_included                         0
price_per_extra_person                  0
minimum_nights                          0
number_of_reviews                       0
number_days_btw_first_last_review       0
review_scores_rating                 1609
cancellation_policy                     0
price                                   0
price_gte_150                           0
price_category                          0
dtype: int64

In [5]:
test_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          1
room_type                              0
accommodates                           0
bathrooms                             17
bedrooms                              11
beds                                   8
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 674
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

# Data Prep

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Drop the variables we can't use in this tutorial

In [7]:
# We can't use the following columns in this tutorial, because they are for classification tasks

train = train_set.drop(['price_gte_150', 'price_category'], axis=1)
test = test_set.drop(['price_gte_150', 'price_category'], axis=1)

## Separate the target variable (we don't want to transform it)

In [8]:
train_y = train[['price']]
test_y = test[['price']]

train_inputs = train.drop(['price'], axis=1)
test_inputs = test.drop(['price'], axis=1)

##  Identify the numerical and categorical columns

In [9]:
train_inputs.dtypes

host_is_superhost                      int64
host_identity_verified                 int64
neighbourhood_cleansed                object
latitude                             float64
longitude                            float64
property_type                         object
room_type                             object
accommodates                           int64
bathrooms                            float64
bedrooms                             float64
beds                                 float64
bed_type                              object
Number of amenities                    int64
guests_included                        int64
price_per_extra_person                 int64
minimum_nights                         int64
number_of_reviews                      int64
number_days_btw_first_last_review      int64
review_scores_rating                 float64
cancellation_policy                   object
dtype: object

**At this stage, you can manually identify numeric, binary, and categorical columns as follows:**

`numeric_columns = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'Number of amenities', 'guests_included', 'price_per_extra_person', 'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review', 'review_scores_rating']`
 
 `binary_columns = ['host_is_superhost', 'host_identity_verified']`
 
 `categorical_columns = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']`
 
<br>
 
**If you do not want to manually type these, you can do the below tricks:**

In [10]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [11]:
# Identify the binary columns so we can pass them through without transforming
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [12]:
# Be careful: numerical columns already includes the binary columns,
# So, we need to remove the binary columns from numerical columns.

for col in binary_columns:
    numeric_columns.remove(col)

In [13]:
binary_columns

['host_is_superhost', 'host_identity_verified']

In [14]:
numeric_columns

['latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'Number of amenities',
 'guests_included',
 'price_per_extra_person',
 'minimum_nights',
 'number_of_reviews',
 'number_days_btw_first_last_review',
 'review_scores_rating']

In [15]:
categorical_columns

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy']

# Pipeline

In [16]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())])

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [19]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for TRAIN

In [20]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-2.10940159, -1.39824237,  1.20477863, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.61906783, -1.38593382, -1.16133947, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.14448465, -0.16705969, -1.16133947, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.82039585,  0.74441303, -0.56980994, ...,  0.        ,
         0.        ,  0.        ],
       [-0.92762441,  0.3821493 , -0.56980994, ...,  0.        ,
         0.        ,  1.        ],
       [-0.34071414, -0.53929512, -1.16133947, ...,  0.        ,
         0.        ,  0.        ]])

In [21]:
train_x.shape

(7190, 61)

# Tranform: transform() for TEST

In [22]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 0.63069768,  0.40533687,  1.79630816, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.15153485,  0.27611111, -0.56980994, ...,  0.        ,
         0.        ,  1.        ],
       [-2.02789334, -0.91924215,  0.02171958, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.15906806, -0.38872897,  1.20477863, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.11838687, -0.56878308, -0.56980994, ...,  0.        ,
         0.        ,  0.        ],
       [-0.94171792,  0.19283558, -1.16133947, ...,  0.        ,
         0.        ,  1.        ]])

In [23]:
test_x.shape

(3082, 61)

# Calculate the baseline

In [24]:
from sklearn.metrics import mean_squared_error

In [25]:
#First find the average value of the target

mean_value = np.mean(train_y['price'])

mean_value

158.68289290681503

In [26]:
# Predict all values as the mean

baseline_pred = np.repeat(mean_value, len(test_y))

baseline_pred

array([158.68289291, 158.68289291, 158.68289291, ..., 158.68289291,
       158.68289291, 158.68289291])

In [27]:
baseline_mse = mean_squared_error(test_y, baseline_pred)

baseline_rmse = np.sqrt(baseline_mse)

print('Baseline RMSE: {}' .format(baseline_rmse))

Baseline RMSE: 91.62041893023937


In [28]:
train_y['price']

3437    150
6622     45
2262    100
2246    142
835     229
       ... 
5734    249
5191    100
5390    275
860     100
7270     59
Name: price, Length: 7190, dtype: int64

# LinearSVR

This is the support vector regressor. Preferred over `SVR(kernel='linear')`

In [29]:
from sklearn.svm import LinearSVR 

svm_reg = LinearSVR(C=100, epsilon=0.5, max_iter=10000) 

svm_reg.fit(train_x, train_y)

  return f(*args, **kwargs)


LinearSVR(C=100, epsilon=0.5, max_iter=10000)

In [30]:
#Train RMSE
svm_train_pred = svm_reg.predict(train_x)

train_mse = mean_squared_error(train_y, svm_train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 53.126125942978966


In [31]:
#Test RMSE
svm_test_pred = svm_reg.predict(test_x)

test_mse = mean_squared_error(test_y, svm_test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 54.14912301578952


# SVR(kernel='poly')

In [32]:
from sklearn.svm import SVR 

svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.5) 

svm_poly_reg.fit(train_x, train_y)

  return f(*args, **kwargs)


SVR(C=100, degree=2, epsilon=0.5, kernel='poly')

In [33]:
#Train RMSE
svm_train_pred = svm_poly_reg.predict(train_x)

train_mse = mean_squared_error(train_y, svm_train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 45.99689122102427


In [34]:
#Test RMSE
svm_test_pred = svm_poly_reg.predict(test_x)

test_mse = mean_squared_error(test_y, svm_test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 49.2768921385528


# SVR(kernel='rbf')

In [35]:
#Takes too long to converge

svm_rbf_reg = SVR(kernel="rbf", C=100, epsilon=0.01, gamma='scale') 

svm_rbf_reg.fit(train_x, train_y)

  return f(*args, **kwargs)


SVR(C=100, epsilon=0.01)

In [36]:
#Train RMSE
svm_train_pred = svm_rbf_reg.predict(train_x)

train_mse = mean_squared_error(train_y, svm_train_pred)

train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 37.784450316096745


In [37]:
#Test RMSE
svm_test_pred = svm_rbf_reg.predict(test_x)

test_mse = mean_squared_error(test_y, svm_test_pred)

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 42.60969329891748


# Cross Validation

We perform k-fold cross-validation.<br>
See this link for more info: https://scikit-learn.org/stable/modules/cross_validation.html

In [38]:
from sklearn.model_selection import cross_validate

In [39]:
scores = cross_validate(svm_rbf_reg, train_x, 
                        train_y, cv=5, scoring="neg_mean_squared_error",
                        return_estimator=True)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [40]:
scores

{'fit_time': array([2.71915245, 2.58870721, 2.71825051, 2.78045344, 2.84113479]),
 'score_time': array([0.76036644, 0.78479314, 0.8685894 , 0.83959818, 0.81923056]),
 'estimator': [SVR(C=100, epsilon=0.01),
  SVR(C=100, epsilon=0.01),
  SVR(C=100, epsilon=0.01),
  SVR(C=100, epsilon=0.01),
  SVR(C=100, epsilon=0.01)],
 'test_score': array([-1711.31001406, -2112.61081466, -1711.47735998, -2097.54914461,
        -1984.72842291])}

In [41]:
# test scores are negative and squared

scores['test_score']

array([-1711.31001406, -2112.61081466, -1711.47735998, -2097.54914461,
       -1984.72842291])

In [42]:
rmse_scores = np.sqrt(-scores['test_score'])

rmse_scores

array([41.36798296, 45.96314627, 41.37000556, 45.79900812, 44.55029094])

In [43]:
rmse_scores.mean(), rmse_scores.std()

(43.81008676854567, 2.052181418734238)

In [44]:
# How can you make predictions: Use each estimator...

test_predictions = pd.DataFrame()

test_predictions['estimator_0'] = scores['estimator'][0].predict(test_x)
test_predictions['estimator_1'] = scores['estimator'][1].predict(test_x)
test_predictions['estimator_2'] = scores['estimator'][2].predict(test_x)
test_predictions['estimator_3'] = scores['estimator'][3].predict(test_x)
test_predictions['estimator_4'] = scores['estimator'][4].predict(test_x)

In [45]:
test_predictions

Unnamed: 0,estimator_0,estimator_1,estimator_2,estimator_3,estimator_4
0,385.733016,381.779184,364.449493,366.882267,362.409963
1,149.989773,149.990595,147.849009,150.010249,149.989852
2,70.010419,69.989765,81.966305,70.009690,70.010234
3,92.423080,85.927236,92.989761,91.568603,88.612357
4,91.319841,77.836474,88.014999,81.631724,84.188518
...,...,...,...,...,...
3077,51.989873,52.010187,51.190998,52.009796,52.010076
3078,150.009991,150.009814,150.009978,155.310265,150.010117
3079,359.445423,368.008988,371.600687,359.796627,371.376166
3080,74.989866,74.989911,67.969237,72.725719,61.091385


In [46]:
# Take the average of all predictions:

test_predictions['avg_prediction'] = np.mean(test_predictions, axis=1)

test_predictions

Unnamed: 0,estimator_0,estimator_1,estimator_2,estimator_3,estimator_4,avg_prediction
0,385.733016,381.779184,364.449493,366.882267,362.409963,372.250785
1,149.989773,149.990595,147.849009,150.010249,149.989852,149.565896
2,70.010419,69.989765,81.966305,70.009690,70.010234,72.397283
3,92.423080,85.927236,92.989761,91.568603,88.612357,90.304208
4,91.319841,77.836474,88.014999,81.631724,84.188518,84.598311
...,...,...,...,...,...,...
3077,51.989873,52.010187,51.190998,52.009796,52.010076,51.842186
3078,150.009991,150.009814,150.009978,155.310265,150.010117,151.070033
3079,359.445423,368.008988,371.600687,359.796627,371.376166,366.045578
3080,74.989866,74.989911,67.969237,72.725719,61.091385,70.353224


In [47]:
# Test RMSE

test_mse = mean_squared_error(test_y, test_predictions['avg_prediction'])

test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 43.313333352088144
