# Support Vector Regression

#### Load the packages and import the data

In [2]:
import pandas as pd
import numpy as np

# Location of the housing data set, relative to the notebook.
csv_path = "./Data Files/USA_Housing.csv"

# Read the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [3]:
# Display the column labels of the loaded DataFrame.
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [4]:
# Predictor columns used as model inputs.
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
# Column the model is trained to predict.
target_column = "Price"

X = data[feature_columns]
y = data[target_column]

#### Split the data into a train_set and test_set

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)

#### Center and Scale X and y

In [6]:
# Columns eligible for standardisation: only float64 / int64 predictors are rescaled.
numeric_columns = [
    col for col in X_train.columns
    if X_train[col].dtypes in ("float64", "int64")
]

# Mean and standard deviation of each numeric column, computed on the training
# rows only so that no information from the test set is used for scaling.
train_stats = [
    (col, X_train[col].mean(), X_train[col].std())
    for col in numeric_columns
]

# Start from copies so the unscaled frames stay available for later cells.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
for col, col_mean, col_std in train_stats:
    X_train_scaled[col] = (X_train[col] - col_mean) / col_std
    # The test set is transformed with the *training* statistics.
    X_test_scaled[col] = (X_test[col] - col_mean) / col_std

# Standardise the target the same way; these two statistics are reused later
# to convert scaled predictions back to the original units.
mean_y_train, std_y_train = y_train.mean(), y_train.std()
y_train_scaled = (y_train - mean_y_train) / std_y_train
y_test_scaled = (y_test - mean_y_train) / std_y_train

#### Fit the base SVR Model

In [7]:
from sklearn.svm import SVR
# Baseline model: an SVR with default hyperparameters, trained on the
# standardised predictors and target. `fit` returns the estimator itself.
svr_model = SVR().fit(X_train_scaled, y_train_scaled)
svr_model

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

#### Predict base SVR model on Test Set

In [13]:
# Predict in standardised units, then undo the target scaling to get prices.
y_pred_scaled = svr_model.predict(X_test_scaled)
y_pred = y_pred_scaled * std_y_train + mean_y_train

# Columns appended to the unscaled test predictors, in display order.
target_name = y.name
summary_columns = [
    (target_name, y_test),
    ("y_pred", y_pred),
    (target_name + "_scaled", y_test_scaled),  # dependent variable scaled
    ("y_pred_scaled", y_pred_scaled),
]

pred_summary = X_test.copy()
for column_label, column_values in summary_columns:
    pred_summary[column_label] = column_values
pred_summary.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,y_pred,Price_scaled,y_pred_scaled
3652,65966.017208,7.876933,5.524962,3.3,42710.821809,1342819.0,1440652.0,0.319604,0.59653
1862,60288.475915,6.170239,7.014315,3.28,34651.072317,1144938.0,1077881.0,-0.240517,-0.430326
2766,53664.077704,4.415997,5.938396,2.19,57110.648936,996243.4,908456.6,-0.66141,-0.909898
4120,70169.710552,6.227945,7.651813,3.36,35197.384961,1217022.0,1374008.0,-0.036477,0.407887
3447,73092.741315,5.61546,6.524657,2.21,43509.458399,1336172.0,1298685.0,0.30079,0.194678


#### Evaluate the Model

In [17]:
from sklearn import metrics
# Error metrics for the baseline model, computed on the unscaled test targets.
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
explained_variance = metrics.explained_variance_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("Explained Variance:", round(explained_variance, 2))

MSE: 10905562234.9
RMSE: 104429.699966
Explained Variance: 0.91


#### Use Cross Validation to tune the C and epsilon parameters
(This can take a long time; the current setup takes approximately 30 minutes.)

In [18]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the two SVR hyperparameters being tuned; every
# combination is evaluated. Edit these lists to refine the search.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    epsilon=[1, 0.1, 0.01, 0.001, 0.0001],
)

# Cross-validated search over the grid; verbose=2 logs each individual fit.
grid = GridSearchCV(estimator=SVR(), param_grid=param_grid, verbose=2)
grid.fit(X_train_scaled, y_train_scaled)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, epsilon=1 ................................................
[CV] ................................. C=0.1, epsilon=1, total=   0.0s
[CV] C=0.1, epsilon=1 ................................................
[CV] ................................. C=0.1, epsilon=1, total=   0.0s
[CV] C=0.1, epsilon=1 ................................................
[CV] ................................. C=0.1, epsilon=1, total=   0.0s
[CV] C=0.1, epsilon=0.1 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ............................... C=0.1, epsilon=0.1, total=   0.3s
[CV] C=0.1, epsilon=0.1 ..............................................
[CV] ............................... C=0.1, epsilon=0.1, total=   0.3s
[CV] C=0.1, epsilon=0.1 ..............................................
[CV] ............................... C=0.1, epsilon=0.1, total=   0.3s
[CV] C=0.1, epsilon=0.01 .............................................
[CV] .............................. C=0.1, epsilon=0.01, total=   0.4s
[CV] C=0.1, epsilon=0.01 .............................................
[CV] .............................. C=0.1, epsilon=0.01, total=   0.4s
[CV] C=0.1, epsilon=0.01 .............................................
[CV] .............................. C=0.1, epsilon=0.01, total=   0.4s
[CV] C=0.1, epsilon=0.001 ............................................
[CV] ............................. C=0.1, epsilon=0.001, total=   0.4s
[CV] C=0.1, epsilon=0.001 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 26.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'epsilon': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [19]:
# Show the parameter combination selected by the grid search.
grid.best_params_

{'C': 1, 'epsilon': 0.1}

#### Predict on test set using new parameters

In [20]:
# Predict with the grid search object in standardised units, then map the
# predictions back to the original price scale.
grid_pred_scaled = grid.predict(X_test_scaled)
grid_pred = grid_pred_scaled * std_y_train + mean_y_train

# Labels and values of the columns appended to the unscaled test predictors.
summary_labels = [y.name, "y_pred", y.name + "_scaled", "y_pred_scaled"]
summary_values = [y_test, grid_pred, y_test_scaled, grid_pred_scaled]

grid_summary = X_test.copy()
for label, values in zip(summary_labels, summary_values):
    grid_summary[label] = values
grid_summary.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,y_pred,Price_scaled,y_pred_scaled
3652,65966.017208,7.876933,5.524962,3.3,42710.821809,1342819.0,1440652.0,0.319604,0.59653
1862,60288.475915,6.170239,7.014315,3.28,34651.072317,1144938.0,1077881.0,-0.240517,-0.430326
2766,53664.077704,4.415997,5.938396,2.19,57110.648936,996243.4,908456.6,-0.66141,-0.909898
4120,70169.710552,6.227945,7.651813,3.36,35197.384961,1217022.0,1374008.0,-0.036477,0.407887
3447,73092.741315,5.61546,6.524657,2.21,43509.458399,1336172.0,1298685.0,0.30079,0.194678


#### Evaluate the model using new parameters

In [21]:
from sklearn import metrics
# Error metrics for the grid-searched model, computed on the unscaled test targets.
grid_mse = metrics.mean_squared_error(y_test, grid_pred)
grid_rmse = np.sqrt(grid_mse)  # RMSE is the square root of the MSE computed above
grid_explained_variance = metrics.explained_variance_score(y_test, grid_pred)

print("MSE:", grid_mse)
print("RMSE:", grid_rmse)
print("Explained Variance:", round(grid_explained_variance, 2))

MSE: 10905562234.9
RMSE: 104429.699966
Explained Variance: 0.91
