# **SVR MODEL FOR HOUSE PRICES**

In [1]:
from math import sqrt

import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

%matplotlib inline 
import matplotlib.pyplot as plt

In [2]:
# Load the engineered train/test feature tables and the training targets.
train = pd.read_csv('train_engineered.csv')
test = pd.read_csv('test_engineered.csv')
outcomes = pd.read_csv('outcomes.csv')

# Target vector as a plain numpy array.
y = outcomes['SalePrice'].to_numpy()

# Keep the row identifiers aside and drop them from the feature tables
# (pop returns the column and removes it from the frame).
train_id = train.pop('Id')
test_id = test.pop('Id')

# Columns fed to the model. The order defines the column order of X and
# X_forecast, so it must stay identical for both.
features_selected = [
    'AllSF', 'OverallQual', 'AllFlrsSF', '1stFlr_2ndFlr_Sf',
    'GrLivArea', 'All_Liv_SF', 'ExterQual', 'TotalBath',
    'KitchenQual', 'GarageCars', 'OverallGrade', '1stFlrSF',
    'ExterGrade', 'YearBuilt', 'FullBath', 'YearRemodAdd',
    'TotRmsAbvGrd', 'FireplaceScore', 'FireplaceQu', 'Foundation_PConc',
    'BsmtQual', 'GarageArea', 'Fireplaces', 'GarageScore',
    'HeatingQC', 'OpenPorchSF', 'TotalBsmtSF', 'KitchenScore',
    'MasVnrArea', 'GarageFinish_Fin', 'GarageType_Attchd', 'LotArea',
    'HasMasVnr', 'LotFrontage', 'GarageGrade', 'GarageQual',
    'GarageCond', 'Neighborhood_NridgHt', 'CentralAir_Y', 'WoodDeckSF',
    'Exterior2nd_VinylSd', 'Exterior1st_VinylSd', 'BsmtExposure', 'SaleType_New',
    'GarageYrBlt', 'BoughtOffPlan', 'SaleCondition_Partial', 'HalfBath',
    'MasVnrType_Stone', 'BsmtFinType1', 'RecentRemodel', 'lat',
    'IsElectricalSBrkr', 'Electrical_SBrkr', 'PavedDrive', 'HasWoodDeck',
    'GarageType_No', 'GarageFinish_No', 'Foundation_CBlock', 'MSZoning_RM',
    'CentralAir_N', 'MasVnrType_None', 'GarageType_Detchd', 'IsGarageDetached',
    'GarageFinish_Unf', 'HasOpenPorch',
]

# Feature matrices: X for fitting/validation, X_forecast for the final predictions.
X = train[features_selected].to_numpy()
X_forecast = test[features_selected].to_numpy()
print(X.shape, y.shape, X_forecast.shape)

FileNotFoundError: [Errno 2] File train_engineered.csv does not exist: 'train_engineered.csv'

In [None]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=4,
)

# Report the shapes of both partitions (X_test / y_test are the validation set).
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [None]:
# Tune an RBF-kernel SVR by cross-validated grid search.
#
# The target is standardised with its own scaler so predictions can later be
# mapped back with y_scaler.inverse_transform. The scaled target is stored
# under a NEW name (y_train_scaled) instead of overwriting y_train: this keeps
# the cell idempotent, so re-running it never re-fits y_scaler on data that
# was already scaled.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()

# Features are standardised inside the pipeline, so the scaler is re-fitted on
# the training part of every cross-validation fold.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('estimator', SVR())])

# Candidate hyper-parameters, addressed through the pipeline step name.
param_grid = dict(
    estimator__kernel=['rbf'],
    estimator__C=[1.0, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2.0, 2.2, 2.4],
    estimator__epsilon=[0.045, 0.055, 0.06, 0.065, 0.07, 0.075, 0.08, 0.09, 0.1],
)

# n_jobs=-1: evaluate the grid on all available cores.
search = GridSearchCV(pipe, param_grid, n_jobs=-1)
search.fit(X_train, y_train_scaled)
print(search.best_params_)
# best model recorded from an earlier run:
# {'estimator__C': 1.4, 'estimator__epsilon': 0.06, 'estimator__kernel': 'rbf'}

In [None]:
# Predict the validation set and map the result back to the original scale.
#
# search.predict returns a 1-D array, while y_scaler was fitted on an (n, 1)
# column; reshape to a column before inverse_transform (the documented input
# shape is (n_samples, n_features)) and flatten again afterwards.
yhat_scaled = search.predict(X_test)
yhat = y_scaler.inverse_transform(yhat_scaled.reshape(-1, 1)).ravel()

# Undo the log1p transform of the target.
# NOTE(review): assumes SalePrice in outcomes.csv is log1p-transformed - confirm.
yhat = np.expm1(yhat)

# WARNING: this overwrites y_test, so the cell is not safe to run twice in a
# row (expm1 would be applied again). Re-run the train/validation split cell
# before re-running this one.
y_test = np.expm1(y_test)
yhat[0:5]

In [None]:
# Root-mean-squared error of the validation predictions (yhat) against the
# held-out outcomes (y_test).
rms = sqrt(mean_squared_error(y_test, yhat))
print(rms)

# Optional visual check (disabled): first 20 predictions versus outcomes.
#x = np.arange(20)
#plt.plot(x, yhat[0:20], marker = 's', linestyle = 'None')
#plt.plot(x, y_test[0:20], marker = 'o', linestyle = 'None')
#plt.show()

In [None]:
# Predict the forecast (unlabelled test) set and map it back to the original scale.
#
# search.predict returns a 1-D array, while y_scaler was fitted on an (n, 1)
# column; reshape to a column before inverse_transform (the documented input
# shape is (n_samples, n_features)) and flatten again afterwards.
prediction_scaled = search.predict(X_forecast)
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1)).ravel()

# Undo the log1p transform of the target.
# NOTE(review): assumes SalePrice in outcomes.csv is log1p-transformed - confirm.
prediction = np.expm1(prediction)
prediction[0:5]

In [None]:
# check difference with previous submission
#previous = pd.read_csv('submission.csv')
#previous = np.asarray(previous['SalePrice'])
#plt.figure(figsize=(10,10))
#idx = np.where(prediction > 300000)
#plt.scatter(prediction[idx], previous[idx])
#plt.show()

In [None]:
#pd.DataFrame(prediction).to_csv('yhat.csv')