# Support Vector Regression Model

In [21]:
import sys, os
# Put the sibling ``src`` directory (next to the current working directory's parent) on the
# import path so the project modules below can be resolved. This depends on the notebook
# being launched from its own directory.
src_abspath = os.path.abspath(os.path.join(os.path.split(os.getcwd())[0], 'src'))
sys.path.append(src_abspath)
# NOTE(review): these wildcard imports presumably supply every otherwise-undefined name used
# later in the notebook (np, feat_iculos, data_partition, scale, SVR, grid_search, RMSE, MAE,
# r2) -- confirm, and prefer explicit imports so the origin of each name is visible.
from db import *
from clf import *

import warnings
# Suppresses all warnings for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [5]:
# Reproducibility switch: use a fixed seed while debugging so runs are repeatable;
# otherwise pass None, which reseeds NumPy's global generator from fresh entropy.
am_debugging = True
np.random.seed(2 if am_debugging else None)

## Data/Features
#### Composite Dataset

In [6]:
# Number of leading rows of the composite feature set used in this notebook.
# Named so the sample size is visible and tunable in one place.
N_SAMPLES = 500
data = feat_iculos().iloc[:N_SAMPLES]

#### Partition: Training & Testing Sets

In [7]:
# Split the sampled data into training/testing features and targets.
X_train, X_test, y_train, y_test = data_partition(data)

# Cell output: the shape of each partition, in the same order as above.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((350, 16), (150, 16), (350,), (150,))

#### Normalization

In [8]:
# Normalize the feature matrices before fitting the SVR.
# NOTE(review): presumably `scale` fits its statistics on X_train only and applies them to
# X_test (avoiding test-set leakage) -- confirm against the helper's implementation.
scaled_X_train, scaled_X_test = scale(X_train, X_test)

#### Feature Selection

In [9]:
# Placeholder: no feature selection is performed; all scaled features are used as-is.
pass

## Model: Supervised Learning

In [11]:
def gridsearch(X_train, X_test, y_train, param_grid=None):
    """
    Tune a support vector regressor (SVR) with an exhaustive grid search and predict the Testing set.

    Every parameter combination in the grid is scored by ``GridSearchCV``. The best-scoring
    estimator is then used to predict the target values of the Testing set. The function
    returns the "best" estimator instance, its predictions, the best parameters, and the
    best grid score.

    :param X_train: Training set features
    :param X_test: Testing set features
    :param y_train: Training set target values
    :param param_grid: optional parameter grid (a dict, or a list of dicts) mapping SVR parameter
        names to the values to be searched; when omitted, a linear + RBF kernel grid is used
    :return: tuple of (best estimator instance, predictions for X_test, dict of best parameters, grid score)
    """
    if param_grid is None:
        C_values = [0.01, 0.1, 1, 10, 100, 1000]
        gamma_values = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        param_grid = [
            # The linear kernel does not use gamma, so sweeping it would only refit
            # identical models (36 candidates instead of 6 distinct ones).
            {'C': C_values, 'kernel': ['linear']},
            {'C': C_values, 'gamma': gamma_values, 'kernel': ['rbf']},
        ]

    # Grid Search - Hyperparameters Optimization (n_jobs=-1: use all available cores)
    search = grid_search.GridSearchCV(SVR(), param_grid, n_jobs=-1)
    search.fit(X_train, y_train)

    best_est = search.best_estimator_  # estimator configured with the best parameters
    clf_pred = best_est.predict(X_test)

    best_params = search.best_params_  # best parameters identified by grid search
    score = search.best_score_  # best grid score
    return (best_est, clf_pred, best_params, score)

In [12]:
# Tune the SVR on the scaled training data and predict the scaled test set.
clf_model, y_pred, best_p, best_score = gridsearch(scaled_X_train, scaled_X_test, y_train)
# Single-argument print(...) calls run under both Python 2 and Python 3. The format strings
# reproduce the spacing of the original comma-separated print statements.
print("Best Parameters:  %s" % (best_p,))
print("Best Grid Search Score:  %s" % (best_score,))
print("Best Estimator:  %s \n" % (clf_model,))

Best Parameters:  {'kernel': 'rbf', 'C': 1000, 'gamma': 10}
Best Grid Search Score:  -0.0301487243959
Best Estimator:  SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=10,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 



#### Compare Predictions
Compare predictions of last 10 data points against the true values (`y_test`).

In [13]:
# Predictions of the tuned SVR for the last 10 test points.
y_pred[-10:]

array([ 126.07165657,   77.24534209,   52.85958745,   77.99306727,
        239.20961012,   70.66122518,   51.70151659,   28.28458761,
        233.67211654,  163.63777674])

In [14]:
# True values for the last 10 test points, for side-by-side comparison with the predictions.
y_test[-10:]

array([ 182.83,   48.12,   46.09,   26.63,   54.03,   52.28,   70.41,
         95.  ,  358.92,   12.44])

### Metrics

#### Root Mean Squared Error (RMSE)
Indicates the *absolute fit* of the model to the data. In other words, how close the observed data points are to the model's predicted values.

Also, it is the standard deviation of unexplained variance (residuals).

In [22]:
# Root mean squared error between the predictions and the observed test values.
rmse = RMSE(y_pred, y_test)
# Parenthesized single-argument print works under both Python 2 and Python 3.
print("RMSE = %.3f days" % rmse)

RMSE = 106.786 days


In [23]:
# Observed range of the target in the test set, to put the RMSE in context.
# Parenthesized single-argument print works under both Python 2 and Python 3.
print("[%.2f,%.2f]" % (y_test.min(), y_test.max()))

[5.16,588.23]


The support vector regression model has an RMSE value of 106.786 days. The value is somewhat small relative to the range of the `dependent variable` (average ICU LOS), which spans 5.16 to 588.23 days in the test set, but it still amounts to roughly 18% of that range. Thus, RMSE alone indicates at best moderate accuracy in predicting the response.

#### Mean Absolute Error
The Mean Absolute Error measures how close the model's predictions are to the observed values.

In [24]:
# Mean absolute error between the predictions and the observed test values.
mae = MAE(y_pred, y_test)
# Parenthesized single-argument print works under both Python 2 and Python 3.
print("MAE = %.3f days" % mae)

MAE = 53.378 days


The average absolute difference between prediction and observation is 53.4 days, which is high.

#### R<sup>2</sup>: Coefficient of Determination
R<sup>2</sup> quantifies the goodness of fit of the regression model. More specifically, it depicts the predictive power of the model.

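Range: (-∞, 1]. A value of 1 is a perfect fit, 0 is equivalent to always predicting the mean of the observed values, and negative values indicate a fit worse than that baseline.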

In [25]:
# Coefficient of determination of the predictions on the test set.
# NOTE(review): unlike RMSE/MAE, R^2 is not symmetric in its arguments -- confirm that the
# `r2` helper expects (predicted, observed) in this order; a swapped order changes the value.
r2(y_pred, y_test)

-0.13129762090156394

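The R<sup>2</sup> value is approximately -0.13. A negative value means the model performs worse on the test set than simply predicting the mean of the observed values, which indicates that the model does not have meaningful predictive power.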