In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import joblib
%store -r X_features
%store -r X_features_poly
%store -r y_labels
from sklearn.metrics import mean_absolute_percentage_error


In [2]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


# Baseline: linear-kernel SVR fitted on element 0 of the stored feature/label
# splits and scored on element 1.
svm_lin_reg = SVR(kernel="linear", epsilon=0.001, C=10)
svm_lin_reg.fit(X_features[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_lin_reg.predict(X_features[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.03709087835533551

In [3]:
# Persist the fitted linear-kernel SVR; the cell output lists the file written.
joblib.dump(value=svm_lin_reg, filename="svm_lin_reg.pkl")

['svm_lin_reg.pkl']

In [4]:
# Polynomial-kernel SVR fitted on element 0 of the stored splits and scored on
# element 1.
# NOTE(review): the variable (and the pickle written from it) is called
# "poly2", but the kernel degree configured here is 4 — confirm which of the
# two is intended before relying on the saved model's name.
svm_poly2_reg = SVR(kernel="poly", degree=4, epsilon=0.001, C=10)
svm_poly2_reg.fit(X_features[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_poly2_reg.predict(X_features[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.06025053794761864

In [5]:
# Persist the fitted polynomial-kernel SVR; the cell output lists the file written.
joblib.dump(value=svm_poly2_reg, filename="svm_poly2_reg.pkl")

['svm_poly2_reg.pkl']

In [6]:
# Degree-3 polynomial-kernel SVR fitted on element 0 of the stored splits and
# scored on element 1.
svm_poly3_reg = SVR(kernel="poly", degree=3, epsilon=0.001, C=10)
svm_poly3_reg.fit(X_features[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_poly3_reg.predict(X_features[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.049204712887437524

In [7]:
# Persist the fitted degree-3 polynomial-kernel SVR; the cell output lists the file written.
joblib.dump(value=svm_poly3_reg, filename="svm_poly3_reg.pkl")

['svm_poly3_reg.pkl']

In [8]:
# Linear-kernel SVR again, but fitted and scored on the X_features_poly splits
# instead of X_features.
# NOTE(review): despite the "poly" in the name, the kernel here is linear;
# presumably the polynomial part lives in the X_features_poly inputs — confirm.
svm_poly_reg = SVR(kernel="linear", epsilon=0.001, C=10)
svm_poly_reg.fit(X_features_poly[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_poly_reg.predict(X_features_poly[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.03556830944932435

In [9]:
# Persist the SVR fitted on X_features_poly; the cell output lists the file written.
joblib.dump(value=svm_poly_reg, filename="svm_poly_reg.pkl")

['svm_poly_reg.pkl']

In [10]:
# RBF-kernel SVR fitted on element 0 of the stored X_features splits and scored
# on element 1.
svm_rbf_reg = SVR(kernel="rbf", C=1, epsilon=0.001)
svm_rbf_reg.fit(X_features[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_rbf_reg.predict(X_features[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.03456005538889385

In [11]:
# Persist the fitted RBF-kernel SVR; the cell output lists the file written.
joblib.dump(value=svm_rbf_reg, filename="svm_rbf_reg.pkl")

['svm_rbf_reg.pkl']

In [12]:
# RBF-kernel SVR with the same hyperparameters as svm_rbf_reg, but fitted and
# scored on the X_features_poly splits instead of X_features.
svm_rbf_poly_reg = SVR(kernel="rbf", C=1, epsilon=0.001)
svm_rbf_poly_reg.fit(X_features_poly[0], y_labels[0])

# Predictions and reference labels go through the same mapping,
# exp(t**2) - 0.0012, before they are compared.
# NOTE(review): presumably this inverts a target transform applied when
# y_labels was built — confirm against the notebook that stores it.
visc_predictions = np.exp(np.power(svm_rbf_poly_reg.predict(X_features_poly[1]), 2)) - 0.0012
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error, so it is named accordingly.
# It was previously stored as `svm_mse`, alongside a square root of it called
# `svm_rmse`; neither name matched the quantity, and the square root of a MAPE
# is not a meaningful statistic, so it is no longer computed.
svm_mape = mean_absolute_percentage_error(visc_true, visc_predictions)
svm_mape

0.03500197466223391

In [13]:
# Persist the RBF-kernel SVR fitted on X_features_poly; the cell output lists the file written.
joblib.dump(value=svm_rbf_poly_reg, filename="svm_rbf_poly_reg.pkl")

['svm_rbf_poly_reg.pkl']

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit


# Stack both stored splits into a single dataset and mark, row by row, which
# part each row came from: -1 for rows of split 0, 0 for rows of split 1.
# PredefinedSplit then yields one fold (the log reports "1 folds") in which the
# rows marked 0 form the evaluation set.
n_fit_rows = len(X_features[0])
n_eval_rows = len(X_features[1])
split_index = [-1] * n_fit_rows + [0] * n_eval_rows
X = np.concatenate([X_features[0], X_features[1]], axis=0)
y = np.concatenate([y_labels[0], y_labels[1]], axis=0)
pds = PredefinedSplit(test_fold=split_index)

# Two sub-grids: the linear kernel varies only C (epsilon stays at the SVR
# default), while the RBF kernel varies both C and epsilon.
linear_grid = {'kernel': ['linear'], 'C': [0.1, 1., 10., 30.]}
rbf_grid = {
    'kernel': ['rbf'],
    'C': [0.1, 1.0, 3.0, 10., 30.],
    'epsilon': [0.001, 0.005, 0.01, 0.05, 0.1, 1, 1.5],
}
param_grid = [linear_grid, rbf_grid]

# Candidates are ranked by negative MSE computed on y as stored, i.e. without
# the exp(t**2) - 0.0012 mapping used by the hand-scored models in this notebook.
# NOTE(review): no `refit` argument is passed; if the library default retrains
# the winner on all of X, best_estimator_ will also have seen the evaluation
# rows — confirm that is intended.
svm_reg = SVR()
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=pds,
    verbose=2,
)
grid_search.fit(X, y)

Fitting 1 folds for each of 39 candidates, totalling 39 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   0.3s
[CV] END ...............................C=1.0, kernel=linear; total time=   0.6s
[CV] END ..............................C=10.0, kernel=linear; total time=   2.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.6s
[CV] END ...................C=0.1, epsilon=0.001, kernel=rbf; total time=   3.2s
[CV] END ...................C=0.1, epsilon=0.005, kernel=rbf; total time=   3.2s
[CV] END ....................C=0.1, epsilon=0.01, kernel=rbf; total time=   2.9s
[CV] END ....................C=0.1, epsilon=0.05, kernel=rbf; total time=   1.4s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   0.4s
[CV] END .......................C=0.1, epsilon=1, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, epsilon=1.5, kernel=rbf; total time=   0.0s
[CV] END ...................C=1.0, epsilon=0.001

In [15]:
# Display the estimator that the grid search selected as best.
grid_search.best_estimator_

In [16]:
# Persist the grid search's best estimator; the cell output lists the file written.
joblib.dump(value=grid_search.best_estimator_, filename="optimized_svm_regressor.pkl")

['optimized_svm_regressor.pkl']

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Rebuild the combined dataset and the single predefined fold here so that this
# cell does not rely on the grid-search cell having been run first.
# Rows from split 0 are marked -1, rows from split 1 are marked 0 (the one
# evaluation fold).
split_index = [-1]*len(X_features[0]) + [0]*len(X_features[1])
X = np.concatenate((X_features[0], X_features[1]), axis=0)
y = np.concatenate((y_labels[0], y_labels[1]), axis=0)
pds = PredefinedSplit(test_fold = split_index)
# see https://docs.scipy.org/doc/scipy/reference/stats.html
# for `expon()` and `reciprocal()` documentation and more probability distribution functions.

# Note: `C` and `epsilon` are sampled for both kernels; no kernel-specific
# hyperparameter (such as gamma) is part of this search.
# NOTE(review): the hand-tuned models in this notebook use epsilon=0.001,
# whereas expon(scale=1.0) has mean 1 — confirm the sampled range is the one
# you want to explore.
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(1, 100),
        'epsilon': expon(scale=1.0),
    }

# random_state pins the sampled candidates so that a Restart & Run All
# reproduces the same 50 configurations (and therefore the same saved model).
svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=pds, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42)
rnd_search.fit(X, y)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV] END C=4.448478363023492, epsilon=0.3189055896114785, kernel=linear; total time=   0.1s
[CV] END C=4.256719172504229, epsilon=0.31618465502254717, kernel=rbf; total time=   0.0s
[CV] END C=95.94832998399855, epsilon=0.4235539599430035, kernel=rbf; total time=   0.0s
[CV] END C=94.27853249102334, epsilon=0.6937056284491532, kernel=linear; total time=   0.0s
[CV] END C=1.9357529569406178, epsilon=0.14296815426373755, kernel=rbf; total time=   0.2s
[CV] END C=55.93956203193279, epsilon=0.7538287447500297, kernel=linear; total time=   0.0s
[CV] END C=22.269542191528515, epsilon=1.8893525005106941, kernel=linear; total time=   0.0s
[CV] END C=67.76746148509858, epsilon=0.16261066265423987, kernel=linear; total time=   1.7s
[CV] END C=22.45070936322433, epsilon=0.09236881442270906, kernel=rbf; total time=   0.9s
[CV] END C=41.87786014729522, epsilon=0.05595908327634237, kernel=rbf; total time=   2.8s
[CV] END C=2.03861429174698

In [18]:
# Persist the randomized search's best estimator; the cell output lists the file written.
joblib.dump(value=rnd_search.best_estimator_, filename="_randomly_optimized_svm_regressor.pkl")

['_randomly_optimized_svm_regressor.pkl']

In [19]:
# Display the estimator that the randomized search selected as best.
rnd_search.best_estimator_