In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import joblib
# Restore the three variables persisted with IPython's %store magic; they
# must have been stored beforehand (by another notebook or session).
%store -r X_features_full
%store -r X_features_poly_full
%store -r y_labels_full

# Short aliases used by the cells below. Those cells fit on index [0] and score
# on index [1], so each object is presumably a (train, validation) pair —
# TODO confirm against the notebook that stored them.
X_features = X_features_full
X_features_poly = X_features_poly_full
y_labels = y_labels_full
from sklearn.metrics import mean_absolute_percentage_error


In [2]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


# Baseline: linear-kernel SVR fitted on X_features[0] / y_labels[0] and scored
# on the held-out X_features[1] / y_labels[1].
svm_lin_reg = SVR(kernel="linear", epsilon=0.001, C=10)
svm_lin_reg.fit(X_features[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring, so the error is measured on that scale rather than on the model's
# target scale (presumably labels are sqrt(log(visc + 0.0012)) — TODO confirm).
visc_predictions = np.exp(np.power(svm_lin_reg.predict(X_features[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.03916463599574806

In [3]:
# Persist the fitted linear-kernel SVR; the cell displays the written file list.
joblib.dump(value=svm_lin_reg, filename="svm_lin_reg.pkl")

['svm_lin_reg.pkl']

In [4]:
# Polynomial-kernel SVR fitted on X_features[0] / y_labels[0].
# NOTE(review): the variable (and the pickle it is later saved to) says "poly2"
# but the kernel degree is 4 — confirm which of the two is intended.
svm_poly2_reg = SVR(kernel="poly", degree=4, epsilon=0.001, C=10)
svm_poly2_reg.fit(X_features[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring (presumably the inverse of the label transform — TODO confirm).
visc_predictions = np.exp(np.power(svm_poly2_reg.predict(X_features[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.08264127584439501

In [5]:
# Persist the fitted polynomial-kernel SVR; the cell displays the written file list.
joblib.dump(value=svm_poly2_reg, filename="svm_poly2_reg.pkl")

['svm_poly2_reg.pkl']

In [6]:
# Degree-3 polynomial-kernel SVR fitted on X_features[0] / y_labels[0].
svm_poly3_reg = SVR(kernel="poly", degree=3, epsilon=0.001, C=10)
svm_poly3_reg.fit(X_features[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring (presumably the inverse of the label transform — TODO confirm).
visc_predictions = np.exp(np.power(svm_poly3_reg.predict(X_features[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.05839869114548517

In [7]:
# Persist the fitted degree-3 polynomial-kernel SVR; the cell displays the written file list.
joblib.dump(value=svm_poly3_reg, filename="svm_poly3_reg.pkl")

['svm_poly3_reg.pkl']

In [8]:
# Linear-kernel SVR fitted on X_features_poly[0] (by its name, presumably a
# polynomial expansion of the base features — TODO confirm) / y_labels[0].
svm_poly_reg = SVR(kernel="linear", epsilon=0.001, C=10)
svm_poly_reg.fit(X_features_poly[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring (presumably the inverse of the label transform — TODO confirm).
visc_predictions = np.exp(np.power(svm_poly_reg.predict(X_features_poly[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.03816509570093616

In [9]:
# Persist the SVR fitted on X_features_poly; the cell displays the written file list.
joblib.dump(value=svm_poly_reg, filename="svm_poly_reg.pkl")

['svm_poly_reg.pkl']

In [10]:
# RBF-kernel SVR fitted on X_features[0] / y_labels[0].
svm_rbf_reg = SVR(kernel="rbf", C=1, epsilon=0.001)
svm_rbf_reg.fit(X_features[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring (presumably the inverse of the label transform — TODO confirm).
visc_predictions = np.exp(np.power(svm_rbf_reg.predict(X_features[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.03594178376013724

In [11]:
# Persist the fitted RBF-kernel SVR; the cell displays the written file list.
joblib.dump(value=svm_rbf_reg, filename="svm_rbf_reg.pkl")

['svm_rbf_reg.pkl']

In [12]:
# RBF-kernel SVR fitted on X_features_poly[0] (by its name, presumably a
# polynomial expansion of the base features — TODO confirm) / y_labels[0].
svm_rbf_poly_reg = SVR(kernel="rbf", C=1, epsilon=0.001)
svm_rbf_poly_reg.fit(X_features_poly[0], y_labels[0])

# Map predictions and held-out labels through exp(t**2) - 0.0012 before
# scoring (presumably the inverse of the label transform — TODO confirm).
visc_predictions = np.exp(np.power(svm_rbf_poly_reg.predict(X_features_poly[1]), 2)) - 0.0012
visc_actual = np.exp(np.power(y_labels[1], 2)) - 0.0012

# The score is a mean absolute percentage error (sklearn returns it as a
# fraction), so it is named as such; taking a square root of it is not
# meaningful and is not done.
svm_mape = mean_absolute_percentage_error(visc_actual, visc_predictions)
svm_mape

0.03701918637687342

In [13]:
# Persist the RBF-kernel SVR fitted on X_features_poly; the cell displays the written file list.
joblib.dump(value=svm_rbf_poly_reg, filename="svm_rbf_poly_reg.pkl")

['svm_rbf_poly_reg.pkl']

In [14]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit


# Stack partition [0] on top of partition [1] so the search sees one array,
# then pin the split: test_fold = -1 keeps a row out of every test fold and
# test_fold = 0 puts it in the single validation fold.
X = np.concatenate([X_features[0], X_features[1]], axis=0)
y = np.concatenate([y_labels[0], y_labels[1]], axis=0)
split_index = [-1] * len(X_features[0]) + [0] * len(X_features[1])
pds = PredefinedSplit(test_fold=split_index)

# Two sub-grids: the linear kernel varies C only; the rbf kernel varies
# both C and epsilon.
param_grid = [
    dict(kernel=['linear'], C=[0.1, 1., 10., 30.]),
    dict(
        kernel=['rbf'],
        C=[0.1, 1.0, 3.0, 10., 30.],
        epsilon=[0.001, 0.005, 0.01, 0.05, 0.1, 1, 1.5],
    ),
]

svm_reg = SVR()
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=pds,
    verbose=2,
)
grid_search.fit(X, y)

Fitting 1 folds for each of 39 candidates, totalling 39 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   0.4s
[CV] END ...............................C=1.0, kernel=linear; total time=   1.0s
[CV] END ..............................C=10.0, kernel=linear; total time=   4.9s
[CV] END ..............................C=30.0, kernel=linear; total time=  11.9s
[CV] END ...................C=0.1, epsilon=0.001, kernel=rbf; total time=   3.6s
[CV] END ...................C=0.1, epsilon=0.005, kernel=rbf; total time=   3.3s
[CV] END ....................C=0.1, epsilon=0.01, kernel=rbf; total time=   2.8s
[CV] END ....................C=0.1, epsilon=0.05, kernel=rbf; total time=   1.3s
[CV] END .....................C=0.1, epsilon=0.1, kernel=rbf; total time=   0.5s
[CV] END .......................C=0.1, epsilon=1, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, epsilon=1.5, kernel=rbf; total time=   0.0s
[CV] END ...................C=1.0, epsilon=0.001

In [15]:
# Show the estimator that the grid search selected as best.
grid_search.best_estimator_

In [16]:
# Persist the grid search's best estimator; the cell displays the written file list.
joblib.dump(value=grid_search.best_estimator_, filename="optimized_svm_regressor.pkl")

['optimized_svm_regressor.pkl']

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Rebuild the stacked arrays and the fixed split so this cell does not depend
# on the grid-search cell's variables: test_fold = -1 keeps a row out of every
# test fold, test_fold = 0 puts it in the single validation fold.
split_index = [-1]*len(X_features[0]) + [0]*len(X_features[1])
X = np.concatenate((X_features[0], X_features[1]), axis=0)
y = np.concatenate((y_labels[0], y_labels[1]), axis=0)
pds = PredefinedSplit(test_fold = split_index)
# see https://docs.scipy.org/doc/scipy/reference/stats.html
# for `expon()` and `reciprocal()` documentation and more probability distribution functions.

# C is drawn log-uniformly from [1, 100] and epsilon from an exponential
# distribution with scale 1. gamma is not sampled, so rbf candidates use
# SVR's default gamma.
# NOTE(review): the grid-search log shows epsilon=1 and epsilon=1.5 fits
# finishing in 0.0s, which suggests an epsilon of this size may be too wide for
# these targets — consider a distribution concentrated on smaller values.
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(1, 100),
        'epsilon': expon(scale=1.0),
    }

svm_reg = SVR()
# random_state fixes the sampled candidates so the search (and the estimator
# saved from it) is reproducible across kernel restarts.
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=pds, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42)
rnd_search.fit(X, y)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV] END C=84.90194657973237, epsilon=1.2756678902824727, kernel=linear; total time=   0.0s
[CV] END C=7.030422251370759, epsilon=0.41160269354109286, kernel=rbf; total time=   0.0s
[CV] END C=3.834404803127021, epsilon=0.31820970869944004, kernel=linear; total time=   0.0s
[CV] END C=7.063159572316437, epsilon=0.35700874037643077, kernel=linear; total time=   0.0s
[CV] END C=39.99033508341051, epsilon=2.278984893522787, kernel=linear; total time=   0.0s
[CV] END C=15.437925583860988, epsilon=0.6118034250925121, kernel=rbf; total time=   0.0s
[CV] END C=1.4116002704670323, epsilon=0.7574061050399753, kernel=linear; total time=   0.0s
[CV] END C=71.7143173830146, epsilon=0.44413943258292754, kernel=rbf; total time=   0.0s
[CV] END C=1.6600481046794247, epsilon=0.9285108718222016, kernel=linear; total time=   0.0s
[CV] END C=82.94271478050182, epsilon=0.5008870094855961, kernel=linear; total time=   0.0s
[CV] END C=1.3524357123

In [18]:
# Persist the randomized search's best estimator; the cell displays the written file list.
joblib.dump(value=rnd_search.best_estimator_, filename="_randomly_optimized_svm_regressor.pkl")

['_randomly_optimized_svm_regressor.pkl']

In [19]:
# Show the estimator that the randomized search selected as best.
rnd_search.best_estimator_