Intro to hyperparameter tuning

In [10]:
# import the necessary packages
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import pandas as pd

# loguniform is provided by scipy.stats; the sklearn.utils.fixes shim is
# kept only as a fallback for environments where scipy does not offer it
try:
    from scipy.stats import loguniform
except ImportError:
    from sklearn.utils.fixes import loguniform

Create config class

In [2]:
class Config:
    """Static settings for this notebook: dataset path and column names."""

    # path to the CSV file that holds the dataset
    CSV_PATH = "dataset/abalone_train.csv"

    # names given to the dataframe columns, in file order
    COLS = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]


# single settings object read by the cells below
config = Config()

Load dataset, separate features/labels

In [3]:
# read the CSV, labelling its columns with the configured names
print("[INFO] loading data...")
dataset = pd.read_csv(config.CSV_PATH, names=config.COLS)

# every column except the last is a feature; the last column is the target
dataX = dataset.iloc[:, :-1]
dataY = dataset.iloc[:, -1]

[INFO] loading data...


Split train / test

In [4]:
# hold out 15% of the samples for testing; the fixed seed keeps the
# partition the same on every run
trainX, testX, trainY, testY = train_test_split(
    dataX,
    dataY,
    test_size=0.15,
    random_state=3,
)

Standardize the feature values using the mean and standard deviation computed on the training set

In [5]:
# learn the scaling statistics from the training features only ...
scaler = StandardScaler().fit(trainX)

# ... then apply that same fitted scaler to both partitions
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

Train without hyperparameter tuning and evaluate

In [6]:
# baseline: fit an SVR with its default settings (no tuning at all)
print("[INFO] training our support vector regression model")
model = SVR()
model.fit(trainX, trainY)

# report the R^2-score on the held-out data (1.0 is the best value)
print("[INFO] evaluating...")
print(
    "R2: {:.2f}".format(
        model.score(testX, testY)
    )
)

[INFO] training our support vector regression model
[INFO] evaluating...
R2: 0.55


Tune with grid search

In [7]:
# fresh estimator plus the candidate values the grid-search will
# enumerate exhaustively
model = SVR()
kernel = ["linear", "rbf", "sigmoid", "poly"]
tolerance = [1e-3, 1e-4, 1e-5, 1e-6]
C = [1, 1.5, 2, 2.5, 3]

# keys are the SVR constructor argument names
grid = {
    "kernel": kernel,
    "tol": tolerance,
    "C": C,
}

In [8]:
# set up repeated k-fold cross-validation and run an exhaustive
# grid-search over the hyperparameter grid
print("[INFO] grid searching over the hyperparameters...")
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
gridSearch = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring="neg_mean_squared_error",
    cv=cvFold,
    n_jobs=-1,
)
searchResults = gridSearch.fit(trainX, trainY)

# take the best estimator found by the search and score it on the
# held-out data
print("[INFO] evaluating...")
bestModel = searchResults.best_estimator_
print(
    "R2: {:.2f}".format(
        bestModel.score(testX, testY)
    )
)

[INFO] grid searching over the hyperparameters...
[INFO] evaluating...
R2: 0.56


Tune with randomized search

In [11]:
# fresh estimator plus the space the randomized-search samples from:
# kernel and C are drawn from lists, tol from a log-uniform distribution
model = SVR()
kernel = ["linear", "rbf", "sigmoid", "poly"]
tolerance = loguniform(1e-6, 1e-3)
C = [1, 1.5, 2, 2.5, 3]

# keys are the SVR constructor argument names
grid = {
    "kernel": kernel,
    "tol": tolerance,
    "C": C,
}

Cross-validation folds

In [12]:
# initialize a cross-validation fold and perform a randomized-search
# to tune the hyperparameters
print("[INFO] randomized searching over the hyperparameters...")
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# random_state seeds the candidate sampling so that the candidates tried
# (and therefore the selected model) are the same on every run; the
# number of sampled candidates is left at the library default (n_iter)
randomSearch = RandomizedSearchCV(estimator=model, n_jobs=-1,
	cv=cvFold, param_distributions=grid,
	scoring="neg_mean_squared_error", random_state=1)
searchResults = randomSearch.fit(trainX, trainY)

# extract the best model (chosen by cross-validated negative MSE) and
# evaluate it on the held-out data using R^2
print("[INFO] evaluating...")
bestModel = searchResults.best_estimator_
print("R2: {:.2f}".format(bestModel.score(testX, testY)))

[INFO] grid searching over the hyperparameters...
[INFO] evaluating...
R2: 0.56
