In [1]:
import numpy as np

from sklearn import linear_model
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [None]:
# Hyperparameter tuning for linear regression
# Linear regression analyses a scatter of data points and fits the best-fit line
# (a plane / hyperplane when there is more than one attribute) that describes them.
# With a single attribute the line is y = m*x + b, where y is the predicted value,
# m the coefficient (slope, m = (y2 - y1) / (x2 - x1)) and b the constant/intercept.
# The number of attributes corresponds to the number of dimensions
# (2 values -> 2-dimensional, 3 values -> 3-dimensional, and so on).

# Estimator being tuned
model = linear_model.LinearRegression()

# Evaluation scheme: 5 folds (n_splits), the whole CV repeated 10 times (n_repeats)
cv = RepeatedKFold(n_repeats=10, n_splits=5, random_state=42)

# Search space: every combination below is evaluated by the grid search
space = {
    'fit_intercept': [True, False],
    'positive': [True, False],
}

# Exhaustive search over the grid, scored by negative mean absolute error
search = GridSearchCV(
    estimator=model,
    param_grid=space,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# Run the search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)

# Report the winning configuration
print('Best Score: %s' % (result.best_score_,))
print('Best Hyperparameters: %s' % (result.best_params_,))

In [None]:
# Hyperparameter tuning for KNR (k-nearest-neighbours regression)
# For a query point with an unknown target, the algorithm measures the distance to the
# training points, picks the "k" closest ones (the k in KNN) and predicts the
# (optionally distance-weighted) average of their target values.
# In the classification variant of KNN the k closest points vote for a class instead.

# define model
model = KNeighborsRegressor()
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)  # n_splits - number of folds, n_repeats - number of times CV will be repeated
# define search space
space = dict()
space['n_neighbors'] = [*range(1, 21, 1)]     # Number of neighbors to use
space['weights'] = ['uniform', 'distance']    # 'uniform': all points in a neighborhood are weighted equally;
                                              # 'distance': closer neighbors have a greater influence than further ones
space['algorithm'] = ['ball_tree', 'kd_tree', 'brute']  # Algorithm used to compute the nearest neighbors
space['leaf_size'] = [*range(20, 41, 5)]      # Leaf size passed to BallTree or KDTree
space['p'] = [1, 2]                           # p = 1 is manhattan_distance (l1), p = 2 is euclidean_distance (l2)
# Only 'minkowski' is searched: 'precomputed' expects X to be a square distance matrix
# and is not supported by the ball_tree / kd_tree algorithms, so every candidate that
# sampled it failed to fit on a feature matrix.
space['metric'] = ['minkowski']
# n_jobs is intentionally not part of the search space: it only controls parallelism of
# the neighbor queries, never the predictions, and the search below already uses all cores.

# define search (random_state makes the sampled candidates reproducible)
search = RandomizedSearchCV(
    model,
    space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
# execute search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Hyperparameter tuning for SVR
# Computationally very heavy, so most parameters were tuned manually and only coef0 is
# searched here. RandomizedSearchCV could be considered for a wider search.
# A support vector machine attempts to create a hyperplane (something straight, e.g. a
# straight line between clusters of data). The hyperplane lies between the clusters at
# the same distance from the closest point of each cluster; the hyperplane with the
# biggest distance to the data points (the biggest MARGIN) is preferred.
# If the data cannot be separated on the XY-axis, the SVM adds an extra dimension with the
# help of a kernel (a function that takes f(x1, x2) and returns x3) and draws the
# hyperplane there. A "soft margin" allows some outliers between the clusters.
# NOTE: this cell scores candidates with R^2, while the other tuning cells use negative
# MAE, so the best scores are not directly comparable across cells.
# Requires numpy (np), which is imported in the notebook's import cell.

# define model (kernel, epsilon and C come from manual tuning)
model = SVR(kernel='poly', epsilon=0.02, C=0.033, shrinking=True)
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)  # n_splits - number of folds, n_repeats - number of times CV will be repeated
# define search space
space = dict()
# Candidate ranges kept for reference (disabled because the full grid is too expensive):
#space['kernel'] = ['linear','poly','rbf','sigmoid']         # rbf is default
#space['C'] = [*np.arange(0.001,0.1,0.002)]                  # Regularization parameter. The strength of the regularization is inversely proportional to C.
                                                            # Must be strictly positive. The penalty is a squared l2 penalty
#space['epsilon']=[*np.arange(0.001,0.1,0.002)]              # Epsilon-tube within which no penalty is associated in the training loss
                                                            # with points predicted within a distance epsilon from the actual value
#space['gamma']=[*np.arange(0.1,0.4,0.01)]
space['coef0'] = [*np.arange(0.0, 0.01, 0.001)]              # Independent term of the 'poly' kernel
# define search
search = GridSearchCV(model, space, scoring='r2', n_jobs=-1, cv=cv)
# execute search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Hyperparameter tuning for RandomForest
# define model
# The seed is fixed on the estimator instead of being searched: random_state is not a
# hyperparameter, and sampling None made the results non-reproducible.
model = RandomForestRegressor(random_state=1)
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)  # n_splits - number of folds, n_repeats - number of times CV will be repeated
# define search space
space = dict()
space['n_estimators'] = [*range(50, 200, 10)]   # The number of trees in the forest
# The function to measure the quality of a split.
# NOTE(review): 'poisson' needs non-negative targets - confirm this holds for y.
space['criterion'] = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
space['max_depth'] = [*range(1, 100, 5)]        # The maximum depth of the tree
# The minimum number of samples required to split an internal node.
# An integer value must be >= 2; the previous candidate 1 made every fit that sampled it fail.
space['min_samples_split'] = [2, 5, 10]
space['min_samples_leaf'] = [1, 2]              # The minimum number of samples required to be at a leaf node
# The number of features to consider when looking for the best split.
# NOTE(review): the range assumes X has on the order of 100 features - confirm.
space['max_features'] = [*range(1, 100, 5)]
space['bootstrap'] = [True, False]              # Whether bootstrap samples are used when building trees.
                                                # If False, the whole dataset is used to build each tree
# define search: a randomized search samples candidates from the space (a grid search
# would be exhaustive); random_state makes the sampled candidates reproducible
search = RandomizedSearchCV(
    model,
    space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
# execute search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Hyperparameter tuning for Ridge
# Ridge regression minimizes the least squares error (the sum of squared differences
# between the target value and the estimated value)
#   S = E(y - f(x))^2
# plus an L2 penalty: alpha times the sum of the squared coefficients.
# The squared error makes the fit less robust to outliers than an absolute (L1) error.

# define model (random_state only matters for the stochastic 'sag' / 'saga' solvers)
model = Ridge(random_state=1)
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)  # n_splits - number of folds, n_repeats - number of times CV will be repeated
# define search space
space = dict()
# Solver to use in the computational routines.
# 'lbfgs' is left out: it can be used only when positive=True, so with the default
# positive=False every candidate that sampled it failed to fit.
space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag', 'saga', 'sparse_cg']
space['alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]  # Constant that multiplies the L2 term, controlling regularization strength
space['fit_intercept'] = [True, False]  # Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations

# define search (random_state makes the sampled candidates reproducible)
search = RandomizedSearchCV(
    model,
    space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
# execute search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Hyperparameter tuning for Lasso
# Lasso minimizes the squared error plus an L1 penalty: alpha times the sum of the
# absolute values of the coefficients. The L1 penalty can shrink coefficients to exactly
# zero, which effectively removes the corresponding features.
# For reference, the L1 loss (Absolute Error Loss) is the absolute difference between a
# prediction and the actual value, calculated for each example in a dataset; its
# aggregated cost function is commonly MAE (Mean Absolute Error):
#   L1 for 1 sample is |y - f(x)|, MAE = E|y - f(x)| / n_rows
# The L1 loss is more robust to outliers than the L2 loss. MAE is the metric used to
# score the candidates below.

# define model (random_state only matters when selection='random')
model = Lasso(random_state=1)
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)  # n_splits - number of folds, n_repeats - number of times CV will be repeated
# define search space
space = dict()
space['selection'] = ['cyclic', 'random']  # If set to 'random', a random coefficient is updated every iteration
                                           # rather than looping over features sequentially by default
space['alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 200, 300, 400]  # Constant that multiplies the L1 term, controlling regularization strength
space['fit_intercept'] = [True, False]     # Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations

# define search (random_state makes the sampled candidates reproducible)
search = RandomizedSearchCV(
    model,
    space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)
# execute search; X and y must already exist in the kernel (defined outside this cell)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)