In [None]:
# DATA
# Empty stand-ins for the train/test inputs and targets; each name is bound to
# its own separate list object.

X_train, y_train = [], []
X_test, y_test = [], []

# **Polynomial Regression**

### How is polynomial regression model trained?

1. Apply `polynomial transformation` on the feature matrix.
2. Learn `linear regression model` (via normal equation or SGD) on the transformed feature matrix.

**Implementation tip:** Use a `Pipeline` to chain the polynomial transformation with the linear regression estimator.

1. Set up polynomial regression model with normal equation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression via the normal equation: expand the inputs into
# degree-2 polynomial features, then fit ordinary least squares on them.
# Pipeline steps must be estimator *instances*, so LinearRegression is
# instantiated here instead of being passed as a bare class.
poly_model = Pipeline([('polynomial_transform', PolynomialFeatures(degree = 2)),
                       ('LR', LinearRegression())])

poly_model.fit(X_train, y_train)

2. Set up polynomial regression model with SGD

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression trained with stochastic gradient descent: expand the
# inputs into degree-2 polynomial features, then fit an SGD regressor on them.
# Pipeline steps must be estimator *instances*, so SGDRegressor is
# instantiated here instead of being passed as a bare class.
poly_model = Pipeline([('polynomial_transform', PolynomialFeatures(degree = 2)),
                       ('SGD', SGDRegressor())])

poly_model.fit(X_train, y_train)

### How to use only interaction features for polynomial regression?

[ $𝓧_1$ , $𝓧_2$ ] is transformed to [ 1, $𝓧_1$ , $𝓧_2$ , $𝓧_1$$𝓧_2$ ]

Note that [ $𝓧_1^2$ , $𝓧_2^2$ ] are excluded

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 transformer restricted to interaction terms (no squared features).
poly_transform = PolynomialFeatures(
    degree=2,
    interaction_only=True,
)

# **Regularization**

## RIDGE

### How to perform ridge regularization with specific regularization rate?

**Option 1**

Step1: Instantiate object of `Ridge` estimator

Step2: Set parameter `alpha` to the required regularization rate

`fit`, `score`, `predict` work exactly like other linear regression estimators

In [None]:
from sklearn.linear_model import Ridge

# Ridge estimator with the regularization rate set through `alpha`.
ridge = Ridge(alpha=1e-3)

**Option 2**

Step1: Instantiate object of `SGDRegressor` estimator

Step2: Set parameter `alpha` to the required regularization rate  and `penalty = l2`

In [None]:
from sklearn.linear_model import SGDRegressor

# SGD-based alternative to Ridge: L2 penalty with regularization rate `alpha`.
sgd = SGDRegressor(
    penalty='l2',
    alpha=1e-3,
)

### How to search the best regularization parameter for ridge?

**Option 1**

Search for the best regularization rate with built-in cross validation in `RidgeCV` estimator

**Option 2**

Use cross validation with `Ridge` or `SGDRegressor` to search for best regularization
  * Grid search
  * Randomized search

### How to perform ridge regularization in polynomial regression?

Set up a pipeline of polynomial transformation followed by ridge regressor.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Ridge-regularized polynomial regression: a degree-2 feature expansion
# followed by a Ridge regressor with alpha = 1e-3.
poly_model = Pipeline(steps=[
    ('polynomial_transform', PolynomialFeatures(degree=2)),
    ('ridge', Ridge(alpha=1e-3)),
])

poly_model.fit(X_train, y_train)

Instead of `Ridge`, we can use `SGDRegressor` with `penalty = 'l2'` to get an equivalent formulation.

## LASSO

### How to perform LASSO regularization with specific regularization rate?

**Option 1**

Step1: Instantiate object of `Lasso` estimator

Step2: Set parameter `alpha` to the required regularization rate

`fit`, `score`, `predict` work exactly like other linear regression estimators

In [None]:
from sklearn.linear_model import Lasso

# Lasso estimator with the regularization rate set through `alpha`.
lasso = Lasso(alpha=1e-3)

**Option 2**

Step1: Instantiate object of `SGDRegressor` estimator

Step2: Set parameter `alpha` to the required regularization rate  and `penalty = l1`

In [None]:
from sklearn.linear_model import SGDRegressor

# SGD-based alternative to Lasso: L1 penalty with regularization rate `alpha`.
sgd = SGDRegressor(
    penalty='l1',
    alpha=1e-3,
)

### How to search the best regularization parameter for lasso?

**Option 1**

Search for the best regularization rate with built-in cross validation in `LassoCV` estimator

**Option 2**

Use cross validation with `Lasso` or `SGDRegressor` to search for best regularization
  * Grid search
  * Randomized search

### How to perform lasso regularization in polynomial regression?

Set up a pipeline of polynomial transformation followed by lasso regressor.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Lasso-regularized polynomial regression: a degree-2 feature expansion
# followed by a Lasso regressor with alpha = 1e-3.
poly_model = Pipeline(steps=[
    ('polynomial_transform', PolynomialFeatures(degree=2)),
    ('lasso', Lasso(alpha=1e-3)),
])

poly_model.fit(X_train, y_train)

Instead of `Lasso`, we can use `SGDRegressor` with `penalty = 'l1'` to get an equivalent formulation.

## Ridge and Lasso together

### How to perform both lasso and ridge regularization in polynomial regression?

Set up a pipeline of polynomial transformation followed by the SGDRegressor with `penalty = 'elasticnet'`

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Elastic-net regularized polynomial regression: a degree-2 feature expansion
# followed by an SGD regressor that mixes the L1 and L2 penalties.
# The mixing keyword is `l1_ratio`; 0.3 puts weight 0.3 on L1 and 0.7 on L2.
poly_model = Pipeline([('polynomial_transform', PolynomialFeatures(degree = 2)),
                       ('elasticnet', SGDRegressor(penalty = 'elasticnet', l1_ratio = 0.3))])

poly_model.fit(X_train, y_train)

`elasticnet` is a convex combination of L1(Lasso) and L2(Ridge) regularization.

In the above example, we have set `l1_ratio = 0.3`, which means the L1 penalty gets weight 0.3 and the L2 penalty gets weight `1 - l1_ratio = 0.7`. L2 therefore carries more weight in this formulation.

# **Hyperparameter Tuning**

### How to recognize hyperparameters in any sklearn estimator?

* `Hyper-parameters` are parameters that are not directly learnt within estimators.

* In `sklearn`, they are passed as arguments to the constructor of the estimator classes.

* For example,

  * `degree` in `PolynomialFeatures`
  * `learning_rate` in `SGDRegressor`

### How to set these hyperparameters?

* Select the hyperparameters that result in the best cross-validation scores.

* Hyper parameter search consists of:

  * an estimator (regressor or classifier)
  * a parameter space
  * a method for searching or sampling candidates
  * a cross-validation scheme
  * a score function

* We can specify hyperparameter search with these five components.

Two generic HPT approaches implemented in sklearn are:

  * `GridSearchCV` exhaustively considers all parameter combinations for specified values.

  * `RandomizedSearchCV` samples a given number of candidate values from a parameter space with a specified distribution.

In [None]:
# GridSearchCV

# One grid: four candidate values for `C` and a single choice for `kernel`.
param_grid = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
]

In [None]:
# RandomizedSearchCV

# Imported in this cell so it runs on a fresh kernel; previously `stats` and
# `loguniform` were used without being imported first.
from scipy.stats import loguniform, uniform

# Search space: a list is sampled from uniformly, a distribution object is
# sampled through its `rvs` method.
param_dist = {
    'average': [True, False],
    'l1_ratio': uniform(0, 1),        # uniform on [0, 1]
    'alpha': loguniform(1e-4, 1e0)    # log-uniform on [1e-4, 1]
}

### What are the differences between grid and randomized search?

Grid Search
  * Specifies exact values of parameters in grid

Randomized Search
  * Specifies distributions of parameter values and values are sampled from those distributions.
  * Computational budget can be chosen independent of number of parameters and their possible values.
  * The budget is chosen in terms of the number of sampled candidates (sampling iterations), specified in the `n_iter` argument.

### What data split is recommended for HPT?

**STEP-1**

* Divide the available data into `training`, `validation` and `test` sets.

**STEP-2**

* For each combination of hyper-parameter values learn a model with training set.
* This step creates multiple models.
* This step can be run in parallel by setting `n_jobs = -1`
* Some parameter combinations may cause fitting to fail on one or more folds of data, which may cause the whole search to fail. Set `error_score = 0` (or `np.nan`) to assign that score to the problematic fold and let the search complete.

**STEP-3**

* Evaluate performance of each model with validation set and select a model with the best evaluation score.

**STEP-4**

* Retrain model with the best hyper-parameter settings on training and validation set combined.

**STEP-5**

* Evaluate the model performance on the test set.

Note : The test set was not used in hyper-parameter search and model retraining. That is why this performance measure is likely to give us true performance measure on the unseen data.




### What are some of the model specific HPT available for regression tasks?

* Some models can fit data for a range of values of some parameter almost as efficiently as fitting the estimator for a single value of the parameter.

* This feature can be leveraged to perform more efficient cross-validation used for model selection of this parameter.

  * linear_model.LassoCV
  * linear_model.RidgeCV
  * linear_model.ElasticNetCV

### How to determine degree of polynomial regression with grid search?

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor

# Candidate polynomial degrees. The 'poly__' prefix addresses the `degree`
# parameter of the pipeline step named 'poly'.
param_grid = [
    {
        'poly__degree' : [2,3,4,5,6,7,8,9]
    }
]

# Polynomial feature expansion followed by an SGD regressor.
pipeline = Pipeline(steps = [
    ('poly', PolynomialFeatures()),
    ('sgd', SGDRegressor())
])

# 5-fold grid search scored by negative mean squared error; the scorer name
# must be spelled exactly 'neg_mean_squared_error'.
grid_search = GridSearchCV(pipeline, param_grid, cv = 5, scoring = 'neg_mean_squared_error', return_train_score = True)

# NOTE(review): reshape(-1, 1) assumes X_train is a 1-D NumPy array holding a
# single feature — confirm against the data actually loaded.
grid_search.fit(X_train.reshape(-1,1), y_train)

# **California Housing Dataset**

Basic Understanding of Data

In [8]:
from sklearn.datasets import fetch_california_housing

In [9]:
# Fetch the California housing dataset, requesting pandas-backed data.
ch = fetch_california_housing(as_frame=True)

Linear Regression on California Housing Dataset

In [23]:
import numpy as np
import pandas as pd

from scipy.stats import loguniform, uniform

from sklearn.dummy import DummyRegressor

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, SGDRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, ShuffleSplit, validation_curve, GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.pipeline import Pipeline

In [24]:
# Cross-validation scheme: 10 shuffled splits, each holding out 20% of the
# samples, with a fixed seed so the splits are repeatable.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

In [25]:
#DATA LOADING AND SPLITTING

# Fetch the feature matrix and the label vector directly.
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# First split: hold out the test set from the full data.
com_train_features, X_test, com_train_labels, y_test = train_test_split(
    features,
    labels,
    random_state=42,
)

# Second split: carve a dev set out of the remaining (combined) training data.
X_train, X_dev, y_train, y_dev = train_test_split(
    com_train_features,
    com_train_labels,
    random_state=42,
)