In [1]:
import sklearn

# Record the installed scikit-learn version so the outputs below can be tied to it.
print(sklearn.__version__)

0.23.1


### Classification Dataset

In [24]:
from sklearn.datasets import make_classification

# Synthetic classification data: 1000 samples with 10 features, of which
# 5 are informative and 5 are redundant. random_state pins the result.
x, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=1,
)
print(x.shape)
print(y.shape)

(1000, 10)
(1000,)


### Regression Dataset

In [7]:
# Synthetic regression data used by the linear-regression example below.
from sklearn.datasets import make_regression

# 1000 samples, 10 features, 5 of them informative; random_state pins the result.
# NOTE: this rebinds `y`, replacing the classification labels assigned earlier.
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=1,
)
# Show the shapes of the feature matrix and the target vector.
print(X.shape, y.shape)

(1000, 10) (1000,)


### Using Coefficients as a feature score

Linear machine learning algorithms fit a model where the prediction is the weighted sum of the input values.

Examples include linear regression, logistic regression, and extensions that add regularization, such as ridge regression and the elastic net.

All of these algorithms find a set of coefficients to use in the weighted sum in order to make a prediction. These coefficients can be used directly as a crude type of feature importance score.

Let’s take a closer look at using coefficients as feature importance for classification and regression. We will fit a model on the dataset to find the coefficients, then summarize the importance scores for each input feature and finally create a bar chart to get an idea of the relative importance of the features.

#### Linear Regression Feature Importance

We can fit a Linear Regression model on the regression dataset and retrieve the coefficient found for each input feature.
 
These coefficient values can serve as the basis for a feature score, but only if all the features are on the same scale before the model is fitted.

In [22]:
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

# Fit a LinearRegression model on the regression dataset (X, y).
model = LinearRegression()
model.fit(X, y)

# One coefficient per input feature; used here as a crude importance score.
importance = model.coef_

# Summarize the score of every feature by its column index.
for index, value in enumerate(importance):
    print('feature: %0d, Score: %.5f' % (index, value))

# Bar chart of the scores so the relative importance of the features can be
# compared at a glance (pyplot was imported for this but never used).
plt.bar(range(len(importance)), importance)
plt.xlabel('feature index')
plt.ylabel('coefficient')
plt.title('Linear regression coefficients as feature importance')
plt.show()

feature: 0, Score: -0.00000
feature: 1, Score: 12.44483
feature: 2, Score: 0.00000
feature: 3, Score: -0.00000
feature: 4, Score: 93.32225
feature: 5, Score: 86.50811
feature: 6, Score: 26.74607
feature: 7, Score: 3.28535
feature: 8, Score: 0.00000
feature: 9, Score: -0.00000


### Logistic Regression Feature Importance

In [33]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Rebuild the classification data inside this cell. The name `y` is reassigned
# by the regression-dataset cell, so on a top-to-bottom run `y` would hold the
# continuous regression target (not class labels) by the time this cell runs.
# The arguments and random_state match the "Classification Dataset" cell, so
# the data is the same as the one originally generated there.
x_clf, y_clf = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)

model = LogisticRegression()
model.fit(x_clf, y_clf)

# coef_ is indexed with [0] to get the row holding one coefficient per input feature.
importance = model.coef_[0]

# Summarize the score of every feature by its column index.
for i, v in enumerate(importance):
    print('feature: %0d, Score: %.5f' % (i, v))

feature: 0, Score: 0.16320
feature: 1, Score: -0.64301
feature: 2, Score: 0.48497
feature: 3, Score: -0.46190
feature: 4, Score: 0.18432
feature: 5, Score: -0.11978
feature: 6, Score: -0.40602
feature: 7, Score: 0.03772
feature: 8, Score: -0.51785
feature: 9, Score: 0.26540


### Selecting Good Features - Linear Models and Regularization

In [2]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [9]:
# Seed NumPy's global random generator so the same random data is produced on
# every run; without the seed the values would differ each time.
np.random.seed(0)
size = 5000

# Feature matrix: `size` rows, 3 columns, drawn from a normal distribution
# with mean 0 and standard deviation 1.
X = np.random.normal(0, 1, (size, 3))

# Target uses only the first two features (weights 1 and 2) plus normal noise
# with standard deviation 2; the third feature does not enter the target.
noise = np.random.normal(0, 2, size)
y = X[:, 0] + 2 * X[:, 1] + noise

lr = LinearRegression()
lr.fit(X, y)

LinearRegression()

In [10]:
# Fitted weights; the target was generated with weights 1 and 2 on the first two
# features and no contribution from the third.
lr.coef_

array([ 0.98422873,  1.99522378, -0.04074316])

### Regularized Models

#### L1 / Lasso Regularization

In [16]:
# Example for Lasso on Boston housing dataset

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston

In [18]:
# Load the Boston housing dataset and display the returned object.
# NOTE(review): load_boston may not be available in newer scikit-learn releases —
# confirm against the version this notebook is expected to run on.
boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [19]:
# Scaler used below to standardize the Boston feature matrix.
scaler = StandardScaler()

In [21]:
# Fit the scaler on the Boston features and keep the transformed matrix.
# This rebinds X, which earlier cells used for other datasets.
X = scaler.fit_transform(boston['data'])
X

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

In [30]:
# Target values of the Boston dataset (upper-case Y; lower-case y is used by earlier cells).
Y = boston['target']
Y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [26]:
# Names of the features in the Boston dataset

boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [28]:
# Create the Lasso model with alpha=0.3 (it is fitted in a later cell)

lasso = Lasso(alpha=0.3)

In [31]:
# Fit the Lasso model on the standardized features X and the targets Y.
lasso.fit(X,Y)

Lasso(alpha=0.3)

In [32]:
# Pair every feature name with its fitted Lasso coefficient and print one pair per line.
for feature_name, coefficient in zip(boston['feature_names'], lasso.coef_):
    print(feature_name, coefficient)

CRIM -0.24227912487413236
ZN 0.08181900064057382
INDUS -0.0
CHAS 0.5398719150407506
NOX -0.6989125779407368
RM 2.9932299323791325
AGE -0.0
DIS -1.0809132506977173
RAD 0.0
TAX -0.0
PTRATIO -1.7556124903860422
B 0.628315257481857
LSTAT -3.7046328712517838


### Grid Search Parameter Tuning

Grid search is an approach to parameter tuning that will methodically build and evaluate a model for each combination of algorithm parameters specified in a grid.

The recipe below evaluates different alpha values for the Ridge Regression algorithm on the standard diabetes dataset. This is a one-dimensional grid search.

In [2]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the diabetes dataset and display the returned object

dataset = datasets.load_diabetes()
dataset

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [6]:
# Candidate alpha values for the grid search, from 1 down to 0

alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])

In [9]:
# Ridge regression model; it is the estimator the grid search below evaluates for each alpha

model = Ridge()

In [12]:
# Grid search (sklearn.model_selection.GridSearchCV) over the candidate alphas for the Ridge model

grid = GridSearchCV(estimator=model, param_grid={'alpha': alphas})

In [13]:
# Run the grid search on the diabetes features and their corresponding target values

grid.fit(dataset.data, dataset.target)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])})

In [15]:
# Best score found by the grid search

grid.best_score_

0.4823231384163485

In [16]:
# alpha of the best estimator selected by the grid search
grid.best_estimator_.alpha

0.0001

### Random Search Parameter Tuning

In [None]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn import 