# Chapter 10
## Dimensionality reduction using feature selection

### 10.1 Thresholding numerical feature variance

In [None]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the iris data and split it into the feature matrix and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Fit a variance filter with a cutoff of 0.5, then apply it to the same data.
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit(features).transform(features)

# Show the first three rows of the reduced feature matrix.
features_high_variance[:3]

### 10.2 Thresholding binary feature variance

In [None]:
# Binary feature matrix:
#   feature 0 is 1 in 20% of the rows,
#   feature 1 is 1 in 80% of the rows,
#   feature 2 is 1 in 40% of the rows.
# The rows must differ: with identical rows every column has zero variance
# and VarianceThreshold raises ValueError because no feature survives.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# For a Bernoulli feature Var[x] = p * (1 - p).  Using p = 0.75 as the cutoff
# removes features where one class makes up more than 75% of observations
# (features 0 and 1 here, each with variance 0.8 * 0.2 = 0.16 < 0.1875).
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)

### 10.3 Handling highly correlated features   

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy feature matrix: the first two columns are almost identical, the third
# alternates between 1 and 0.
features = np.array(
    [[1, 1, 1],
     [2, 2, 0],
     [3, 3, 1],
     [4, 4, 0],
     [5, 5, 1],
     [6, 6, 0],
     [7, 7, 1],
     [8, 7, 0],
     [9, 7, 1]]
)

# Absolute pairwise correlations between the columns.
dataframe = pd.DataFrame(features)
corr_mat = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is considered exactly once; all other cells become NaN.
upper = corr_mat.where(
    np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
)

# Column labels that correlate above 0.95 with some earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label: `to_drop` holds column labels, so indexing
# `dataframe.columns` with it positionally is only correct when the labels
# happen to be 0..n-1.
dataframe.drop(columns=to_drop).head(3)

In [None]:
# Display the signed correlation matrix of the original (un-dropped) columns;
# the selection step above worked on its absolute values.
dataframe.corr()

### 10.4 Removing irrelevant features for classification

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [None]:
# Load iris; the measurements are cast to int before scoring
# (presumably so chi2 sees count-like, categorical values).
iris = load_iris()
features, target = iris.data.astype(int), iris.target

# Score every feature against the target with chi-squared and keep the best two.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)

# Report the column counts before and after selection.
print('Original number of features: ', features.shape[1])
print('Reduced number of features: ', features_kbest.shape[1])

In [None]:
# Same selection as before, but scoring features with f_classif;
# again only the two best-scoring features are kept.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# Report the column counts before and after selection.
print('Original number of features: ', features.shape[1])
print('Reduced number of features: ', features_kbest.shape[1])

In [None]:
from sklearn.feature_selection import SelectPercentile

# Instead of a fixed count, keep the top 75 percent of features
# as ranked by their f_classif score.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# Report the column counts before and after selection.
print('Original number of features: ', features.shape[1])
print('Reduced number of features: ', features_kbest.shape[1])

### 10.5 Recursively eliminating features

In [None]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

In [None]:
# Silence the "internal gelsd" warning raised from scipy modules.
# `message` is a regular expression matched against the start of the warning
# text; the pattern must be '^internal gelsd' (caret + "i"), not the garbled
# single character 'î', otherwise the filter never matches anything.
warnings.filterwarnings(
    action = 'ignore', module = 'scipy', message = '^internal gelsd'
)

In [None]:
# Synthetic regression problem: 10000 samples and 100 features, of which
# only 2 are informative; the seed is fixed for reproducibility.
features, target = make_regression(
    n_samples=10000, n_features=100, n_informative=2, random_state=1
)

# Ordinary least squares is the estimator whose features get eliminated.
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation: one feature is removed
# per step and candidates are scored by negative mean squared error.
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')
rfecv = rfecv.fit(features, target)

# Display the feature matrix reduced to the selected columns.
rfecv.transform(features)


In [None]:
# Display the fitted selector's `n_features_` attribute (per the scikit-learn
# RFECV docs: the number of features selected by cross-validation).
rfecv.n_features_

In [None]:
# Display the fitted selector's `support_` attribute (per the scikit-learn
# RFECV docs: a boolean mask marking which input features were kept).
rfecv.support_

In [None]:
# Display the fitted selector's `ranking_` attribute (per the scikit-learn
# RFECV docs: the rank of each feature, with selected features ranked 1).
rfecv.ranking_