# Feature selection
This notebook explores feature selection. It uses the Boston house price dataset (historically bundled with scikit-learn), loaded here from a local `boston.csv` file.

## Imports

In [33]:
# Core libraries
import pandas as pd

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Load data

In [34]:
# Load the Boston housing data set from a local CSV file.
# Some header names in this file carry stray spaces (they surface further down
# as ' ZN ' and 'INDUS ' in the printed feature lists), so strip them on load
# to keep column lookups by name predictable.
boston = pd.read_csv("boston.csv").rename(columns=str.strip)

## Split into X and y

In [35]:
# Separate the target column (MEDV) from the remaining input columns
y = boston["MEDV"]
X = boston.drop(columns="MEDV")

## Scale features to same range

In [36]:
# Rescale every input feature to the [0, 1] range
# NOTE(review): the scaler is fitted on all rows before any train/test split is
# made, so the rows that later become the test set influence the scaling
# (data leakage) — consider fitting on the training split only.
scaler = MinMaxScaler(feature_range=(0,1))
X_ = scaler.fit_transform(X)
# Re-wrap the scaled values in a DataFrame under the original column names
X = pd.DataFrame(X_, columns=X.columns)

  return self.partial_fit(X, y)


## Review features

In [40]:
# Preview the first rows of the rescaled input features
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,0.099338


In [41]:
# Per-feature variance of the rescaled inputs (computed over all rows of X)
X.var()

CRIM       0.009347
 ZN        0.054394
INDUS      0.063242
CHAS       0.064513
NOX        0.056850
RM         0.018124
AGE        0.084039
DIS        0.036665
RAD        0.143320
TAX        0.103450
PTRATIO    0.053044
LSTAT      0.038828
dtype: float64

## Find and remove low variance features

In [42]:
# Hold out roughly one third of the rows for testing; the fixed seed keeps the split repeatable
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [15]:
# Fit a variance-threshold selector on the training split and report what it keeps
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=0.02).fit(X_train)
variance_mask = sel.get_support()  # boolean flag per column: True = retained

print("Feature selection", variance_mask)
print("Selected features:", list(X.columns[variance_mask]))
print("Removed features:", list(X.columns[~variance_mask]))

Feature selection [False  True  True  True  True False  True  True  True  True  True  True]
Selected features: [' ZN ', 'INDUS ', 'CHAS', 'NOX', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
Removed features: ['CRIM', 'RM']


In [16]:
# Apply the fitted variance selector to both splits (drops the low-variance columns)
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

In [17]:
# Confirm the column count of each split after the transform
print(X_train.shape, X_test.shape, sep="\n")

(339, 10)
(167, 10)


## Select features using K-best

In [18]:
# Re-create the train/test split from the full feature frame so this section
# starts again from every column (same test fraction and seed as before)
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [19]:
# Univariate selection: score each feature against the target and keep the top k
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif      # use this for classification tasks
from sklearn.feature_selection import f_regression   # use this for regression tasks

kbest = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)
kbest_mask = kbest.get_support()  # boolean flag per column: True = selected

print("Feature selection", kbest_mask)
print("Feature scores", kbest.scores_)
print("Selected features:", list(X.columns[kbest_mask]))
print("Removed features:", list(X.columns[~kbest_mask]))

Feature selection [False False False False False  True False False False False  True  True]
Feature scores [ 71.7505991   45.3094539  102.27204507  12.96777535  75.75687056
 442.09927992  46.82483075  22.32450311  54.40234107  94.37168391
 109.47144894 384.84276122]
Selected features: ['RM', 'PTRATIO', 'LSTAT']
Removed features: ['CRIM', ' ZN ', 'INDUS ', 'CHAS', 'NOX', 'AGE', 'DIS', 'RAD', 'TAX']


In [20]:
# Apply the fitted k-best selector to both splits (keeps only the selected columns)
X_train, X_test = kbest.transform(X_train), kbest.transform(X_test)

In [21]:
# Confirm the column count of each split after the transform
print(X_train.shape, X_test.shape, sep="\n")

(339, 3)
(167, 3)


## Remove highly correlated features

In [22]:
# Re-create the train/test split from the full feature frame so this section
# starts again from every column (same test fraction and seed as before)
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [23]:
# Pairwise correlation matrix of the training-split input features
X_train.corr()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
CRIM,1.0,-0.206492,0.424996,-0.072714,0.474588,-0.259427,0.370112,-0.396235,0.690909,0.633815,0.306822,0.544767
ZN,-0.206492,1.0,-0.505246,-0.002166,-0.504834,0.314499,-0.555141,0.653629,-0.300758,-0.295111,-0.40683,-0.395578
INDUS,0.424996,-0.505246,1.0,0.021036,0.747521,-0.391447,0.632544,-0.690915,0.563693,0.69625,0.385761,0.596453
CHAS,-0.072714,-0.002166,0.021036,1.0,0.02337,0.090445,0.058438,-0.06721,-0.067436,-0.102532,-0.151995,-0.067241
NOX,0.474588,-0.504834,0.747521,0.02337,1.0,-0.334604,0.718207,-0.762189,0.593436,0.648528,0.16622,0.591311
RM,-0.259427,0.314499,-0.391447,0.090445,-0.334604,1.0,-0.220623,0.219946,-0.199327,-0.286785,-0.340969,-0.640725
AGE,0.370112,-0.555141,0.632544,0.058438,0.718207,-0.220623,1.0,-0.746673,0.435364,0.481458,0.22825,0.595197
DIS,-0.396235,0.653629,-0.690915,-0.06721,-0.762189,0.219946,-0.746673,1.0,-0.479377,-0.504713,-0.253426,-0.485031
RAD,0.690909,-0.300758,0.563693,-0.067436,0.593436,-0.199327,0.435364,-0.479377,1.0,0.89745,0.450225,0.493573
TAX,0.633815,-0.295111,0.69625,-0.102532,0.648528,-0.286785,0.481458,-0.504713,0.89745,1.0,0.446179,0.539611


In [24]:
# Function to list features that are strongly correlated with another feature
def correlatedFeatures(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    Each pair of columns in ``dataset.corr()`` is examined once (the part of
    the matrix below the diagonal). When the absolute correlation of a pair is
    strictly greater than ``threshold``, the column that comes *later* in the
    frame is reported; the earlier column is not reported on account of that
    pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared via ``dataset.corr()``.
    threshold : float
        Cut-off applied to the absolute correlation value.

    Returns
    -------
    set
        Labels of the flagged columns; empty when no pair exceeds the threshold.
    """
    strength = dataset.corr().abs()
    labels = strength.columns
    flagged = set()
    for row in range(len(labels)):
        # Only look left of the diagonal so every pair is visited once; any()
        # stops scanning as soon as one strong partner has been found.
        if any(strength.iloc[row, col] > threshold for col in range(row)):
            flagged.add(labels[row])
    return flagged

In [25]:
# Get a set of correlated features, based on threshold correlation of 0.85
cf = correlatedFeatures(X_train, 0.85)
cf

{'TAX'}

In [26]:
# Drop the flagged columns from both splits
X_train = X_train.drop(columns=list(cf))
X_test = X_test.drop(columns=list(cf))

In [27]:
# Confirm the column count of each split after dropping the flagged columns
print(X_train.shape, X_test.shape, sep="\n")

(339, 11)
(167, 11)


## Select features using Recursive Feature Elimination (RFE)

In [28]:
# Re-create the train/test split from the full feature frame so this section
# starts again from every column (same test fraction and seed as before)
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [29]:
# Feature selection using Recursive Feature Elimination
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Create the estimator that RFE fits while eliminating features
model = LinearRegression()

# Select the best 3 features according to RFE.
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept it as a keyword-only argument and reject the positional form.
rfe = RFE(model, n_features_to_select=3)
rfe.fit(X_train, y_train)

print("Feature selection", rfe.support_)
print("Feature ranking", rfe.ranking_)
print("Selected features:", list(X.columns[rfe.support_]))

Feature selection [False False False False False  True False False False False  True  True]
Feature ranking [ 4  7 10  8  3  1  9  2  6  5  1  1]
Selected features: ['RM', 'PTRATIO', 'LSTAT']


In [30]:
# Transform (remove features not selected)
X_train = rfe.transform(X_train)
X_test = rfe.transform(X_test)

In [31]:
# Confirm the column count of each split after the transform
print(X_train.shape, X_test.shape, sep="\n")

(339, 3)
(167, 3)
