# FEATURE SELECTION

***

# FILTER METHODS

In [1]:
#libs

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold


In [2]:
from feature_engine.selection import DropConstantFeatures


AttributeError: module 'pandas' has no attribute 'DataFrame'

In [3]:
conda uninstall feature_engine

Collecting package metadata (repodata.json): done
Solving environment: done


  current version: 4.10.1
  latest version: 4.10.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /Users/daianeklein/opt/anaconda3/envs/train_in_data

  removed specs:
    - feature_engine


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ipython-7.26.0             |   py39h01d92e1_0         996 KB
    jedi-0.18.0                |   py39hecd8cb5_1         909 KB
    matplotlib-inline-0.1.2    |     pyhd3eb1b0_2          12 KB
    notebook-6.4.1             |   py39hecd8cb5_0         4.1 MB
    ------------------------------------------------------------
                                           Total:         6.0 MB

The following NEW packages will be INSTALLED:

  matplotlib-inline  pkgs/main/noarch::matplotlib-inline-0.1.2-pyhd3eb1

## REMOVING CONSTANT FEATURES

In [2]:
# Load the dataset from the notebook's working directory and preview the first rows.
df1 = pd.read_csv('dataset_1.csv')
df1.head()

AttributeError: module 'pandas' has no attribute 'read_csv'

In [7]:
# (rows, columns) of the full dataset, target column included
df1.shape

NameError: name 'df1' is not defined

In all feature selection procedures, it is good practice to select the features by examining only the training set. This avoids overfitting, because no information from the test set influences which features are kept.

In [8]:

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='target'),  # predictors only
    df1['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

NameError: name 'df1' is not defined

## Using VarianceThreshold from Scikit-learn

The `VarianceThreshold` class from scikit-learn provides a simple baseline approach to feature selection. It removes all features whose variance doesn't meet a certain threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [None]:
# threshold = 0 targets constant features only; the selector is fitted on the
# training set alone so the test set does not influence the selection.
sel = VarianceThreshold(threshold = 0)
sel.fit(X_train) # fit finds the features with zero variance

If we sum over `get_support()`, we get the number of features that are not constant.

In [None]:
# get_support() is a boolean mask over the columns; its sum counts the kept features
sel.get_support().sum()

In [None]:
# number of constant features: the columns the selector does not keep
(~sel.get_support()).sum()

In [None]:
# names of the constant columns, i.e. those the selector rejects
rejected_mask = np.logical_not(sel.get_support())
constant = X_train.columns[rejected_mask]
constant

In [None]:
# sanity check: list the distinct values found in each constant column
for col_name in constant:
    distinct_values = X_train[col_name].unique()
    print(col_name, distinct_values)

In [None]:
# names of the features the selector keeps (the non-constant columns);
# stored so the column labels can be restored after transform()
non_const = X_train.columns[sel.get_support()]
non_const

In [None]:
# Apply the fitted selector to both sets. The results are rebound to
# X_train / X_test, replacing the original DataFrames.
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

In [None]:
# Rebuild DataFrames from the transformed data so the kept column names are
# restored. X_test is converted too, so both sets stay the same type, and the
# preview is seeded so the displayed rows are the same on every run.
X_train = pd.DataFrame(X_train, columns=non_const)
X_test = pd.DataFrame(X_test, columns=non_const)
X_train.sample(3, random_state=0)


## STD from PANDAS

In [None]:
# fresh 70/30 train/test split of the full dataset (fixed seed)
predictors = df1.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    predictors, df1['target'], test_size=0.3, random_state=0
)

X_train.shape, X_test.shape

In [None]:
# a column whose standard deviation on the training set is exactly 0 is constant
const_features = []
for col in X_train.columns:
    if X_train[col].std() == 0:
        const_features.append(col)

# how many constant columns were found
len(const_features)


In [None]:
# remove the constant columns from both the training and the test set
X_train = X_train.drop(columns=const_features)
X_test = X_test.drop(columns=const_features)

print(X_train.shape, X_test.shape)

## MANUAL CODE - CATEGORICAL VARIABLES

In [None]:
# start again from the full dataset: 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns=['target']), df1['target'],
    test_size=0.3, random_state=0,
)

X_train.shape, X_test.shape

In [None]:
# cast every training column to the generic object dtype so the features
# can be handled as if they were categorical
X_train = X_train.astype(object)
X_train.dtypes

In [None]:
# a column showing exactly one distinct value in the training set is constant
distinct_per_column = X_train.nunique()
const_features = distinct_per_column[distinct_per_column == 1].index.tolist()

len(const_features)

Note: by default, `nunique()` ignores missing values, so if your variables have missing values, pass `dropna=False` to `nunique()`.

In [None]:
# remove the constant columns from both sets
X_train = X_train.drop(columns=const_features)
X_test = X_test.drop(columns=const_features)

X_train.shape, X_test.shape

# Quasi-constant features

In [None]:
# (rows, columns) of the full dataset before any feature is removed
df1.shape

In [None]:
# 70/30 train/test split with a fixed seed
split_kwargs = dict(test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='target'), df1['target'], **split_kwargs
)

X_train.shape, X_test.shape

In [None]:
# first, remove constant features: columns with zero standard deviation
# on the training set
const_features = []
for name in X_train.columns:
    if X_train[name].std() == 0:
        const_features.append(name)

X_train = X_train.drop(columns=const_features)
X_test = X_test.drop(columns=const_features)

X_train.shape, X_test.shape

## Remove quasi-constant features

By default, VarianceThreshold from sklearn removes all zero-variance features.

Here, we will change the default threshold to remove quasi-constant features, i.e., features with low variance.

In [None]:
# raise the threshold from 0 to 0.01 so low-variance (quasi-constant)
# features are flagged too; fitted on the training set only
sel = VarianceThreshold(threshold = 0.01)
sel.fit(X_train)

In [None]:
# number of features the selector keeps (not quasi-constant at threshold 0.01)
sum(sel.get_support())

In [None]:
# features the selector rejects: quasi-constant at threshold 0.01
quasi_const = X_train.columns[~sel.get_support()]
len(quasi_const)

In [None]:
# names of the quasi-constant columns
quasi_const

In [None]:
# example: share of training rows taken by each distinct value of var_1
X_train['var_1'].value_counts().div(len(X_train))


In [None]:
# names of the features the selector keeps; saved before transform()
# so the column labels can be restored afterwards
feat_names = X_train.columns[sel.get_support()]

In [None]:
# remove the quasi-constant features from both sets; the results are
# rebound to X_train / X_test, replacing the original DataFrames
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

By removing constant and almost constant features, we reduced the feature space from 300 to 215. This means that 85 features were removed from this dataset.

In [None]:
# transform the arrays back into DataFrames, restoring the kept column names

X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)

X_test.head()


### Coding it ourselves

This method, as opposed to the VarianceThreshold, can be used for both **numerical and categorical** variables.

In [None]:
# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='target'), df1['target'], test_size=0.3, random_state=0
)

# constant features are the columns with zero standard deviation
# on the training set; drop them from both sets
constant_features = []
for name in X_train.columns:
    if X_train[name].std() == 0:
        constant_features.append(name)

X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

In [None]:
# Collect the features whose most frequent value covers more than 99.8%
# of the training rows.
quasi_constant_feat = []
n_rows = len(X_train)

for feature in X_train.columns:

    # Share of rows taken by the predominant (most frequent) value.
    # value_counts() skips missing values, so an all-missing column gives an
    # empty result; max() of an empty Series is NaN and NaN > 0.998 is False,
    # so such a column is left alone instead of raising IndexError.
    predominant = (X_train[feature].value_counts() / n_rows).max()

    if predominant > 0.998:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

In [None]:
# names of the features flagged as quasi-constant
quasi_constant_feat

In [None]:
# check one column: share of training rows taken by each distinct value of var_3
X_train['var_3'].value_counts().div(len(X_train))

In [None]:
# remove the quasi-constant columns from both sets
X_train = X_train.drop(columns=quasi_constant_feat)
X_test = X_test.drop(columns=quasi_constant_feat)

X_train.shape, X_test.shape

# DUPLICATED FEATURES

In [None]:
# columns of the full dataset that contain at least one missing value
has_missing = df1.isnull().any().to_numpy()
df1.columns[has_missing].tolist()

In [None]:
# separate the dataset into train (70%) and test (30%) with a fixed seed
predictors = df1.drop(columns='target')  # everything except the target
labels = df1['target']                   # just the target

X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.3, random_state=0
)

X_train.shape, X_test.shape

In [None]:
# Remove constant and quasi-constant features first: collect every feature
# whose most frequent value covers more than 99.8% of the training rows.
quasi_constant_feat = []
n_rows = len(X_train)

for feature in X_train.columns:

    # Share of rows taken by the predominant (most frequent) value.
    # value_counts() skips missing values, so an all-missing column gives an
    # empty result; max() of an empty Series is NaN and NaN > 0.998 is False,
    # so such a column is left alone instead of raising IndexError.
    predominant = (X_train[feature].value_counts() / n_rows).max()

    # do more than 99.8% of the observations show the same value?
    if predominant > 0.998:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

In [None]:
# remove the constant / quasi-constant columns from both sets
X_train = X_train.drop(columns=quasi_constant_feat)
X_test = X_test.drop(columns=quasi_constant_feat)

X_train.shape, X_test.shape

In [None]:
# Look for duplicated features in the training set.

# maps each retained feature to the list of later columns identical to it
duplicated_feat_pairs = {}

# every feature already found to be a copy of an earlier one
_duplicated_feat = []

for i, feat_1 in enumerate(X_train.columns):

    # progress marker: print the position every 10 columns
    if i % 10 == 0:
        print(i)

    # a feature already flagged as a copy of an earlier column is skipped
    if feat_1 in _duplicated_feat:
        continue

    # start with an empty list of duplicates for this feature
    duplicated_feat_pairs[feat_1] = []

    # compare against the columns to its right only
    for feat_2 in X_train.columns[i + 1:]:

        # equals() is True when both columns hold the same values
        if X_train[feat_1].equals(X_train[feat_2]):

            # record the pair, and remember feat_2 so it is skipped later
            duplicated_feat_pairs[feat_1].append(feat_2)
            _duplicated_feat.append(feat_2)
                
                

In [None]:
# number of features found to be a duplicate of an earlier column
len(_duplicated_feat)

In [None]:
# names of the duplicated features
_duplicated_feat

In [None]:
# retained feature -> list of its duplicates (empty list when it has none)
duplicated_feat_pairs

In [None]:
# print each feature that has at least one duplicate, followed by its duplicates
for feat, dups in duplicated_feat_pairs.items():

    # features without duplicates carry an empty list and are skipped
    if len(dups) == 0:
        continue

    print(feat, dups)
    print()

In [None]:
# show var_37 next to var_148 on the first 10 rows where var_37 is non-zero
nonzero_rows = X_train['var_37'] != 0
X_train.loc[nonzero_rows, ['var_37', 'var_148']].head(10)

In [None]:
# The dictionary keys are the features that were never flagged as a duplicate
# of an earlier column, so selecting them keeps one column per group of
# identical features.
unique_feats = list(duplicated_feat_pairs)
X_train = X_train[unique_feats]
X_test = X_test[unique_feats]

X_train.shape, X_test.shape

# Constant and Quasi-constant features with Feature-engine