# Filter Methods

In [1]:
from typing import Sequence, Union, Optional

import numpy as np
import pandas as pd

# For auto-formatting of code
%load_ext lab_black

## Constant Features

* These are features that have just one unique value/label for all the observations of the dataset. 
* These features provide no valuable information, i.e., they have no predictive power.
* Constant feature identification and removal is a simple first step toward feature selection and more easily interpretable machine learning models.

<br>

* **Numeric:** The variance can be used to detect constant features. A constant feature has **zero** variance.
* **Categorical:** The unique labels can be used to detect constant features. If the number of unique labels is **one**, then the feature is constant.

In [2]:
# Toy dataset with three columns:
#   a1 - almost always "A" (three rows are overwritten with "B" below)
#   a2 - all zeros, i.e. a strictly constant feature
#   a3 - random integers, i.e. a feature that actually varies
SEED = 42  # fixed seed so the random column is identical on every Restart & Run All
rng = np.random.default_rng(SEED)

lst = ["A", "A", "A", "A", "A", "A", "A", "A", "A", "A"]
size = 100
# Derive the row count from `lst` so all columns stay the same length
# even if `lst` or `size` is edited.
n_rows = len(lst) * size

data = {
    "a1": lst * size,
    "a2": np.zeros(shape=n_rows),
    "a3": rng.integers(1, 10, size=n_rows),  # upper bound is exclusive: values 1..9
}

df = pd.DataFrame(data)
# Replace three entries of the first column ("a1") with "B" so that it is
# dominated by "A" without being strictly constant.
df.iloc[[7, 28, 85], 0] = ["B", "B", "B"]
df.head()

Unnamed: 0,a1,a2,a3
0,A,0.0,9
1,A,0.0,5
2,A,0.0,5
3,A,0.0,2
4,A,0.0,2


In [3]:
# A feature is constant when it holds exactly one distinct value.
# `df.nunique()` counts the distinct values of every column in one call.
constant_vars = df.columns[df.nunique() == 1].tolist()

constant_vars

['a2']

## Quasi-constant

* Quasi-constant features are those that have the same value for the vast majority of the observations in the dataset. 
* In general, these characteristics provide little, if any, information that a machine learning model can use to predict a target.

```text
e.g. A feature that has 99% of a particular label.
```

In [4]:
def check_constant_variables(*, data: pd.DataFrame, thresh: float = 0.98) -> dict:
    """Detect constant/quasi-constant variables in a DataFrame.

    For every column, the relative frequency of each label is computed with
    ``value_counts(normalize=True)``. Every label whose frequency is greater
    than or equal to ``thresh`` is reported.

    Parameters
    ----------
    data : pd.DataFrame
        The frame whose columns are inspected (keyword-only).
    thresh : float, default 0.98
        Minimum relative frequency a label must reach to be reported.

    Returns
    -------
    dict
        Maps ``"<column>(<label>)"`` to that label's relative frequency.
        Empty when no label reaches ``thresh``.
    """
    return {
        f"{column}({label})": share
        for column in data.columns
        for label, share in data[column].value_counts(normalize=True).to_dict().items()
        if share >= thresh
    }

In [5]:
# Report every feature/label pair whose relative frequency is at least 0.98.
result = check_constant_variables(data=df, thresh=0.98)
result

{'a1(A)': 0.997, 'a2(0.0)': 1.0}

### Using Feature-Engine

```python
from feature_engine.selection import DropConstantFeatures
sel = DropConstantFeatures(tol=1, variables=None, missing_values='raise')

sel.fit(X)
```

In [6]:
from feature_engine.selection import DropConstantFeatures

# Drop Constant Features
# NOTE(review): tol=1 is presumably the "strictly constant only" setting and
# missing_values="raise" presumably errors on NaNs — confirm against the
# feature-engine documentation.
feat_selector = DropConstantFeatures(tol=1, variables=None, missing_values="raise")

# Fit on df; the features to drop are then available on the selector.
feat_selector.fit(df)

In [7]:
# Inspect the fitted selector's attributes and methods (e.g. features_to_drop_).
dir(feat_selector)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_new_feature_names',
 '_check_feature_names',
 '_check_n_features',
 '_check_variable_number',
 '_confirm_variables',
 '_confirm_variables_docstring',
 '_get_feature_names_in',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_remove_feature_names',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_validate_data',
 '_validate_params',
 'confirm_variables',
 'feature_names_in_',
 'features_to_drop_',
 'fit',
 'fit_transform',
 'get_feature_names_out',
 'get_params',
 'get_support',
 'missing_values',
 'n_features_in_',
 'set_output',
 

In [10]:
# Columns the fitted selector identified for removal.
feats_to_drop = feat_selector.features_to_drop_
print(f"Dropped features: {feats_to_drop}\n")

Dropped features: ['a2']



In [11]:
# Apply the fitted selector to df and preview the frame it returns.
df_transformed = feat_selector.transform(df)
df_transformed.head()

Unnamed: 0,a1,a3
0,A,9
1,A,5
2,A,5
3,A,2
4,A,2


In [13]:
from feature_engine.selection import DropConstantFeatures

# Drop Quasi-Constant Features
# Same 0.98 cut-off that was passed to check_constant_variables earlier.
THRESH = 0.98
# NOTE: this rebinds feat_selector (previously the tol=1 selector).
feat_selector = DropConstantFeatures(tol=THRESH, variables=None, missing_values="raise")

# Fit and transform in one step; this also rebinds df_transformed.
df_transformed = feat_selector.fit_transform(df)

feats_to_drop = feat_selector.features_to_drop_
print(f"Dropped features: {feats_to_drop}\n")

df_transformed.head()

Dropped features: ['a1', 'a2']



Unnamed: 0,a3
0,9
1,5
2,5
3,2
4,2


## Dropping Correlated Features

* Correlation Feature Selection evaluates feature subsets based on the following hypothesis: 
  * Good feature subsets contain features that are highly correlated with the target but are uncorrelated with one another.

There are 2 approaches used in this notebook:
1. The first is a brute force function that finds correlated features without any additional information.
2. The second procedure identifies groups of correlated features, which we can then investigate to determine which ones to keep and which to discard.

In [None]:
from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
    SmartCorrelatedSelection,
)