# Filter Methods

In [1]:
from typing import Sequence, Union, Optional

import numpy as np
import pandas as pd

# For auto-formatting of code
%load_ext lab_black

## Constant Features

* These are features that have just one unique value/label for all the observations of the dataset. 
* These features provide no valuable information, i.e., they have no predictive power.
* Constant feature identification and removal is a simple first step toward feature selection and more easily interpretable machine learning models.

<br>

* **Numeric:** The variance can be used to detect constant features. A constant feature has **zero** variance.
* **Categorical:** The number of unique labels can be used to detect constant features. If the number of unique labels is **one**, then it's a constant feature.

In [35]:
# Seed a local generator so the random column "a3" (and every output derived
# from it) is identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

lst = ["A"] * 10
size = 100

data = {
    "a1": lst * size,  # one repeated label; three rows are changed to "B" below
    "a2": np.zeros(shape=(size * 10)),  # numeric column holding only 0.0
    "a3": rng.integers(1, 10, size=size * 10),  # integers from 1 to 9 (upper bound excluded)
}

df = pd.DataFrame(data)
# Overwrite three rows of the first column ("a1") so it is almost, but not
# exactly, constant.
df.iloc[[7, 28, 85], 0] = "B"
df.head()

Unnamed: 0,a1,a2,a3
0,A,0.0,9
1,A,0.0,8
2,A,0.0,6
3,A,0.0,6
4,A,0.0,2


In [36]:
# A feature is constant when every row holds the same value. Missing values are
# counted as a value of their own (dropna=False): plain nunique() ignores NaN,
# so it returns 0 for an all-NaN column (which would be missed by `== 1`) and
# returns 1 for a column that mixes a single label with NaN.
# `<= 1` additionally covers a frame with no rows, where the count is 0.
constant_vars = [
    feat for feat in df.columns if df[feat].nunique(dropna=False) <= 1
]

constant_vars

['a2']

## Quasi-constant

* Quasi-constant features are those that have the same value for the vast majority of the observations in the dataset. 
* In general, these characteristics provide little, if any, information that a machine learning model can use to predict a target.

```text
e.g. A feature that has 99% of a particular label.
```

In [37]:
def check_constant_variables(*, data: pd.DataFrame, thresh: float = 0.98) -> dict:
    """Detect constant and quasi-constant columns of a DataFrame.

    For every column, the share of rows taken by each distinct value is
    computed. Missing values are treated as a value of their own, so the
    shares are fractions of *all* rows and a column that is almost entirely
    missing is reported as well.

    Parameters
    ----------
    data : pd.DataFrame
        The frame whose columns are inspected.
    thresh : float, default 0.98
        Minimum share of rows (between 0 and 1) a single value must cover
        for the column to be reported.

    Returns
    -------
    dict
        Maps ``"<column>(<value>)"`` to that value's share of the rows, for
        every (column, value) pair whose share is at least ``thresh``.
        A constant column shows up with a share of 1.0. The dict is empty
        when nothing reaches the threshold.
    """
    quasi_constant_vars = {}

    for feat in data.columns:
        # dropna=False keeps NaN in the tally; with the default, NaN rows are
        # dropped before normalising, which inflates the share of the
        # remaining values and hides mostly-missing columns.
        shares = data[feat].value_counts(normalize=True, dropna=False)
        # Filter with a boolean mask first so only the qualifying values are
        # visited in Python, not every distinct value of the column.
        for label, share in shares[shares >= thresh].items():
            quasi_constant_vars[f"{feat}({label})"] = share
    return quasi_constant_vars

In [38]:
# Report every column in which a single value's normalized frequency is at
# least 0.98.
result = check_constant_variables(
    thresh=0.98,
    data=df,
)
result

{'a1(A)': 0.997, 'a2(0.0)': 1.0}

In [39]:
# Relative frequency of each value in "a3", for comparison with the threshold
# used above.
df.loc[:, "a3"].value_counts(normalize=True)

2    0.140
1    0.122
6    0.120
4    0.110
7    0.108
8    0.105
5    0.099
9    0.098
3    0.098
Name: a3, dtype: float64