# Featurefilter Examples

This notebook shows a number of concise examples for common use cases.

### Remove columns with too many NA values

In [9]:
import numpy as np
import pandas as pd

from featurefilter import NaFilter

# Two of the three values in 'A' are missing (ratio 0.6667, above the 0.5
# limit); only one of the three values in 'B' is missing.
df = pd.DataFrame({'A': [0, np.nan, np.nan],
                   'B': [0, 0, np.nan]})

# The filter determines the columns to drop while fitting, so no fitted
# state is assigned by hand before calling fit_transform.
na_filter = NaFilter(max_na_ratio=0.5)
na_filter.fit_transform(df)

The NA ratio of column 'A' (0.6667) is above the threshold of 0.5000


Unnamed: 0,B
0,0.0
1,0.0
2,


### Remove columns with too low or high variance

In [10]:
import pandas as pd

from featurefilter import VarianceFilter

# Column 'A' takes two distinct values, whereas column 'B' is constant.
df = pd.DataFrame(data={
    'A': [0., 1.],
    'B': [0., 0.],
})

# Filter built with its default settings, then fitted and applied in one call.
variance_filter = VarianceFilter()
variance_filter.fit_transform(df)

The variance of column 'B' (0.0000) is below the threshold of 0.0000


Unnamed: 0,A
0,0.0
1,1.0


### Remove columns with too high correlation to the target variable

In [11]:
import pandas as pd

from featurefilter import TargetCorrelationFilter

# 'B' is identical to the target 'Y'; 'A' is constant.
df = pd.DataFrame(data={
    'A': [0, 0],
    'B': [0, 1],
    'Y': [0, 1],
})

# The target column is named explicitly; all other settings keep their defaults.
target_correlation_filter = TargetCorrelationFilter(
    target_column='Y',
)
target_correlation_filter.fit_transform(df)

The absolute correlation of column 'B' (1.0000) to the target column 'Y' is above the threshold of 0.9500


Unnamed: 0,A,Y
0,0,0
1,0,1


### Remove columns using generalized linear models (GLMs)

In [12]:
import pandas as pd

from featurefilter import GLMFilter

# 'A' matches the target 'Y' row for row; 'B' alternates independently of it.
df = pd.DataFrame(data={
    'A': [0, 0, 1, 1],
    'B': [0, 1, 0, 1],
    'Y': [0, 0, 1, 1],
})

# Ask the filter for a single top feature.
glm_filter = GLMFilter(
    target_column='Y',
    top_features=1,
)
glm_filter.fit_transform(df)

Relative feature importances:
   A 1.0000
   B 0.0000


Unnamed: 0,A,Y
0,0,0
1,0,0
2,1,1
3,1,1


### Remove columns using tree-based models

In [13]:
import pandas as pd

from featurefilter import TreeBasedFilter

# Same feature layout as a binary problem, but the target 'Y' holds string labels.
df = pd.DataFrame(data={
    'A': [0, 0, 1, 1],
    'B': [0, 1, 0, 1],
    'Y': ['a', 'a', 'b', 'b'],
})

# The target is flagged as categorical and a single top feature is requested.
tree_based_filter = TreeBasedFilter(
    target_column='Y',
    categorical_target=True,
    top_features=1,
)
tree_based_filter.fit_transform(df)

A 1.0
B 0.0
['B' 'A']


Unnamed: 0,A,Y
0,0,a
1,0,a
2,1,b
3,1,b


### Remove columns using multiple filters combined with scikit-learn's Pipeline API

In [14]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from featurefilter import NaFilter, VarianceFilter

# 'A' is mostly missing, 'B' is constant, and 'C' has one missing value.
df = pd.DataFrame(data={
    'A': [0, np.nan, np.nan],
    'B': [0, 0, 0],
    'C': [0, np.nan, 1],
})

# The filters are chained as named pipeline steps: NA filter first, then variance filter.
steps = [
    ('na_filter', NaFilter(max_na_ratio=0.5)),
    ('variance_filter', VarianceFilter()),
]
pipeline = Pipeline(steps)

pipeline.fit_transform(df)

The NA ratio of column 'A' (0.6667) is above the threshold of 0.5000
The variance of column 'B' (0.0000) is below the threshold of 0.0000


Unnamed: 0,C
0,0.0
1,
2,1.0


### Remove columns using existing selectors provided by scikit-learn

In [15]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

from featurefilter import SklearnWrapper

# 'A' matches the target 'Y' row for row; 'B' alternates independently of it.
df = pd.DataFrame(data={
    'A': [0, 0, 1, 1],
    'B': [0, 1, 0, 1],
    'Y': [0, 0, 1, 1],
})

# A scikit-learn selector (RFECV around a linear regression) is wrapped so
# that it can be called on the DataFrame with a named target column.
estimator = LinearRegression()
model = RFECV(estimator, min_features_to_select=1, cv=3)
selector = SklearnWrapper(model, target_column='Y')
selector.fit_transform(df)

Unnamed: 0,A,Y
0,0,0
1,0,0
2,1,1
3,1,1
