In [1]:
from ffselect.subset import MinSubsetSelection

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

This is an example of using the `MinSubsetSelection` function to remove redundant features. It can be useful for pruning large sets of polynomial features, where simple feature subset selection algorithms fail because the number of features is extremely large. The `MinSubsetSelection` algorithm has $\mathcal{O}(n \log{n})$ complexity.

Warning: this algorithm can be applied only if removing a redundant feature $x_i$ never increases the loss, i.e.

$\text{loss}(f(x_1, \dots, x_{i-1}, x_{i+1}, \dots, x_n)) \le \text{loss}(f(x_1, \dots, x_{i-1}, x_i, x_{i+1}, \dots, x_n)) \quad \forall\, x_i \text{ redundant}$

The dataframe used below contains completely arbitrary values (they do not follow any particular distribution).

In [3]:
# Toy dataset: five feature columns ('A'..'E') plus the regression 'target'
df = pd.DataFrame(
    data=[
        [12, 32, 2, 45, 32, 12],
        [19, 2, 84, 12, 45, 21],
        [83, 12, 56, 45, 12, 45],
        [54, 234, 653, 213, 34, 657],
        [43, 45, 76, 12, 56, 21],
        [76, 23, 76, 34, 76, 87],
    ],
    columns=['A', 'B', 'C', 'D', 'E', 'target'],
)

In [4]:
# Show the raw dataset before any splitting or scaling
df

Unnamed: 0,A,B,C,D,E,target
0,12,32,2,45,32,12
1,19,2,84,12,45,21
2,83,12,56,45,12,45
3,54,234,653,213,34,657
4,43,45,76,12,56,21
5,76,23,76,34,76,87


In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
df_train_pre, df_test_pre = train_test_split(df, random_state=32, test_size=0.3)

# Extract the targets as plain arrays before the feature columns are rescaled
y_train = np.asarray(df_train_pre['target'])
y_test = np.asarray(df_test_pre['target'])

# Every column except the target is an input feature: ['A', 'B', 'C', 'D', 'E']
x_features = df_train_pre.columns.drop('target').values

# Standardize the features; the scaler is fitted on the train split only
# and then reused unchanged for the test split
norm = StandardScaler().fit(df_train_pre[x_features])
df_train_pre[x_features] = norm.transform(df_train_pre[x_features])
df_test_pre[x_features] = norm.transform(df_test_pre[x_features])

# Expand the scaled features with all degree-2 polynomial terms
# (the expansion is likewise fitted on the train split only)
poly = PolynomialFeatures(degree=2).fit(df_train_pre[x_features])
X_train = poly.transform(df_train_pre[x_features])
X_test = poly.transform(df_test_pre[x_features])
poly_features = poly.get_feature_names_out(x_features)

# Assemble the final frames: polynomial feature columns followed by the target
df_train = pd.DataFrame(X_train, columns=poly_features).assign(target=y_train)
df_test = pd.DataFrame(X_test, columns=poly_features).assign(target=y_test)

In [6]:
# Show the train split after scaling and polynomial expansion
df_train

Unnamed: 0,1,A,B,C,D,E,A^2,A B,A C,A D,...,B C,B D,B E,C^2,C D,C E,D^2,D E,E^2,target
0,1.0,-1.41695,-0.54618,-0.735452,-0.429,-0.096275,2.007746,0.773909,1.042098,0.607871,...,0.401689,0.234311,0.052583,0.54089,0.315509,0.070805,0.184041,0.041302,0.009269,12
1,1.0,1.37759,-0.770254,-0.531527,-0.429,-1.379936,1.897754,-1.061094,-0.732226,-0.590986,...,0.409411,0.330439,1.062901,0.282521,0.228025,0.733473,0.184041,0.591992,1.904222,45
2,1.0,-0.196799,-0.400532,-0.455999,-0.848466,1.444119,0.03873,0.078824,0.08974,0.166977,...,0.182642,0.339838,-0.578416,0.207935,0.3869,-0.658517,0.719895,-1.225286,2.085479,21
3,1.0,0.236158,1.716966,1.722978,1.706465,0.032092,0.055771,0.405476,0.406896,0.402996,...,2.958294,2.929942,0.0551,2.968654,2.940203,0.055293,2.912024,0.054763,0.00103,657


In [7]:
# Show the test split after scaling and polynomial expansion
df_test

Unnamed: 0,1,A,B,C,D,E,A^2,A B,A C,A D,...,B C,B D,B E,C^2,C D,C E,D^2,D E,E^2,target
0,1.0,1.102072,-0.647013,-0.455999,-0.568822,2.72778,1.214562,-0.713055,-0.502544,-0.626882,...,0.295037,0.368035,-1.764909,0.207935,0.259382,-1.243865,0.323558,-1.551621,7.440783,87
1,1.0,-1.141432,-0.882291,-0.425788,-0.848466,0.738105,1.302866,1.007074,0.486008,0.968466,...,0.375669,0.748594,-0.651223,0.181295,0.361267,-0.314276,0.719895,-0.626257,0.544799,21


In [8]:
def fit_regression(data: tuple[pd.DataFrame, pd.DataFrame], features: list[str], target: str) -> float:
    """
    Train a linear regression on the train split and score it on the test split
    :param data: Pair of dataframes, (df_train, df_test)
    :param features: Names of the columns used as model inputs
    :param target: Name of the column to predict
    :return: Mean absolute error of the predictions on the test split
    """
    train_part = data[0]
    test_part = data[1]

    # Select inputs and targets for both splits up front
    train_inputs, train_target = train_part[features], train_part[target]
    test_inputs, test_target = test_part[features], test_part[target]

    model = LinearRegression()
    model.fit(train_inputs, train_target)
    predictions = model.predict(test_inputs)

    return MAE(test_target, predictions)


In [9]:
# Candidate features: every column of the train frame except the target
columns = df_train.columns.drop('target').tolist()

In [10]:
# Run the selection with `fit_regression` as the scoring callback; the result
# is unpacked as (loss, feature list) -- presumably the best loss reached and
# the features that were kept
min_mae, features = MinSubsetSelection(
    data=(df_train, df_test),
    target='target',
    fit_function=fit_regression,
    features=columns,
)

[1/21]: feature: E^2, loss: 66.680162 -> 42.558103 (-24.122058)
[2/21]: feature: A^2, loss: 42.558103 -> 31.291051 (-11.267052)
[3/21]: feature: D^2, loss: 31.291051 -> 14.483662 (-16.807389)
[4/21]: feature: C, loss: 14.483662 -> 6.833877 (-7.649785)
[5/21]: feature: E, loss: 6.833877 -> 6.201756 (-0.632121)
[6/21]: feature: D E, loss: 6.201756 -> 3.203635 (-2.998121)
[7/21]: feature: B E, loss: 3.203635 -> 1.548838 (-1.654796)
[8/21]: feature: A D, loss: 1.548838 -> 1.474965 (-0.073873)
[9/21]: feature: 1, loss: 1.474965 -> 1.474965 (-0.000000)


In [11]:
# Features that were candidates but are absent from the selected subset
set(columns).difference(features)

{'1', 'A D', 'A^2', 'B E', 'C', 'D E', 'D^2', 'E', 'E^2'}