In [14]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import (
    chi2,
    f_classif,
    r_regression,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
    SelectKBest,
    SelectPercentile,
    RFE,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Create dataset

In [2]:
n_features = 20
n_informative = 10
n_repeated = 2
n_redundant = n_features - n_informative - n_repeated

x, y = make_classification(
    n_samples=100,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_repeated,
    shuffle=False,
)

cols = (
    [f"informative_{i+1}" for i in range(n_informative)]
    + [f"redundant_{i+1}" for i in range(n_redundant)]
    + [f"repeated_{i+1}" for i in range(n_repeated)]
)
df = pd.DataFrame(x, columns=cols)
np.random.shuffle(cols)
df = df.loc[:, cols]

df.head()

Unnamed: 0,informative_3,redundant_1,redundant_6,repeated_2,redundant_2,redundant_4,informative_6,informative_4,redundant_8,informative_9,informative_7,informative_1,informative_5,redundant_5,informative_10,repeated_1,redundant_7,redundant_3,informative_2,informative_8
0,2.184556,5.836486,-5.694648,1.055499,5.368003,-0.225743,1.055499,-1.824451,7.133486,2.013175,-0.213076,2.217145,-0.730753,-4.406577,-3.058126,7.133486,-1.382256,-2.887262,4.96187,2.232432
1,-1.316643,-1.607137,-3.469298,0.440428,2.24001,1.31268,0.440428,0.292198,4.102107,2.05392,-1.044936,-0.925077,1.584566,3.278208,-1.567449,4.102107,-0.861812,-0.389031,0.600676,0.159714
2,-1.794947,3.020817,-2.783226,3.103997,3.596217,-3.089403,3.103997,0.625332,3.48744,-0.797034,-2.897013,0.689951,2.428056,2.905673,-0.089383,3.48744,-1.158112,-1.255294,3.495078,1.574814
3,-3.029316,-1.74723,-1.173629,2.632483,-1.097512,-1.809024,2.632483,1.398997,2.309135,-0.459959,-2.157722,-1.093022,2.066988,7.718312,-1.340471,2.309135,0.978617,-0.804663,-0.075781,-0.076565
4,3.007401,0.100642,-6.822637,1.326029,-2.345675,0.96723,1.326029,-4.448662,7.809589,3.186244,2.644307,1.041832,-2.793339,-2.012615,-4.443933,7.809589,1.441131,-6.828177,1.124956,1.574575


In [19]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(df)
df = pd.DataFrame(x_scaled, columns=cols)
df.head()

Unnamed: 0,informative_3,redundant_1,redundant_6,repeated_2,redundant_2,redundant_4,informative_6,informative_4,redundant_8,informative_9,informative_7,informative_1,informative_5,redundant_5,informative_10,repeated_1,redundant_7,redundant_3,informative_2,informative_8
0,0.883692,2.094583,-1.261774,-0.062159,1.796188,0.067701,-0.062159,-0.824608,1.559179,0.879267,0.106003,1.9436,-0.771073,-1.407722,-1.542605,1.559179,-0.289995,-0.320333,2.230005,1.118985
1,-0.698372,-0.221825,-0.476071,-0.482217,0.806395,0.502991,-0.482217,0.270826,0.707539,0.898915,-0.303151,-0.304932,0.47281,0.387232,-0.727976,0.707539,-0.08081,0.344131,0.14444,-0.23359
2,-0.9145,1.218365,-0.23384,1.336846,1.23554,-0.742559,1.336846,0.443233,0.534853,-0.475918,-1.214105,0.85076,0.925967,0.300218,0.07976,0.534853,-0.199903,0.113728,1.528571,0.689848
3,-1.472266,-0.265421,0.33446,1.014829,-0.249699,-0.380282,1.014829,0.84363,0.203818,-0.313368,-0.850481,-0.425111,0.731987,1.424318,-0.603937,0.203818,0.658923,0.233584,-0.179048,-0.387777
4,1.255505,0.309625,-1.660033,0.122597,-0.644656,0.405247,0.122597,-2.182722,1.749125,1.444964,1.511421,1.102562,-1.879177,-0.848558,-2.299925,1.749125,0.844823,-1.368514,0.395156,0.689693


In [20]:
df.describe()

Unnamed: 0,informative_3,redundant_1,redundant_6,repeated_2,redundant_2,redundant_4,informative_6,informative_4,redundant_8,informative_9,informative_7,informative_1,informative_5,redundant_5,informative_10,repeated_1,redundant_7,redundant_3,informative_2,informative_8
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,5.329071000000001e-17,-7.438494000000001e-17,1.554312e-16,1.487699e-16,-8.604228e-18,1.9984010000000002e-17,1.487699e-16,-8.243406000000001e-17,-4.662937e-17,1.110223e-18,-5.245804e-17,1.298961e-16,-2.553513e-16,-1.976197e-16,1.204592e-16,-4.662937e-17,4.2188470000000006e-17,-9.103829000000001e-17,5.995204000000001e-17,3.996803e-17
std,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038,1.005038
min,-2.421711,-2.497186,-2.492033,-2.432241,-2.516299,-2.531558,-2.432241,-2.27809,-2.88424,-2.19339,-2.977465,-2.174941,-2.311191,-2.099967,-2.450175,-2.88424,-2.501674,-2.625991,-2.971284,-2.853512
25%,-0.5422872,-0.7940848,-0.6966137,-0.6243523,-0.6445424,-0.6776141,-0.6243523,-0.6612465,-0.7499369,-0.7346884,-0.7547424,-0.7659211,-0.7007722,-0.7350775,-0.6418795,-0.7499369,-0.7073832,-0.636069,-0.6054454,-0.6433846
50%,0.01953073,0.03539551,-0.1217024,0.03088508,-0.1377956,0.06744565,0.03088508,0.06604207,0.06825175,-0.1605225,0.04134477,-0.09065974,0.09645219,-0.08013583,-0.1111352,0.06825175,-0.03728509,0.04673755,-0.05315669,0.03410508
75%,0.6276025,0.5839506,0.690514,0.5697485,0.5158355,0.6858505,0.5697485,0.6530306,0.7673878,0.7590427,0.7517996,0.6440707,0.6689785,0.72079,0.5888214,0.7673878,0.7502902,0.5633161,0.6741944,0.7903057
max,3.024371,2.538559,2.688468,2.675185,2.753627,2.280617,2.675185,2.371951,1.919981,2.090653,2.538136,3.14334,3.079962,2.19806,2.598593,1.919981,2.249432,2.656604,2.30925,1.962273


# Univariate feature selection

In [21]:
class UnivariateFeatureSelection:

    def __init__(self, n_features, problem_type, scoring):
        """
        Custom univariate feature selection wrapper on different
        univariate feature selection models from sklearn.
        :param n_features: SelectPercentile if float, SelectKBest if int
        :param problem_type: classification or regression
        :param scoring: scoring function, string
        """

        # for a given problem type, there are only a few valid scoring methods
        if problem_type == "classification":
            valid_scoring = {
                "chi2": chi2,
                "f_classif": f_classif,
                "mutual_info_classif": mutual_info_classif,
            }
        elif problem_type == "regression":
            valid_scoring = {
                "r_regression": r_regression,
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }
        else:
            raise Exception(
                "Invalid problem type. Select regression or classification."
            )

        # raise exception if scoring is not valid
        # seach in valid_scoring keys
        if scoring not in valid_scoring.keys():
            raise Exception("Invalid scoring function.")

        # if n_features == int, SelectKBest
        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        # if n_features == float, SelectPercentile
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(
                valid_scoring[scoring], percentile=round(n_features * 100)
            )
        else:
            raise Exception("Invalid type of feature.")

        self.feature_names = None

    def fit(self, X, y):
        
        # Fit the feature selector
        self.selection.fit(X, y)
        
        # Check if X is a DataFrame for .columns to work
        if isinstance(X, pd.DataFrame):
            self.feature_names = X.columns[self.selection.get_support()]
        else:
            message = "X must be a pandas DF to extract feature names."
            raise Exception(message)
        
        # Save feature names
        self.feature_names = X.columns[self.selection.get_support()]

        return self

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        # we have to fit first to be able to obtain feature names
        self.fit(X, y)
        return self.transform(X)

    def get_feature_names(self):
        # if fit is not yet called, raise exception
        if self.feature_names is None:
            raise Exception("Must call fit method first.")
        return self.feature_names

In [22]:
scorings = ["f_classif", "mutual_info_classif"]
for scoring in scorings:
    f_selection = UnivariateFeatureSelection(n_features=10, problem_type="classification", scoring=scoring)
    f_selection.fit_transform(df, y)
    print(f"scoring {scoring}")
    print(f"selected features = {sorted(f_selection.get_feature_names())}")
    print("=" * 100)

scoring f_classif
selected features = ['informative_1', 'informative_10', 'informative_2', 'informative_3', 'informative_4', 'informative_8', 'redundant_1', 'redundant_2', 'redundant_4', 'redundant_5']
scoring mutual_info_classif
selected features = ['informative_10', 'informative_2', 'informative_5', 'informative_7', 'informative_8', 'redundant_1', 'redundant_2', 'redundant_5', 'redundant_8', 'repeated_1']


# Recursive feature elimination