In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
# Load the California housing regression dataset as a dict-like bundle,
# then pull out the feature matrix, the target vector and the column names.
data = fetch_california_housing()
X, y = data["data"], data["target"]
col_names = data["feature_names"]


In [3]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [4]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [5]:
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [6]:
df = pd.DataFrame(X, columns=col_names)

In [7]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [10]:
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile


class UnivariateFeatureSelection:
    """
    Thin wrapper that builds a scikit-learn univariate selector
    (SelectKBest or SelectPercentile) from three simple arguments and
    forwards fit / transform / fit_transform to it.
    """

    def __init__(self, n_features, problem_type, scoring):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from scikit-learn.
        :param n_features: int -> SelectKBest with k=n_features;
            float -> SelectPercentile with percentile=round(n_features * 100),
            i.e. 0.5 keeps the top 50 percent
        :param problem_type: "classification" or "regression"
        :param scoring: scoring function name, string; must be one of the
            names offered for the given problem_type
        :raises ValueError: unknown problem_type or scoring name
        :raises TypeError: n_features is neither an int nor a float
        """
        # Validate up front: previously any unknown value (e.g. a typo such as
        # "classifcation") silently selected the regression scoring functions.
        if problem_type not in ("classification", "regression"):
            raise ValueError("Invalid problem type")

        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }

        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        score_func = valid_scoring[scoring]

        if isinstance(n_features, int):
            self.selection = SelectKBest(score_func, k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(
                score_func,
                percentile=self._fraction_to_percentile(n_features),
            )
        else:
            raise TypeError("Invalid type of feature")

    @staticmethod
    def _fraction_to_percentile(fraction):
        """
        Convert a fraction (0.5 -> 50) to a whole-number percentile.
        Rounds instead of truncating: 0.29 * 100 evaluates to
        28.999999999999996 in binary floating point, which int() would
        turn into 28 rather than the intended 29.
        """
        return round(fraction * 100)

    def fit(self, X, y):
        """
        Fit the wrapped selector on X and y.
        :return: whatever the wrapped selector's fit returns (the selector
            object itself, not this wrapper)
        """
        return self.selection.fit(X, y)

    def transform(self, X):
        """
        Reduce X to the columns chosen by the fitted selector.
        """
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """
        Fit the wrapped selector on X and y and return the reduced X.
        """
        return self.selection.fit_transform(X, y)

In [20]:
# A float n_features selects by percentile: 0.5 keeps the best-scoring
# half of the columns according to f_regression.
ufs = UnivariateFeatureSelection(
    n_features=0.5,
    problem_type="regression",
    scoring="f_regression",
)

In [21]:
ufs

<__main__.UnivariateFeatureSelection at 0x7fe1c84d5650>

In [22]:
ufs.fit(X, y)


SelectPercentile(percentile=50,
                 score_func=<function f_regression at 0x7fe1c8be8680>)

In [23]:
X_transformed = ufs.transform(X)

In [24]:
X_transformed

array([[ 8.3252    , 41.        ,  6.98412698, 37.88      ],
       [ 8.3014    , 21.        ,  6.23813708, 37.86      ],
       [ 7.2574    , 52.        ,  8.28813559, 37.85      ],
       ...,
       [ 1.7       , 17.        ,  5.20554273, 39.43      ],
       [ 1.8672    , 18.        ,  5.32951289, 39.43      ],
       [ 2.3886    , 16.        ,  5.25471698, 39.37      ]])

In [25]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

在贪心特征选择算法中，每次迭代都需要遍历所有特征，以便找到当前迭代中最能提高模型性能的特征。这是贪心算法的核心思想：在每一步选择当前最优的选项，而不考虑全局最优。

具体来说，以下是每次迭代遍历所有特征的原因和逻辑：

### 详细解释

1. **寻找当前最优特征**
   - 在每次迭代中，算法需要找到一个特征，该特征在加入到当前已选择的特征集合后，能够最大程度地提高模型的性能。
   - 为了找到这个特征，必须评估每一个尚未被选择的特征，看看它们在当前上下文中的表现如何。
2. **评估每个特征的贡献**
   - 对于每个特征，算法将其加入到当前已选择的特征集合中，形成一个新的特征子集。
   - 然后，使用这个新的特征子集训练模型，并评估模型的性能（例如，通过计算 AUC）。
   - 通过这种方式，算法可以确定每个特征在当前上下文中的贡献。
3. **选择最佳特征**
   - 在遍历所有特征并评估它们的贡献后，算法选择贡献最大的特征，将其加入到已选择的特征集合中。
   - 这个过程重复进行，直到满足停止条件（例如，模型性能不再提高）。


在每次迭代中，`for feature in range(num_features)` 确实会遍历所有特征的索引（例如共有 4 个特征时为 0 到 3；下面的示例有 20 个特征，即 0 到 19）。
但是，`if feature in good_features: continue` 这行代码会跳过已经被选择的特征。因此，虽然循环遍历了所有特征索引，但实际上只会评估那些尚未被选择的特征。

In [17]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn.datasets import make_classification


class GreedyFeatureSelection:
    """
    A simple and custom class for greedy (forward) feature selection.
    You will need to modify it quite a bit to make it suitable for your dataset.
    """

    def evaluate_score(self, X, y):
        """
        This function evaluates model on data and returns Area Under ROC Curve (AUC)
        NOTE: We fit the data and calculate AUC on same data. WE ARE OVERFITTING HERE.
        But this is also a way to achieve greedy selection. k-fold will take k times longer.
        If you want to implement it in really correct way, calculate OOF AUC and return mean AUC over k folds.
        :param X: training data
        :param y: targets
        :return: overfitted area under the roc curve
        """
        model = linear_model.LogisticRegression()
        model.fit(X, y)
        # probability of the positive class (second column of predict_proba)
        predictions = model.predict_proba(X)[:, 1]
        return metrics.roc_auc_score(y, predictions)

    def _feature_selection(self, X, y):
        """
        This function does the actual greedy selection.
        Each round scores every not-yet-selected column together with the
        columns chosen so far and keeps the single best addition.
        :param X: data, numpy array
        :param y: targets, numpy array
        :return: (best scores, best features)
        """
        good_features = []
        best_scores = []

        num_features = X.shape[1]

        while True:
            # best candidate found in this round, and its score
            this_feature = None
            best_score = 0

            for feature in range(num_features):
                print("feature:", feature)
                # already selected in an earlier round: nothing to evaluate
                if feature in good_features:
                    continue

                selected_features = good_features + [feature]
                xtrain = X[:, selected_features]
                score = self.evaluate_score(xtrain, y)

                if score > best_score:
                    this_feature = feature
                    best_score = score

            if this_feature is None:
                # No candidate left (every column is already selected) or none
                # scored above 0. Without this exit the loop would spin
                # forever, e.g. whenever X has two columns or fewer. No score
                # drop was observed, so there is no trailing feature to discard.
                print(good_features)
                return best_scores, good_features

            good_features.append(this_feature)
            best_scores.append(best_score)

            # Stop as soon as the newest feature lowered the score. The check
            # only starts once three features are selected, so the second
            # feature is never compared against the first.
            if len(best_scores) > 2 and best_scores[-1] < best_scores[-2]:
                break

        # The loop ended because the last added feature made the score worse,
        # so that feature (and its score) is dropped from the result.
        print(good_features)
        return best_scores[:-1], good_features[:-1]

    def __call__(self, X, y):
        """
        Run the greedy selection on X, y.
        :return: (X restricted to the selected columns, scores per selected feature)
        """
        scores, features = self._feature_selection(X, y)
        return X[:, features], scores

In [18]:
# Synthetic binary-classification data. The fixed random_state makes the
# generated samples, and therefore the selected features, reproducible
# across kernel restarts.
# NOTE: this rebinds X and y, which earlier cells used for the housing data.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_transformed, scores = GreedyFeatureSelection()(X, y)

feature: 0
feature: 1
feature: 2
feature: 3
feature: 4
feature: 5
feature: 6
feature: 7
feature: 8
feature: 9
feature: 10
feature: 11
feature: 12
feature: 13
feature: 14
feature: 15
feature: 16
feature: 17
feature: 18
feature: 19
feature: 0
feature: 1
feature: 2
feature: 3
feature: 4
feature: 5
feature: 6
feature: 7
feature: 8
feature: 9
feature: 10
feature: 11
feature: 12
feature: 13
feature: 14
feature: 15
feature: 16
feature: 17
feature: 18
feature: 19
feature: 0
feature: 1
feature: 2
feature: 3
feature: 4
feature: 5
feature: 6
feature: 7
feature: 8
feature: 9
feature: 10
feature: 11
feature: 12
feature: 13
feature: 14
feature: 15
feature: 16
feature: 17
feature: 18
feature: 19
feature: 0
feature: 1
feature: 2
feature: 3
feature: 4
feature: 5
feature: 6
feature: 7
feature: 8
feature: 9
feature: 10
feature: 11
feature: 12
feature: 13
feature: 14
feature: 15
feature: 16
feature: 17
feature: 18
feature: 19
feature: 0
feature: 1
feature: 2
feature: 3
feature: 4
feature: 5
feature: 6
fea

In [19]:
X

array([[ 1.76908871, -0.59587174,  0.85989271, ..., -1.91075983,
        -1.06057849, -0.42173324],
       [ 0.39145874, -0.007875  , -0.66699314, ..., -0.40541964,
         0.61792922,  0.01337042],
       [-1.07304067,  0.3991555 , -0.37143796, ...,  1.18124684,
        -0.30969679,  0.15309376],
       ...,
       [ 2.16014037, -1.39733884,  4.02014128, ..., -2.54064236,
         0.33385536, -1.04450173],
       [-1.69507305, -0.19104926, -0.03936788, ...,  1.60084896,
         1.03609208,  0.32576556],
       [ 1.01040285,  1.33517853, -0.06930271, ..., -0.81519897,
        -1.18842374,  0.81875016]])

In [20]:
X.shape

(1000, 20)

In [21]:
X_transformed

array([[-0.02356715, -1.06271494, -0.42173324, ..., -2.2855429 ,
        -1.06057849, -1.08677995],
       [-0.10065406, -0.15613707,  0.01337042, ...,  1.20253604,
         0.61792922, -1.53027908],
       [-0.10797369,  0.74581983,  0.15309376, ..., -1.01486513,
        -0.30969679, -0.68988734],
       ...,
       [ 1.11025194, -2.24066544, -1.04450173, ..., -1.21628685,
         0.33385536, -0.36899756],
       [ 1.28485203, -0.02682442,  0.32576556, ..., -1.79511068,
         1.03609208, -0.13501338],
       [-1.52904622,  0.64784308,  0.81875016, ..., -2.84524908,
        -1.18842374,  0.92798829]])

In [22]:
X_transformed.shape

(1000, 13)