Feature Selection

In [1]:
from sklearn.feature_selection import SelectKBest, f_classif

С указанием количества признаков

In [3]:
def select_features(X, y, num_features):
    '''
    Keep the ``num_features`` columns of ``X`` that score highest on the
    ANOVA F-test against ``y``.

    A ``SelectKBest`` selector scored with ``f_classif`` is fitted on
    ``(X, y)`` and the columns it reports as supported are sliced out of ``X``.

    NOTE: later cells of this notebook define other functions that are also
    called ``select_features``; the cell executed last is the one in effect.

    Parameters
    ----------
    X : 2D array / matrix or pandas DataFrame
        Feature matrix, one column per feature.
    y : array-like
        Target values, passed to ``selector.fit``.
    num_features : int
        Number of features to keep (forwarded to ``SelectKBest`` as ``k``).

    Returns
    -------
    tuple
        ``(X_selected, selected_features)`` where ``X_selected`` holds only
        the retained columns (same container type as ``X``) and
        ``selected_features`` holds their integer column positions.
    '''
    # Score every feature and mark the k best ones
    selector = SelectKBest(f_classif, k=num_features)
    selector.fit(X, y)

    # Integer positions of the retained columns
    selected_features = selector.get_support(indices=True)

    # A DataFrame rejects ``X[:, idx]``; positional column selection has to go
    # through ``.iloc``. Arrays and matrices keep the original indexing.
    if hasattr(X, "iloc"):
        return X.iloc[:, selected_features], selected_features
    return X[:, selected_features], selected_features

RFE - лучше всего использовать это

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def select_features_rfe(X, y, num_features):
    '''
    Reduce ``X`` to ``num_features`` columns with recursive feature
    elimination (RFE) driven by a default ``LogisticRegression``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one column per feature.
    y : array-like
        Target values used to fit the selector.
    num_features : int
        Number of features RFE should keep.

    Returns
    -------
    The output of the fitted selector's transform on ``X``, i.e. ``X``
    restricted to the retained features.
    '''
    # Estimator that RFE refits while eliminating features
    model = LogisticRegression()

    # ``n_features_to_select`` is keyword-only in current scikit-learn;
    # passing it positionally raises TypeError.
    selector = RFE(model, n_features_to_select=num_features)

    # Fit the selector and project X onto the surviving features
    return selector.fit_transform(X, y)


С указанием процентиля

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif

def select_features(X, y, percentile):
    '''
    Keep the top ``percentile`` percent of the columns of ``X`` ranked by
    their ANOVA F-value against ``y``.

    Works like the ``SelectKBest`` variant but uses ``SelectPercentile``, so
    the amount of retained features is given as a percentile of the highest
    scoring features rather than as an absolute count.

    NOTE: this notebook defines several functions named ``select_features``;
    running this cell replaces any earlier definition.

    Parameters
    ----------
    X : 2D array / matrix or pandas DataFrame
        Feature matrix, one column per feature.
    y : array-like
        Target values, passed to ``selector.fit``.
    percentile : int
        Percent of features to keep (forwarded to ``SelectPercentile``).

    Returns
    -------
    tuple
        ``(X_selected, selected_features)`` where ``X_selected`` holds only
        the retained columns (same container type as ``X``) and
        ``selected_features`` holds their integer column positions.
    '''
    # Score every feature and mark the requested top share
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)

    # Integer positions of the retained columns
    selected_features = selector.get_support(indices=True)

    # A DataFrame rejects ``X[:, idx]``; positional column selection has to go
    # through ``.iloc``. Arrays and matrices keep the original indexing.
    if hasattr(X, "iloc"):
        return X.iloc[:, selected_features], selected_features
    return X[:, selected_features], selected_features

# selected_features = select_features(X, y, 20)


На основе важности признаков (feature importance)

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

def select_features(X, y, random_state=None):
    '''
    Select the columns of ``X`` that a random forest considers important.

    A ``RandomForestClassifier`` is wrapped in ``SelectFromModel`` (with the
    selector's default threshold), fitted on ``(X, y)``, and the supported
    columns are sliced out of ``X``.

    NOTE: this notebook defines several functions named ``select_features``;
    running this cell replaces any earlier definition.

    Parameters
    ----------
    X : 2D array / matrix or pandas DataFrame
        Feature matrix, one column per feature.
    y : array-like
        Target values, passed to ``selector.fit``.
    random_state : int or None, default None
        Seed forwarded to the random forest. The default keeps the original
        unseeded behaviour; pass an integer to make the selection repeatable.

    Returns
    -------
    tuple
        ``(X_selected, selected_features)`` where ``X_selected`` holds only
        the retained columns (same container type as ``X``) and
        ``selected_features`` holds their integer column positions.
    '''
    # Forest whose feature importances drive the selection
    clf = RandomForestClassifier(random_state=random_state)

    # Fit the forest through the selector
    selector = SelectFromModel(clf)
    selector.fit(X, y)

    # Integer positions of the retained columns
    selected_features = selector.get_support(indices=True)

    # A DataFrame rejects ``X[:, idx]``; positional column selection has to go
    # through ``.iloc``. Arrays and matrices keep the original indexing.
    if hasattr(X, "iloc"):
        return X.iloc[:, selected_features], selected_features
    return X[:, selected_features], selected_features


Подбор гиперпараметров

In [None]:
# Exhaustive search grids, one dict per model; each maps a constructor
# parameter name to the list of candidate values to try.

# Logistic regression. The default 'lbfgs' solver rejects penalty='l1', so
# every l1 candidate would fail to fit; 'liblinear' supports both penalties.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# NOTE(review): "brf" presumably stands for a balanced random forest - confirm.
param_grid_brf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 15],
}

param_grid_xgb = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'n_estimators': [50, 100, 150, 200],
}

# NOTE(review): RUSBoostClassifier does not appear to accept a top-level
# `max_depth` (tree depth normally belongs to its base estimator) - confirm
# against the installed imbalanced-learn version before using this grid.
param_grid_rusboost = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'max_depth': [3, 5, 7, 9],
    'random_state': [42],
}



In [None]:
from sklearn.model_selection import GridSearchCV

def select_hyperparameters(model, X, y, param_grid, cv=5, scoring=None):
    '''
    Run an exhaustive ``GridSearchCV`` over ``param_grid`` and return the
    best parameter combination.

    NOTE: a later cell of this notebook redefines ``select_hyperparameters``
    as a randomized search with an extra required ``n_iter`` argument; the
    cell executed last is the one in effect.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    X, y : array-like
        Training data and target passed to ``grid_search.fit``.
    param_grid : dict
        Parameter names mapped to the candidate values to try.
    cv : default 5
        Cross-validation setting forwarded to ``GridSearchCV``. The default
        matches the previously hard-coded value.
    scoring : default None
        Scoring forwarded to ``GridSearchCV``; ``None`` keeps the search's
        default scoring, exactly as before.

    Returns
    -------
    dict
        ``grid_search.best_params_`` of the fitted search.
    '''
    # Build the search; all workers (n_jobs=-1) and verbose logging as before
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=2,
    )

    # Evaluate every combination in the grid
    grid_search.fit(X, y)

    # Best parameter combination found
    return grid_search.best_params_


In [None]:
# This cell builds its candidate values with NumPy, so it imports NumPy
# itself instead of relying on an import made by a later cell.
import numpy as np

# Candidate values for randomized search, one dict per model; each maps a
# constructor parameter name to the values that may be sampled.

# Logistic regression. The default 'lbfgs' solver rejects penalty='l1', so
# every sampled l1 candidate would fail to fit; 'liblinear' supports both.
param_distributions_lr = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# NOTE(review): "brf" presumably stands for a balanced random forest - confirm.
param_distributions_brf = {
    'n_estimators': np.arange(50, 201, 50),
    'max_depth': [None, 5, 10, 15],
}

param_distributions_xgb = {
    'max_depth': np.arange(3, 10, 2),
    'learning_rate': np.linspace(0.1, 1, 5),
    'n_estimators': np.arange(50, 201, 50),
    'random_state': [42],
}

# NOTE(review): RUSBoostClassifier does not appear to accept a top-level
# `max_depth` (tree depth normally belongs to its base estimator) - confirm
# against the installed imbalanced-learn version before using this dict.
param_distributions_rusboost = {
    'n_estimators': np.arange(50, 201, 50),
    'learning_rate': np.linspace(0.1, 1, 5),
    'max_depth': np.arange(3, 10, 2),
    'random_state': [42],
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def select_hyperparameters(model, X, y, param_distributions, n_iter,
                           cv=5, scoring=None, random_state=None):
    '''
    Run a ``RandomizedSearchCV`` that samples ``n_iter`` candidates from
    ``param_distributions`` and return the best parameter combination.

    NOTE: an earlier cell of this notebook defines a grid-search function
    with the same name; running this cell replaces it.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    X, y : array-like
        Training data and target passed to ``random_search.fit``.
    param_distributions : dict
        Parameter names mapped to the values / distributions to sample from.
    n_iter : int
        Number of parameter settings to sample.
    cv : default 5
        Cross-validation setting forwarded to the search. The default matches
        the previously hard-coded value.
    scoring : default None
        Scoring forwarded to the search; ``None`` keeps the search's default
        scoring, exactly as before.
    random_state : int or None, default None
        Seed for the candidate sampling. The default keeps the original
        unseeded behaviour; pass an integer to make the search repeatable.

    Returns
    -------
    dict
        ``random_search.best_params_`` of the fitted search.
    '''
    # Build the search; all workers (n_jobs=-1) and verbose logging as before
    random_search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        random_state=random_state,
        n_jobs=-1,
        verbose=2,
    )

    # Evaluate the sampled candidates
    random_search.fit(X, y)

    # Best parameter combination found
    return random_search.best_params_


In [None]:
# rusboost = RUSBoostClassifier()

# # Select the best hyperparameters for the RUSBoost model
# best_params = select_hyperparameters(rusboost, X, y, param_distributions_rusboost, n_iter=10)