In [25]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from collections import Counter
import numpy as np
from sklearn.impute import SimpleImputer

In [63]:
# Placeholder for correlation-based selector
def cor_selector(X, y, num_feats):
    cor_list = []
    for i in X.columns:
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(abs(cor))
    cor_list = np.nan_to_num(cor_list)
    feature_indices = np.argsort(cor_list)[-num_feats:]
    cor_support = [i in feature_indices for i in range(len(X.columns))]
    cor_feature = X.columns[feature_indices].tolist()
    return cor_support, cor_feature

In [65]:
# Chi-square selector
def chi_squared_selector(X, y, num_feats):
    chi_selector = SelectKBest(score_func=chi2, k=num_feats)
    chi_selector.fit(X, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.columns[chi_support].tolist()
    return chi_support, chi_feature

In [67]:
# Recursive Feature Elimination (RFE)
def rfe_selector(X, y, num_feats):
    model = LogisticRegression(solver='lbfgs')
    rfe_selector = RFE(estimator=model, n_features_to_select=num_feats, step=1,verbose=5)
    rfe_selector.fit(X, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.columns[rfe_support].tolist()
    return rfe_support, rfe_feature


In [69]:
# Logistic Regression Embedded Method
def embedded_log_reg_selector(X, y, num_feats):
    model = LogisticRegression(solver='liblinear', penalty='l2', max_iter=50000)
    model.fit(X, y)
    importance = abs(model.coef_).flatten()
    feature_indices = np.argsort(importance)[-num_feats:]
    embedded_lr_support = [i in feature_indices for i in range(len(X.columns))]
    embedded_lr_feature = X.columns[feature_indices].tolist()
    return embedded_lr_support, embedded_lr_feature

In [71]:
# Random Forest Embedded Method
def embedded_rf_selector(X, y, num_feats):
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y)
    importance = model.feature_importances_
    feature_indices = np.argsort(importance)[-num_feats:]
    embedded_rf_support = [i in feature_indices for i in range(len(X.columns))]
    embedded_rf_feature = X.columns[feature_indices].tolist()
    return embedded_rf_support, embedded_rf_feature

In [73]:
# LightGBM Embedded Method
def embedded_lgbm_selector(X, y, num_feats):
    model = LGBMClassifier(n_estimators=500,
                      learning_rate=0.05,
                      num_leaves=32,
                      colsample_bytree=0.2,
                      reg_alpha=3,
                      reg_lambda=1,
                      min_split_gain=0.01,
                      min_child_weight=40)
    model.fit(X, y)
    importance = model.feature_importances_
    feature_indices = np.argsort(importance)[-num_feats:]
    embedded_lgbm_support = [i in feature_indices for i in range(len(X.columns))]
    embedded_lgbm_feature = X.columns[feature_indices].tolist()
    return embedded_lgbm_support, embedded_lgbm_feature

In [75]:
# Preprocessing Function
def preprocess_dataset(dataset): 
    
    # Drop any column that contains missing values
    dataset = dataset.dropna(axis=1)
    
    # Assuming the last column is the target variable
    y = dataset.iloc[:, -1]
    
    # Identify categorical columns and perform one-hot encoding
    X = dataset.iloc[:, :-1]
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)   
    
    return X, y

In [77]:
def autoFeatureSelector(dataset_path, methods=[]):
    # Parameters
    # data - dataset to be analyzed (csv file)
    # methods - various feature selection methods we outlined before, use them all here (list)
    
    # preprocessing
    X, y, num_feats = preprocess_dataset(dataset_path)
    feature_name = list(X.columns)
    
    # Dictionary to store support indicators for each method
    support_dict = {}
    feature_dict = {}
    
    # Run every method we outlined above from the methods list and collect returned best features from every method
    if 'pearson' in methods:
        cor_support, cor_feature = cor_selector(X, y,num_feats)
        support_dict['pearson'] = cor_support
        feature_dict['pearson'] = cor_feature
    if 'chi-square' in methods:
        chi_support, chi_feature = chi_squared_selector(X, y,num_feats)
        support_dict['chi-square'] = chi_support
        feature_dict['chi-square'] = chi_feature
    if 'rfe' in methods:
        rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
        support_dict['rfe'] = rfe_support
        feature_dict['rfe'] = rfe_feature
    if 'log-reg' in methods:
        embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
        support_dict['log-reg'] = embedded_lr_support
        feature_dict['log-reg'] = embedded_lr_feature
    if 'rf' in methods:
        embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
        support_dict['rf'] = embedded_rf_support
        feature_dict['rf'] = embedded_rf_feature
    if 'lgbm' in methods:
        embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
        support_dict['lgbm'] = embedded_lgbm_support
        feature_dict['lgbm'] = embedded_lgbm_feature
    
    # Combine all the above feature list and count the maximum set of features that got selected by all methods
    #### Your Code starts here (Multiple lines)
    # Create dataframe with all selection methods
    feature_selection_df = pd.DataFrame({'Feature':feature_name})
    for method in methods:
        feature_selection_df[method] = support_dict[method]
    
    # Count the total votes for each feature
    feature_selection_df['Total'] = feature_selection_df.iloc[:, 1:].sum(axis=1)
    
    # Sort features by total votes and feature name
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'], ascending=False)
    
    # Select features with maximum votes
    max_votes = feature_selection_df['Total'].max()
    best_features = feature_selection_df[feature_selection_df['Total'] == max_votes]['Feature'].tolist()
    
    #### Your Code ends here
    return best_features

In [79]:
df = pd.read_csv("heart.csv")

In [81]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [83]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [87]:
best_features = autoFeatureSelector(df, methods=['pearson','chi-square', 'rfe', 'log-reg', 'rf', 'lgbm'],num_output_features=5)
best_features

Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[LightGBM] [Info] Number of positive: 526, number of negative: 499
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 402
[LightGBM] [Info] Number of data points in the train set: 1025, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513171 -> initscore=0.052695
[LightGBM] [Info] Start training from score 0.052695
feature votes result: [('ca', 6), ('cp', 5), ('exang', 4), ('oldpeak', 4), ('thal', 4), ('thalach', 3), ('sex', 2), ('slope', 1), ('chol', 1)]


['ca', 'cp', 'exang', 'oldpeak', 'thal']