# Feature Selection

## 1) Filter Methods

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine

### A) Correlation

In [2]:
# Build one DataFrame holding the wine features plus the class label.
# The dataset is loaded a single time and every piece (data, target, column
# names) is read from that same Bunch, instead of calling load_wine() again
# just to fetch the feature names.
wine = load_wine()
X, y = wine.data, wine.target
feature_names = wine.feature_names

# Features first, label appended as the last column named 'target'
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

In [3]:
# Preview the assembled frame: the feature columns followed by the 'target' label
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [7]:
# Correlation of every column with the class label (the 'target' column of the correlation matrix)
corr = df.corr().loc[:, 'target']

In [8]:
# Keep the entries whose correlation with the target exceeds +0.5.
# NOTE(review): the comparison is signed, so strongly *negative* correlations are
# dropped, and the target's own self-correlation (1.0) is kept. Filtering on
# corr.abs() after dropping 'target' may be what is intended; confirm.
corr.loc[corr.gt(0.5)]

alcalinity_of_ash    0.517859
target               1.000000
Name: target, dtype: float64

### B) Mutual Information

In [9]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_iris


In [10]:
# Load the iris dataset once and take the features, labels and column names
# from the same Bunch (avoids a second load_iris() call just for the names)
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names
feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [11]:
# Estimate the mutual information between each feature and the class label.
# The estimator is randomized (it adds a little noise to continuous features),
# so the seed is pinned: without it the scores, and the ordering of
# near-tied features, change from one run to the next.
mi_score = mutual_info_classif(X, y, random_state=0)
mi_score

array([0.49984516, 0.22306086, 0.9820591 , 0.98485089])

In [13]:
# Pair each feature name with its score, rank from most to least informative,
# and renumber the rows so the index reflects the rank.
mi_df = (
    pd.DataFrame({'feature': feature_names, 'mi_score': mi_score})
    .sort_values(by='mi_score', ascending=False)
    .reset_index(drop=True)
)
mi_df.head()

Unnamed: 0,feature,mi_score
0,petal width (cm),0.984851
1,petal length (cm),0.982059
2,sepal length (cm),0.499845
3,sepal width (cm),0.223061


## 2) Wrapper Methods

### A) Recursive Feature Elimination (RFE)

In [14]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [16]:
# Load the breast-cancer dataset once and read the features, labels and
# column names from the same Bunch (no second load just for the names)
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
feature_names = cancer.feature_names

In [22]:
# Check the dimensions (rows, columns) of the feature matrix before selecting features
X.shape

(569, 30)

In [19]:
# Recursive feature elimination driven by a random forest:
# remove one feature per round (step=1) until 10 remain.
rfc = RandomForestClassifier(random_state=0, n_estimators=100)
rfe = RFE(rfc, n_features_to_select=10, step=1).fit(X, y)

In [21]:
# Names of the columns RFE kept, picked out with its boolean support mask
selecting_features = list(feature_names[rfe.support_])
print('selecting fetures: ',selecting_features)

selecting fetures:  [np.str_('mean perimeter'), np.str_('mean area'), np.str_('mean concavity'), np.str_('mean concave points'), np.str_('worst radius'), np.str_('worst texture'), np.str_('worst perimeter'), np.str_('worst area'), np.str_('worst concavity'), np.str_('worst concave points')]


In [25]:
# Boolean mask over the original columns: True marks a feature that RFE kept
rfe.support_

array([False, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False])

In [23]:
# 5-fold cross-validated score of the random forest on the RFE-selected columns.
# NOTE(review): rfe was fitted on the full X, y before these folds are made, so
# the feature choice has already seen every validation sample and the scores may
# be optimistic. Putting RFE and the classifier in a Pipeline would keep the
# selection inside each fold.
score = cross_val_score(rfc, rfe.transform(X), y, cv=5)
score

array([0.92982456, 0.94736842, 0.97368421, 0.96491228, 0.97345133])

In [24]:
# Average of the per-fold scores
score.mean()

np.float64(0.9578481602235678)

## 3) Embedded Methods

### A) Lasso Regression

In [26]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_diabetes


In [27]:
# Load the diabetes dataset once and read the features, target and
# column names from the same Bunch (no second load just for the names)
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
feature_names = diabetes.feature_names

In [28]:
# Standardize the features so the Lasso penalty weighs every coefficient on the same scale
scsler = StandardScaler().fit(X)
X_scaled = scsler.transform(X)

# Fit the Lasso model; alpha sets the strength of the L1 penalty
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

In [29]:
# Inspect the fitted coefficients: a coefficient of exactly 0 means Lasso dropped that feature
lasso.coef_

array([ -0.27769342, -11.15948797,  24.85518378,  15.2421328 ,
       -26.44813964,  13.72566329,  -0.        ,   7.05557447,
        31.57506171,   3.1584765 ])

In [30]:
# Keep the features whose Lasso coefficient was not shrunk to zero
selected_features = [feature_names[idx] for idx in np.flatnonzero(lasso.coef_)]
print('Selected features:', selected_features)

Selected features: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's4', 's5', 's6']


## 4) Hybrid Selection

A filter step (SelectKBest) first narrows the feature set, then a wrapper step (RFE) refines it.

In [32]:
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer

In [33]:
# Reload the breast-cancer data so this section starts again from the full, unfiltered feature matrix
X, y = load_breast_cancer(return_X_y=True)

In [34]:
# Step 1 (filter): score every feature with f_classif and keep the 20 highest-scoring ones
filter_selector = SelectKBest(score_func=f_classif, k=20).fit(X, y)
X_filtered = filter_selector.transform(X)

In [35]:
# Step 2 (wrapper): run RFE with a random forest on the filtered columns,
# removing one feature per round until 10 remain
rfc = RandomForestClassifier(random_state=1, n_estimators=100)
wrapper_selector = RFE(rfc, n_features_to_select=10, step=1).fit(X_filtered, y)
X_selected = wrapper_selector.transform(X_filtered)

In [36]:
# Report the column count before any selection and after the filter + wrapper steps
print(f'Orginal number of feature : {X.shape[1]}')
print(f'Number of feature after selection : {X_selected.shape[1]}')

Orginal number of feature : 30
Number of feature after selection : 10
