# Boruta Feature Filtering 

For feature selection we use [BorutaPy](https://github.com/scikit-learn-contrib/boruta_py), the Python implementation of the Boruta all-relevant feature selection method.


Reference: 
- Kursa M., Rudnicki W., "Feature Selection with the Boruta Package" Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Ignore every warning category for the rest of the session (no category or
# module filter is given), presumably to keep cell output clean.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Data folder path as a string: two directory levels above the current
# working directory, followed by "/data".
data_dir = "/".join([str(Path.cwd().parent.parent), "data"])

##### Load data

In [3]:
# Read the two GSE120584 inputs from the data directory:
#   sample_labels - one row per sample with its 'Sample ID' and 'LABEL'
#   series_matrix - one row per probe ('ID_REF') and one column per sample
sample_labels = pd.read_csv(filepath_or_buffer=f'{data_dir}/GSE120584_sample_labels.csv')
series_matrix = pd.read_csv(filepath_or_buffer=f'{data_dir}/GSE120584_expression_data.csv')

In [4]:
# Preview the first five probes of the expression matrix.
series_matrix.head(n=5)

Unnamed: 0,ID_REF,GSM3403761,GSM3403762,GSM3403763,GSM3403764,GSM3403765,GSM3403766,GSM3403767,GSM3403768,GSM3403769,...,GSM3405352,GSM3405353,GSM3405354,GSM3405355,GSM3405356,GSM3405357,GSM3405358,GSM3405359,GSM3405360,GSM3405361
0,MIMAT0000062,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,3.067457,2.289271,2.613345,4.197017,4.517667,2.018411,3.247889,2.605062,2.260922,2.434129
1,MIMAT0000063,2.307579,2.50538,1.983125,1.560269,3.302472,4.319297,1.964171,0.697365,1.491916,...,3.707958,2.289271,2.613345,4.197017,4.34131,2.018411,3.247889,2.605062,2.260922,2.434129
2,MIMAT0000064,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,2.22253,2.289271,2.613345,4.197017,2.413162,2.018411,3.247889,2.605062,2.260922,2.434129
3,MIMAT0000065,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,1.709051,2.289271,2.613345,4.197017,1.771996,2.018411,3.247889,2.605062,2.260922,2.434129
4,MIMAT0000066,2.307579,1.503044,1.549877,2.232974,4.79347,4.319297,0.575922,-0.180237,1.491916,...,2.841985,2.289271,2.613345,4.197017,3.368444,2.018411,3.247889,2.605062,2.260922,2.731162


In [5]:
# Preview the first five sample-to-label rows.
sample_labels.head(n=5)

Unnamed: 0,Sample ID,LABEL
0,GSM3403761,AD
1,GSM3403762,AD
2,GSM3403763,AD
3,GSM3403764,AD
4,GSM3403765,AD


In [6]:
# Print (rows, columns) of the label table followed by the expression matrix.
print(*(frame.shape for frame in (sample_labels, series_matrix)))

(1601, 2) (2547, 1602)


In [7]:
# Boruta feature selector and the random-forest estimator it wraps.
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): StandardScaler is not referenced in any cell visible here —
# confirm whether it is still needed.
from sklearn.preprocessing import StandardScaler

Using the default parameters for Boruta.

For details, See: https://github.com/scikit-learn-contrib/boruta_py

In [8]:
# Display the label table; pandas abbreviates the output to the leading and
# trailing rows.
sample_labels

Unnamed: 0,Sample ID,LABEL
0,GSM3403761,AD
1,GSM3403762,AD
2,GSM3403763,AD
3,GSM3403764,AD
4,GSM3403765,AD
...,...,...
1596,GSM3405357,VaD
1597,GSM3405358,VaD
1598,GSM3405359,VaD
1599,GSM3405360,VaD


In [None]:
# Assemble the model inputs. Samples are the columns of series_matrix, so the
# matrix is transposed to give X one row per sample and one column per probe.
sample_ids = series_matrix.columns[1:]  # exclude the first column which is probe 'ID_REF'
X = series_matrix.iloc[:, 1:].T.values

# Align labels to the rows of X with a single indexed lookup instead of
# filtering the whole label table once per sample. Validate first: with the
# per-sample filter, a missing or duplicated 'Sample ID' only surfaced as a
# ragged / malformed array rather than as a clear error.
labels_by_sample = sample_labels.set_index('Sample ID')['LABEL']
if not labels_by_sample.index.is_unique:
    duplicated_ids = labels_by_sample.index[labels_by_sample.index.duplicated()].unique().tolist()
    raise ValueError(f"Duplicate 'Sample ID' entries in sample_labels, e.g. {duplicated_ids[:5]}")
missing_ids = sample_ids.difference(labels_by_sample.index)
if len(missing_ids) > 0:
    raise KeyError(f"No label found for {len(missing_ids)} sample(s), e.g. {missing_ids[:5].tolist()}")

# y[i] is the label of the sample in row i of X (same order as the columns
# of series_matrix).
y = labels_by_sample.loc[sample_ids].to_numpy()

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (1601, 2547)
y shape: (1601,)


In [10]:
# Base learner whose feature importances Boruta compares against shuffled
# "shadow" copies of the features.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Seed BorutaPy itself. BorutaPy replaces the estimator's random_state with
# its own on each iteration and also uses it to shuffle the shadow features,
# so seeding only the forest does not make the selection reproducible.
# All other BorutaPy parameters are left at their defaults.
boruta_selector = BorutaPy(estimator=rf, random_state=42)


In [11]:
# Fit Boruta on the sample-by-probe matrix and its labels, then report how
# many features the selector kept.
boruta_selector.fit(X=X, y=y)
print(f"Number of selected features: {boruta_selector.n_features_}")

Number of selected features: 30


In [12]:
# Boolean mask over the input features: True where Boruta kept the feature.
selected_features = boruta_selector.support_
print(f"Selected features: {selected_features.sum()} out of {selected_features.size}")

Selected features: 30 out of 2547


In [13]:
# One rank per input feature, in input column order. Per the BorutaPy
# documentation, selected features get rank 1 and tentative features rank 2.
feature_ranking = boruta_selector.ranking_
print(f"Feature rankings: {feature_ranking!s}")

Feature rankings: [1896 1158 1587 ...  755 2339 2063]


In [14]:
# Probe identifiers, one per row of series_matrix.
feature_names = series_matrix.loc[:, 'ID_REF'].tolist()
# Pair every probe with its support flag and keep the ones Boruta selected.
selected_feature_names = [name for name, keep in zip(feature_names, selected_features) if keep]
print(f"First 10 selected features: {selected_feature_names[:10]}")

First 10 selected features: ['MIMAT0000245', 'MIMAT0005588', 'MIMAT0005905', 'MIMAT0007349', 'MIMAT0007401', 'MIMAT0012735', 'MIMAT0014984', 'MIMAT0015064', 'MIMAT0016849', 'MIMAT0016889']


In [15]:
# Display the full expression matrix for reference; pandas abbreviates the
# output to the leading and trailing rows and columns.
series_matrix

Unnamed: 0,ID_REF,GSM3403761,GSM3403762,GSM3403763,GSM3403764,GSM3403765,GSM3403766,GSM3403767,GSM3403768,GSM3403769,...,GSM3405352,GSM3405353,GSM3405354,GSM3405355,GSM3405356,GSM3405357,GSM3405358,GSM3405359,GSM3405360,GSM3405361
0,MIMAT0000062,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,3.067457,2.289271,2.613345,4.197017,4.517667,2.018411,3.247889,2.605062,2.260922,2.434129
1,MIMAT0000063,2.307579,2.505380,1.983125,1.560269,3.302472,4.319297,1.964171,0.697365,1.491916,...,3.707958,2.289271,2.613345,4.197017,4.341310,2.018411,3.247889,2.605062,2.260922,2.434129
2,MIMAT0000064,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,2.222530,2.289271,2.613345,4.197017,2.413162,2.018411,3.247889,2.605062,2.260922,2.434129
3,MIMAT0000065,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,1.709051,2.289271,2.613345,4.197017,1.771996,2.018411,3.247889,2.605062,2.260922,2.434129
4,MIMAT0000066,2.307579,1.503044,1.549877,2.232974,4.793470,4.319297,0.575922,-0.180237,1.491916,...,2.841985,2.289271,2.613345,4.197017,3.368444,2.018411,3.247889,2.605062,2.260922,2.731162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,MIMAT0032116,4.838350,4.921429,5.448523,4.421061,4.437597,6.061879,4.607832,5.020202,4.538719,...,5.620054,5.795805,6.441232,6.514504,5.813278,6.477911,6.291511,5.987467,5.259263,5.490173
2543,MIMAT0033692,5.008100,5.103016,4.882961,4.522823,4.536771,4.319297,4.874431,4.864177,4.461361,...,5.679409,5.473541,5.782202,4.733269,5.505485,5.676203,5.299339,5.305673,5.480905,5.516370
2544,MIMAT0035542,4.013143,4.232799,3.991961,3.199755,3.730172,4.319297,4.378690,2.955253,3.808192,...,3.082827,2.289271,3.950645,4.197017,4.139697,3.276193,3.529750,4.329375,2.472017,2.434129
2545,MIMAT0035703,2.307579,1.503044,1.549877,1.560269,3.179096,4.319297,0.575922,-0.180237,1.491916,...,0.792518,2.289271,2.613345,4.197017,1.771996,2.018411,3.247889,2.605062,2.260922,2.434129


In [16]:
# Keep only the rows (probes) whose ID_REF is among the selected feature
# names, then show the result.
filtered_series_matrix = series_matrix.loc[series_matrix['ID_REF'].isin(selected_feature_names)]
filtered_series_matrix

Unnamed: 0,ID_REF,GSM3403761,GSM3403762,GSM3403763,GSM3403764,GSM3403765,GSM3403766,GSM3403767,GSM3403768,GSM3403769,...,GSM3405352,GSM3405353,GSM3405354,GSM3405355,GSM3405356,GSM3405357,GSM3405358,GSM3405359,GSM3405360,GSM3405361
52,MIMAT0000245,3.247633,3.322302,3.832967,3.099451,3.972978,4.319297,3.27281,2.834856,3.627556,...,4.417478,3.757778,4.574933,4.837825,4.416106,2.755368,3.247889,4.054153,3.463225,4.143556
696,MIMAT0005588,4.666568,4.239831,3.841522,4.553042,4.213964,4.319297,3.707037,4.075516,3.751721,...,4.274165,5.065757,4.614088,4.197017,4.395825,4.321873,4.677839,4.697839,4.651253,4.746008
759,MIMAT0005905,6.424362,5.908149,5.7861,6.202725,6.438405,6.19794,5.694039,6.273598,5.853613,...,4.911331,6.116618,6.584719,6.654529,5.865405,7.085877,6.68185,7.013891,6.7158,7.207587
812,MIMAT0007349,4.314688,4.202207,3.974663,4.258664,4.45852,4.319297,3.877794,4.471617,4.458999,...,4.30267,4.304476,5.247297,4.563546,5.235158,4.856806,4.574527,4.85804,4.695997,5.076922
815,MIMAT0007401,4.302577,3.655424,3.967952,3.724011,3.179096,4.319297,3.670145,3.502049,3.607003,...,4.12301,4.03598,4.282634,4.197017,4.352998,2.321851,4.004131,4.048914,3.743187,4.901239
863,MIMAT0012735,6.650482,6.55862,6.485145,6.697673,6.520311,6.115691,6.205579,6.435542,6.640257,...,6.224163,6.473635,6.911186,6.728856,6.847386,7.076784,7.215211,6.706661,6.854207,7.483779
878,MIMAT0014984,4.238956,3.984927,3.510813,4.021334,3.179096,4.319297,3.834327,4.879638,4.122104,...,4.050979,4.155748,5.123617,4.197017,5.054455,5.177382,4.141063,5.440901,4.897255,4.922
955,MIMAT0015064,8.011701,8.076476,8.778496,7.672666,7.671334,7.842919,8.242926,7.969227,8.677535,...,8.661689,8.600808,8.472142,8.135545,8.481428,7.916001,8.617582,8.022583,8.303287,6.864632
988,MIMAT0016849,10.500005,10.529963,9.664737,10.533419,10.554191,10.237456,10.207759,11.239615,10.262825,...,10.505458,10.916756,10.071904,10.487997,10.465853,11.345804,10.89958,11.241062,11.194294,11.946167
1028,MIMAT0016889,8.523402,8.490272,8.609705,8.901371,8.822575,8.195328,8.853327,9.862205,8.696723,...,9.429104,8.680132,8.97249,9.023195,8.656859,8.494325,8.960655,8.867058,8.610445,8.568192


In [17]:
# (rows, columns) of the filtered matrix: one row per selected probe.
filtered_series_matrix.shape

(30, 1602)

In [18]:
# Write the Boruta results to the data directory.
# 1) Names of the selected probes together with their Boruta ranking.
# NOTE(review): per the BorutaPy documentation selected features all have
# rank 1, so the 'ranking' column is likely constant — confirm it is useful.
selected_features_df = pd.DataFrame({
    'feature_name': selected_feature_names,
    'ranking': [rank for rank, keep in zip(feature_ranking, selected_features) if keep],
})
selected_features_df.to_csv(f'{data_dir}/boruta_selected_features.csv', index=False)

# 2) Expression matrix restricted to the selected probes (the row index is
# not written to the file).
filtered_series_matrix.to_csv(f'{data_dir}/filtered_expression_matrix.csv', index=False)