### Feature Selection -  Recursive Feature Elimination (RFE) 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
from functions import *  
from sklearn.model_selection import train_test_split   

## models 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 

# RFE is the selector that drives the feature elimination performed below
from sklearn.feature_selection import RFE  

In [2]:
# Load the five input tables, in the same order as before. Paths are relative
# to the working directory.
# NOTE(review): "cogniteive" looks like a typo of "cognitive", but it is the
# path this notebook actually reads, so it must stay until the file is renamed.
mriDF, assessmentDF, cognitiveScoreDF, labels, baselineDF = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/mri_statistics.csv",
        "data/assessment_statistics.csv",
        "data/cogniteive_score_statistics.csv",
        "data/four_labels.csv",
        "data/Baseline_final.csv",
    )
)

In [3]:
# Build one wide table: start from the cognitive-score table and inner-join the
# assessment, MRI and baseline tables on the shared "RID" key. Because every
# join is an inner join, only RIDs present in all four tables survive.
completeDataDF = (
    cognitiveScoreDF[:]
    .merge(assessmentDF, on="RID", how="inner")
    .merge(mriDF, on="RID", how="inner")
    .merge(baselineDF, on="RID", how="inner")
)

In [4]:
# Feature names per modality. The first column of each table is skipped (the
# first two for the baseline table).
# NOTE(review): presumably the skipped columns are RID (plus DX for the
# baseline table) — confirm against the CSV headers.
cs_columns = cognitiveScoreDF.columns[1:].to_list()
nt_columns = assessmentDF.columns[1:].to_list()
mri_columns = mriDF.columns[1:].to_list()
static_columns = baselineDF.columns[2:].to_list()

## Parameters
TARGET_COLUMN = "DX"
TEST_SIZE = 0.20
SCALER_TYPE = "mm"
SEEDS = [45, 78, 95, 15, 53, 12, 85, 61, 77, 10]

# Integer codes for the categorical columns.
mapping = {"AD": 0, "sMCI": 1, "CN": 2, "pMCI": 3}
gender_mapping = {'Male': 1, 'Female': 0}
marriage_mapping = {'Married': 1, 'Widowed': 2, 'Divorced': 3, 'Never married': 0}

# Drop the join key and replace the three categorical columns with their codes.
# NOTE(review): Series.map turns any value missing from its dict into NaN —
# confirm that every category present in the data is listed above.
dataset = completeDataDF.drop(columns=['RID']).assign(
    DX=lambda frame: frame['DX'].map(mapping),
    PTGENDER=lambda frame: frame['PTGENDER'].map(gender_mapping),
    PTMARRY=lambda frame: frame['PTMARRY'].map(marriage_mapping),
)

In [5]:
# Sanity check: (rows, columns) of the merged, encoded table.
dataset.shape

(1371, 341)

In [8]:
def select(features, supports):
    """Return the entries of ``features`` whose flag in ``supports`` is truthy.

    ``supports`` is looked up by position, so it needs at least
    ``len(features)`` entries. The order of ``features`` is preserved.
    """
    return [name for position, name in enumerate(features) if supports[position]]

In [43]:
# Full candidate feature list, in order: cognitive-score columns, assessment
# columns, MRI columns, then the static baseline columns.
modality = [*cs_columns, *nt_columns, *mri_columns, *static_columns]

In [45]:
##  RFE-RF: recursive feature elimination around a random forest, run once per seed
selected_features = []   # one list of retained feature names per seed, in SEEDS order

for seed in SEEDS: 
    # split_dataset / normalize_dataset / balance_data come from the star import
    # of `functions`; their exact behavior is not visible in this notebook.
    X_train, X_test, y_train, y_test = split_dataset(dataset, TARGET_COLUMN, TEST_SIZE, seed)  
    scaled_X_train, scaled_X_test = normalize_dataset(X_train, X_test, SCALER_TYPE)   
    # NOTE(review): balance_data receives and returns the test split as well —
    # confirm whether the held-out data is meant to be rebalanced.
    X_train, y_train, X_test, y_test = balance_data(scaled_X_train, y_train, scaled_X_test, y_test) 

    # NOTE(review): random_state is fixed at 45 rather than `seed`, so the
    # forest's own seed does not vary across iterations — confirm intended.
    model_rf = RandomForestClassifier(random_state=45) 

    # n_features_to_select is left at its default (half of the input features,
    # per the scikit-learn docs); step=1 drops one feature per elimination round.
    selector = RFE(model_rf, step=1)
    selector = selector.fit(X_train[modality], y_train) 

    # support_ is the boolean mask over `modality`; turn it back into names.
    rf_selected_features = select(modality, selector.support_)

    print("[RF] Selected features: {}".format(len(rf_selected_features))) 
    
    selected_features.append(rf_selected_features)
    
    # Score of the fitted selector on the test split, rounded to 3 decimals.
    print(round(selector.score(X_test[modality], y_test), 3)) 

[RF] Selected features: 170
0.855
[RF] Selected features: 170
0.905
[RF] Selected features: 170
0.855
[RF] Selected features: 170
0.879
[RF] Selected features: 170
0.858
[RF] Selected features: 170
0.879
[RF] Selected features: 170
0.85
[RF] Selected features: 170
0.861
[RF] Selected features: 170
0.847
[RF] Selected features: 170
0.871


In [46]:
# Number of collected feature lists (one is appended per seed).
len(selected_features)

10

In [49]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The result keeps ``lst1``'s order and its duplicates. Membership is
    tested with ``in`` against ``lst2``.
    """
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

In [50]:
# Use the feature subset produced by the second run (index 1, i.e. SEEDS[1]).
# NOTE(review): the index is hard-coded; presumably that run was picked because
# it scored best — confirm.
selection = selected_features[1]

# Split the retained features back into their source tables. The identifier
# columns are placed FIRST: this notebook derives the per-table feature lists
# by skipping leading columns (columns[1:], and columns[2:] for the baseline
# table), so a file exported from these lists must keep 'RID' (and 'DX') at the
# front to be readable with that same convention. Appending them at the end
# would leave 'RID' inside the feature list on the next load.
cs_features     = ['RID'] + intersection(selection, cs_columns)
nt_features     = ['RID'] + intersection(selection, nt_columns)
mri_features    = ['RID'] + intersection(selection, mri_columns)
# NOTE(review): assumes the two leading baseline columns are 'RID' and 'DX' — confirm.
static_features = ['RID', 'DX'] + intersection(selection, static_columns)

In [58]:
cs_filename = "data/cogniteive_score_statistics.csv"
assessment_filename = "data/assessment_statistics.csv"
mri_filename = "data/mri_statistics.csv"
baseline_filename = "data/Baseline_final.csv"

# Write each table back, restricted to its selected columns, without the index.
# NOTE(review): these are the same paths the tables were read from at the top
# of the notebook, so every export overwrites its own source file with the
# reduced column set — confirm this is intended and keep a backup of the inputs.
for _table, _columns, _target in (
    (cognitiveScoreDF, cs_features, cs_filename),
    (assessmentDF, nt_features, assessment_filename),
    (mriDF, mri_features, mri_filename),
    (baselineDF, static_features, baseline_filename),
):
    _table[_columns].to_csv(_target, index=False)