# ADNI - Feature selection routine

In [1]:
from  IPython.core.display  import HTML
# Read the project stylesheet and render it, so the custom CSS is applied
# to this notebook's output.
with open('project.css') as stylesheet:
    styles = stylesheet.read()
HTML(styles)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [3]:
# Data preparation to test our function

# Fixed seed so the train/test split (and every result that depends on it)
# is the same after Restart Kernel -> Run All.
RANDOM_STATE = 42

# Loading data (tab-separated; the first column is used as the index)
data = pd.read_csv('patient_data.csv', sep='\t', index_col=0)

# Creating dummies from categorical data.
# Each dummy block is built from the original column and appended on the right;
# the original categorical columns are dropped further below.
categorical_vars = ['PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
dummy_frames = [pd.get_dummies(data[var], prefix=var) for var in categorical_vars]
data = pd.concat([data] + dummy_frames, axis=1)

# Creating binary column to test the feature selection function:
# 1 when the baseline diagnosis is 'AD', 0 for anything else (including missing values)
data['has_AD'] = (data['DX_bl'] == 'AD').astype(int)

# Dropping irrelevant/converted to dummies columns
vars_to_drop = ['RID', 'PTID', 'update_stamp', 'DX_lv', 'FSVERSION', 'PTGENDER', 'VISCODE',
                'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'EXAMDATE', 'ORIGPROT', 'COLPROT']
data = data.drop(columns=vars_to_drop)

# Removing strings from the ABETA, TAU and PTAU columns
# If the string indicates higher than x, x is assumed as the true value
# If the string indicates lower than x, x/2 (midpoint between x and 0) is assumed as the true value
# Note: replace() matches whole cell values here, so '<8' never touches a '<80' cell.
censored_replacements = [
    ('ABETA', '>1700', 1700),
    ('ABETA', '<200', 100),
    ('TAU', '<8', 4),
    ('TAU', '<80', 40),
    ('TAU', '>1300', 1300),
    ('PTAU', '<8', 4),
    ('PTAU', '>120', 120),
]
for column, censored_label, assumed_value in censored_replacements:
    data[column] = data[column].replace(censored_label, assumed_value)

# Replacing NaN and +/-Inf with 0 (just for testing purposes).
# No rows are removed: every missing/infinite cell simply becomes 0.
data = data.replace([np.inf, -np.inf], np.nan)
data = data.replace(np.nan, 0)

# Train-test separation (70% / 30%), seeded for reproducibility
data_train, data_test = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE)

# Creating X and y: 'has_AD' is the response and 'DX_bl' is the column it was
# derived from, so both are excluded from the predictors
response_cols = ['has_AD', 'DX_bl']
X_train = data_train.drop(columns=response_cols)
X_test = data_test.drop(columns=response_cols)
y_train = data_train['has_AD']
y_test = data_test['has_AD']

In [4]:
def feature_selector(X, y, tol=1e-8, resp_type='categorical', random_state=None):
    """
    Select predictors using L1 (Lasso) regularization.

    A cross-validated L1-penalised model is fitted and every column whose
    coefficient is larger than `tol` in absolute value is reported as selected.

    X: the dataset of predictors (dataset is assumed to be already scaled/trimmed)
    y: the response variable
    tol: value below which coefficients are considered zero
    resp_type: 'categorical' (fits LogisticRegressionCV with an L1 penalty) or
        'continuous' (fits LassoCV) - indicating the type of response variable
    random_state: optional seed forwarded to the underlying estimator; the
        default (None) keeps the estimator's own default behaviour

    Returns a list with the positional indices (ints, ascending) of the columns
    of X that were selected. When the fitted model exposes one row of
    coefficients per class, a column is selected if its coefficient exceeds
    `tol` for at least one class.

    Raises ValueError if resp_type is not 'categorical' or 'continuous'.
    """
    # Validate the response type before any fitting happens, and fail loudly:
    # returning an error string here would look like a valid result to callers
    # that expect a list of indices.
    if resp_type == 'categorical':
        model = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', max_iter=10000,
                                     random_state=random_state)
    elif resp_type == 'continuous':
        model = LassoCV(cv=5, random_state=random_state)
    else:
        raise ValueError(
            "Response type not recognized: {!r}. "
            "Expected 'categorical' or 'continuous'.".format(resp_type)
        )

    model.fit(X, y)

    # coef_ is 1-D for LassoCV and 2-D (one row per class, or a single row for
    # a binary response) for LogisticRegressionCV; normalise to 2-D so both
    # cases share the same selection rule.
    coefs = np.atleast_2d(model.coef_)
    selected = (np.abs(coefs) > tol).any(axis=0)
    return np.flatnonzero(selected).tolist()

In [5]:
# Test run on the binary response: which predictors does the L1-penalised
# logistic model keep as predictors of AD in the first visit?
teste = feature_selector(X_train, y_train, resp_type='categorical')
teste

[0, 6, 7, 37, 38, 39, 40, 41, 42, 43, 44]

In [6]:
# Translate the selected positional indices back into column names
X_train.columns.take(teste)

Index(['SITE', 'ABETA', 'TAU', 'IMAGEUID', 'Ventricles', 'Hippocampus',
       'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV'],
      dtype='object')

In [7]:
# Another test on a continuous variable.
# The response is drawn independently of X (normal, mean 10, sd 1), one value per
# training row. A seeded generator keeps the draw - and therefore the selected
# columns - identical across kernel restarts.
random_y = np.random.RandomState(42).normal(10, 1, X_train.shape[0])

test2 = feature_selector(X_train, random_y, tol=1e-30, resp_type='continuous')
test2

[40]

In [8]:
# Translate the selected positional indices back into column names
X_train.columns.take(test2)

Index(['WholeBrain'], dtype='object')