In [1]:
# load important modules
import random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split

%matplotlib inline

# ignore warnings
# NOTE(review): with no category/module argument this filter silences every
# warning for the rest of the session, including deprecation notices raised by
# the libraries used below - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load dataset (read_excel already returns a DataFrame, no extra wrapping needed)
data = pd.read_excel('data/sarcopenia.xlsx')
# Genero 1: male, 2: female => change to 0: male, 1: female
# SI: 1, NO: 2 => change to NO: 0, YES: 1
data['Genero'] = data['Genero'].replace({1: 0, 2: 1})

# overview of data
print("Features: ", len(data.columns))
print("\"Misses\" in:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
data.loc[:, data.isna().any()].info()
print("Indexes of rows with misses:", data[data.isna().any(axis=1)].index)
print("Columns:\n", data.columns)
# NOTE: not rendered, because only the last expression of a cell is displayed.
data.describe()

# treating nans
# Marcha: meaning of 0? If 0 = NaN: 9 NaN can be imputed with mean
# HB: meaning of 0? Guess 0 = NaN
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form does not update `data` under pandas
# Copy-on-Write.
data['Hb'] = data['Hb'].fillna(0)
# Treat the 0 placeholders as missing *before* computing the mean, so the
# imputed value is the mean of the real measurements only (not dragged down by
# the zeros).
marcha_observed = data['Marcha'].replace({0: np.nan})
data['Marcha'] = marcha_observed.fillna(marcha_observed.mean())


Features:  84
"Misses" in:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Marcha  248 non-null    float64
 1   Hb      249 non-null    float64
dtypes: float64(2)
memory usage: 4.0 KB
None
Indexes of rows with misses: Int64Index([24, 191, 210], dtype='int64')
Columns:
 Index(['Folio', 'Genero', 'Edad', 'Escolaridad', 'Letrado', 'EdoCivil',
       'Cuidador', 'Religiòn', 'Residencia', 'Ocupacion', 'Economìa',
       'Manutencion', 'Visiòn', 'CorreccionVisual', 'Audiciòn',
       'CorreccionAuditiva', 'HAS', 'DMII', 'OA', 'OSTEOP', 'GASTRITIS',
       'DEPRESION', 'CARDIOOLOGIA', 'TNCM', 'HIPOTIROIDISMO', 'HIPERTENSION',
       'CANCER', 'EPOC', 'DISLIPIDEMIA', 'IRC', 'InsfHepatica', 'MED1', 'MED2',
       'MED3', 'MED4', 'MED5', 'MED6', 'MED7', 'MED8', 'MED9', 'MED10',
       'Tabaquismo', 'Alcoholismo', 'Drogas', 'ExpBiomasa', 'MMSE', 'MMSEx',
       'M

In [3]:
# shared state for the helper functions and later cells:
# names of the columns that are scheduled to be removed from the dataframe
drop_col = list()

def drop_feat(df, feats):
    """Remove the columns named in ``feats`` from ``df`` in place.

    Names that are not columns of ``df`` (or that are listed more than once)
    are silently skipped. The same, modified dataframe object is returned.
    """
    # de-duplicate while keeping order, then keep only names that exist
    present = [name for name in dict.fromkeys(feats) if name in df.columns]
    df.drop(columns=present, inplace=True)
    return df

def get_binary_features(df):
    """Return the names of all columns of ``df`` with at most two distinct values.

    Missing values are not counted as a distinct value (``value_counts`` drops
    them by default), and constant columns are included because the test is
    ``<= 2``.

    The counts are taken from the ``df`` argument itself; the previous version
    read the module-level ``data`` frame and therefore ignored its parameter.
    """
    return [feature for feature in df.columns
            if len(df[feature].value_counts()) <= 2]

In [4]:
# Folio = ID = index+1: no information
if "Folio" not in drop_col:
    drop_col.append("Folio")

# share of the most frequent value above which a feature counts as (near) constant
MAX_SHARE = 0.95

# get all binary features
binary_features = get_binary_features(data)
# 'Genero' has already been recoded to 0/1; the remaining binary features use
# 2 for "no" and are recoded to 0. Excluding 'Genero' by name instead of
# slicing off the first list element keeps this independent of column order.
recode_cols = [feature for feature in binary_features if feature != "Genero"]
data[recode_cols] = data[recode_cols].replace({2: 0})

# apply variance threshold for binary features: Var = p * (1 - p)
sel_bin = VarianceThreshold(threshold=(MAX_SHARE * (1 - MAX_SHARE)))
# only the support mask is needed, so fit() is enough (no transformed copy)
sel_bin.fit(data[binary_features])
mask = sel_bin.get_support()

for feature, keep in zip(binary_features, mask):
    if not keep and feature not in drop_col:
        drop_col.append(feature)

print("Binary features with too low variance:\n", drop_col)

# get features which are not binary and have at least 95% of the same value
n_rows = data.shape[0]
for column in data.columns:
    if column in binary_features or column in drop_col:
        continue
    # value_counts() sorts descending, so the first entry is the most frequent
    # value (NaN is counted as a value here)
    top_count = data[column].value_counts(dropna=False).iloc[0]
    if top_count > MAX_SHARE * n_rows:
        drop_col.append(column)

print("To drop because of low variance or entropy:\n", drop_col)

# Dependencia (1-4) ??
# Charlson (2-10) ??
# MM & IMM Meaning?
# FuerzaPrension => not binary
# Vision described for 1-3, but data show 1-5 (thoughts: encoding from -2 to 2)
# Audicion described for 1-3, but data show 1-5
# GDS (0-5): Meaning/Encoding unclear

# categorical features: 
# Escolaridad (0-16), Letrado (1-3), EdoCivil (1-5), Cuidador (1-6), 
# Religion (1-5), Residencia (1-5), Ocupacion(1-4), Economia (1-3),
# Manutencion (1-5), Vision (1-3/1-5), CorreccionVisual (0-5), 
# Audicion (1-3/1-4), CorreccionAuditiva (0-4), MED1, MED2, MED3, MED4,
# MED5, MED6, MED7, MED8, MED9, MED10, Tabaquismo (1-3), Alcoholismo (1-3)
# Drogas (1-3)

# continuous features which are also encoded as categoricals/binary: => delete 2 versions?
# MMSE/MMSEx/MMSEcodif, Barthel/BarthelX/BarthelAR, Norton/NortonX/NortonAlto, Lawton/LawtonX, MNA/MNAx


# LawtonAR: check relation to LawtonX, maybe some errors?
# (margins expects a bool; the totals row/column is labelled "All" by default)
print(pd.crosstab(data['LawtonX'], data['LawtonAR'], margins=True))
# MNAAR: check relation to MNAx, maybe some errors?
print(pd.crosstab(data['MNAx'], data['MNAAR'], margins=True))
# row indexes of the sparsely populated crosstab cells (possible data-entry
# errors), one index per line
print(
    data[(data['LawtonAR'] == 0) & (data['LawtonX'] == 3)].index,
    data[(data['LawtonAR'] == 1) & (data['LawtonX'] == 5)].index,
    data[(data['MNAAR'] == 0) & (data['MNAx'] == 3)].index,
    sep="\n",
)

Binary features with too low variance:
 ['Folio', 'GASTRITIS', 'CANCER', 'InsfHepatica', 'Ùlceras']
To drop because of low variance or entropy:
 ['Folio', 'GASTRITIS', 'CANCER', 'InsfHepatica', 'Ùlceras', 'Residencia', 'MED10']
LawtonAR    0    1  All
LawtonX                
1           0   31   31
2           0   31   31
3           1   38   39
4          51    0   51
5          96    2   98
All       148  102  250
MNAAR    0    1  All
MNAx                
1      114    0  114
2        0  120  120
3        2   14   16
All    116  134  250
Int64Index([118], dtype='int64')
Int64Index([36, 222], dtype='int64')
Int64Index([28, 214], dtype='int64')


In [5]:
# pairwise Pearson correlation of all columns
corr = data.corr(method='pearson')

# collect every feature pair whose absolute correlation is "very high"
# (0.7 to 1.0), following the interpretation used in medicine/psychology:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6107969/
names = corr.columns
strength = np.abs(corr)
# walk the strict lower triangle row by row so each pair is listed once;
# NaN correlations never satisfy the comparison and are skipped
very_high_correlation = [
    (names[row], names[col], corr.iloc[row, col])
    for row in range(len(corr))
    for col in range(row)
    if strength.iloc[row, col] >= 0.7
]

print(pd.DataFrame(very_high_correlation, columns=["Feat1", "Feat2", "pearson"]))
#corr.dropna(how='all').T.dropna(how='all').style.background_gradient('RdBu', vmax=1, vmin=-1)

         Feat1       Feat2   pearson
0       Drogas       Folio -0.719860
1        MMSEx        MMSE -0.935184
2    MMSEcodif        MMSE -0.728306
3    MMSEcodif       MMSEx  0.808425
4    BarthelAR    BarthelX -0.822434
5      NortonX      Norton -0.767800
6   NortonAlto      Norton  0.764823
7   NortonAlto     NortonX -0.881799
8      LawtonX      Lawton  0.932580
9     LawtonAR      Lawton -0.821173
10    LawtonAR     LawtonX -0.871083
11        MNAx         MNA -0.872407
12       MNAAR         MNA -0.765359
13       MNAAR        MNAx  0.881624
14         IMM          MM  0.908897
15  Creatinina        Urea  0.801113
16     Glucosa          Hb  0.792030
17     Glucosa        Urea  0.725648
18     Glucosa  Creatinina  0.799406
19       Sodio        Urea  0.727220


In [6]:
# drop encoding of continuous features (temporary): Check with prof
encoded_versions = ['MMSEx', 'MMSEcodif', 'BarthelX', 'BarthelAR', 'NortonX', 'NortonAlto',
                    'LawtonX', 'LawtonAR', 'MNAx', 'MNAAR']
# guard against duplicates like the other drop_col updates do, so re-running
# this cell does not keep growing the list
drop_col.extend([name for name in encoded_versions if name not in drop_col])

# add features with high correlation to drop_col:
# a pair is skipped when one of its members is already scheduled for removal,
# otherwise the first member of the pair is dropped
for feat_a, feat_b, _ in very_high_correlation:
    if feat_a in drop_col or feat_b in drop_col:
        continue
    drop_col.append(feat_a)

# drop unnecessary features
data = drop_feat(data, drop_col)

In [7]:
random.seed(1)

# generate training and testset
X = data.drop(columns="Sarcopenia")
y = data['Sarcopenia']
# Python's `random` seed does not control scikit-learn's shuffling, so the seed
# is passed explicitly via random_state to make the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
# get the 5 / 10 / 20 best features according to two univariate scores:
# chi2 and f_classif. f_classif is the ANOVA F-statistic, not an F1 score,
# hence the output label "f_classif".
# NOTE(review): the selectors are fitted on the full X, y rather than on
# X_train, y_train - confirm this is intended (the test rows influence the
# selection).
score_funcs = {"chi2": chi2, "f_classif": f_classif}
k_values = (5, 10, 20)

# fitted selectors, keyed by (score name, k)
k_best = {}
for k in k_values:
    print(f"Best {k} features with:")
    for label, score_func in score_funcs.items():
        selector = SelectKBest(score_func, k=k).fit(X, y)
        k_best[label, k] = selector
        print(f"{label}:", selector.get_feature_names_out(X.columns))

# keep the original variable names available for any later cells
X_chi2_5, X_chi2_10, X_chi2_20 = (k_best["chi2", k] for k in k_values)
X_f1_5, X_f1_10, X_f1_20 = (k_best["f_classif", k] for k in k_values)

Best 5 features with:
chi2: ['MED3' 'MED4' 'MED5' 'MED9' 'MM']
F1: ['Ocupacion' 'TNCM' 'MM' 'Pantorrilla' 'Demencia']
Best 10 features with:
chi2: ['MED3' 'MED4' 'MED5' 'MED9' 'MM' 'FuerzaPrension' 'Pantorrilla'
 'Demencia' 'Hb' 'Urea']
F1: ['Edad' 'Letrado' 'EdoCivil' 'Ocupacion' 'Audiciòn' 'TNCM' 'MM'
 'Pantorrilla' 'Demencia' 'Congiciòn']
Best 20 features with:
chi2: ['Edad' 'Escolaridad' 'CorreccionVisual' 'CorreccionAuditiva' 'TNCM'
 'MED1' 'MED2' 'MED3' 'MED4' 'MED5' 'MED7' 'MED9' 'NùmeroDeCaìdas'
 'Lawton' 'MM' 'FuerzaPrension' 'Pantorrilla' 'Demencia' 'Hb' 'Urea']
F1: ['Edad' 'Letrado' 'EdoCivil' 'Ocupacion' 'CorreccionVisual' 'Audiciòn'
 'HAS' 'TNCM' 'Depresion' 'NùmeroDeCaìdas' 'Norton' 'Lawton' 'MNA'
 'Charlson' 'MM' 'FuerzaPrension' 'Pantorrilla' 'Demencia' 'Congiciòn'
 'Dolor']


In [22]:
# to do next:
# - hyperparameter tuning for RF, SVC, NN
# - afterwards recursive feature elimination

(250, 62)