In [2]:
# load important modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold

%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load dataset (read_excel already returns a DataFrame, so no extra wrapping is needed)
data = pd.read_excel('data/sarcopenia.xlsx')
# Genero is coded 1: male, 2: female in the sheet => recode to 0: male, 1: female.
# Only Genero is recoded in this cell; the SI: 1 / NO: 2 columns are recoded to
# NO: 0 / YES: 1 further down, together with the other binary features.
data['Genero'] = data['Genero'].replace({1: 0, 2: 1})

In [4]:
# overview of data
print("Features: ", len(data.columns))
print("\"Misses\" in:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
data.loc[:, data.isna().any()].info()
print(data.columns)
# last expression of the cell => rendered as the cell's rich output
data.describe()

Features:  84
"Misses" in:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Marcha  248 non-null    float64
 1   Hb      249 non-null    float64
dtypes: float64(2)
memory usage: 4.0 KB
None
Index(['Folio', 'Genero', 'Edad', 'Escolaridad', 'Letrado', 'EdoCivil',
       'Cuidador', 'Religiòn', 'Residencia', 'Ocupacion', 'Economìa',
       'Manutencion', 'Visiòn', 'CorreccionVisual', 'Audiciòn',
       'CorreccionAuditiva', 'HAS', 'DMII', 'OA', 'OSTEOP', 'GASTRITIS',
       'DEPRESION', 'CARDIOOLOGIA', 'TNCM', 'HIPOTIROIDISMO', 'HIPERTENSION',
       'CANCER', 'EPOC', 'DISLIPIDEMIA', 'IRC', 'InsfHepatica', 'MED1', 'MED2',
       'MED3', 'MED4', 'MED5', 'MED6', 'MED7', 'MED8', 'MED9', 'MED10',
       'Tabaquismo', 'Alcoholismo', 'Drogas', 'ExpBiomasa', 'MMSE', 'MMSEx',
       'MMSEcodif', 'GDS', 'Depresion', 'Barthel', 'BarthelX', 'BarthelAR',
       'Caìda

Unnamed: 0,Folio,Genero,Edad,Escolaridad,Letrado,EdoCivil,Cuidador,Religiòn,Residencia,Ocupacion,...,Congiciòn,EVC,Infecciòn,Dolor,Hb,Urea,Creatinina,Albumina,Glucosa,Sodio
count,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,...,250.0,250.0,250.0,250.0,249.0,250.0,250.0,250.0,250.0,250.0
mean,125.5,0.72,79.276,5.448,1.204,3.116,3.2,1.272,1.068,2.156,...,1.432,1.88,1.932,1.66,5.950602,17.93984,0.47376,0.98568,57.5324,53.6136
std,72.312977,0.4499,7.607319,4.484612,0.450774,1.181579,1.901173,0.748503,0.389851,0.852391,...,0.496348,0.325613,0.252251,0.474659,6.366781,24.421259,0.562175,1.740421,60.298818,68.075628
min,1.0,0.0,60.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,63.25,0.0,74.0,1.25,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,125.5,1.0,79.0,6.0,1.0,3.0,3.5,1.0,1.0,2.0,...,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,70.0,0.0
75%,187.75,1.0,85.0,9.0,1.0,4.0,5.0,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,12.5,35.0,0.8,0.0,100.75,139.0
max,250.0,1.0,97.0,16.0,3.0,5.0,6.0,5.0,5.0,4.0,...,2.0,2.0,2.0,2.0,16.3,118.0,2.4,5.4,199.0,148.7


In [5]:
# useful functions and declaration of variables which will be used later
# drop_col collects the names of columns flagged for removal; the cells below append to it
drop_col = []

def drop_feat(df, feats):
    """Remove the columns named in ``feats`` from ``df`` and return ``df``.

    The frame is modified in place; the returned object is the same frame
    that was passed in. Names in ``feats`` that are not columns of ``df``
    are skipped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from (mutated).
    feats : iterable of str
        Column names to remove.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the removals.
    """
    for name in feats:
        # guard clause: nothing to do for names the frame does not have
        if name not in df.columns:
            continue
        df.drop(columns=name, inplace=True)
    return df

# get name of all binary features from dataframe df
def get_binary_features(df):
    """Return the names of the columns of ``df`` with at most two distinct values.

    The distinct values are counted with ``value_counts()``, which leaves out
    missing values by default. Columns holding a single distinct value (or no
    non-missing value at all) therefore satisfy the ``<= 2`` test as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. The check is done on this frame,
        not on a notebook-level global.

    Returns
    -------
    list
        Matching column names, in the column order of ``df``.
    """
    return [feature for feature in df.columns
            if len(df[feature].value_counts()) <= 2]

In [6]:
# Folio = ID: no information
if "Folio" not in drop_col: drop_col.append("Folio")

# get all binary features
binary_features = get_binary_features(data)
# Recode 2 -> 0 in every binary feature except Genero. Genero is excluded by
# name instead of slicing off the first list entry, so the recode no longer
# depends on column order (presumably the [1:] slice was meant to skip Genero,
# which the loading cell already mapped to 0/1 -- confirm).
recode_cols = [feat for feat in binary_features if feat != 'Genero']
data[recode_cols] = data[recode_cols].replace({2:0})

#print("Binary features:\n", binary_features)

# apply variance threshold for binary
# 0.95*(1-0.95) is the variance p*(1-p) of a 0/1-coded feature with p = 0.95
sel_bin = VarianceThreshold(threshold=(0.95*(1-0.95)))
# only the fitted support mask is used below, so fit() is enough
sel_bin.fit(data[binary_features])
mask = sel_bin.get_support()

# mask[k] is False for features at or below the threshold => flag them for dropping
for feat, keep in zip(binary_features, mask):
    if not keep and feat not in drop_col:
        drop_col.append(feat)

# note: drop_col also contains 'Folio' at this point, not only binary features
print("Binary features with too low variance:\n", drop_col)


# get features which are not binary and have more than 95% of the same value
# (the cut-off is derived from the number of rows instead of the hard-coded
# 237.5, which is 0.95 * 250 rows)
min_dominant_count = 0.95 * len(data)
for col in data.columns:
    if col in binary_features or col in drop_col:
        continue
    # value_counts() sorts by frequency (descending), so the first entry is the
    # count of the most frequent value; dropna=False lets NaN count as a value
    if data[col].value_counts(dropna=False).iloc[0] > min_dominant_count:
        drop_col.append(col)

print("To drop because of low variance or entropy:\n", drop_col)

# Dependencia (1-4) ??
# Charlson (2-10) ??
# MM & IMM Meaning?
# FuerzaPrension => not binary
# Vision described for 1-3, but data show 1-5 (thoughts: encoding from -2 to 2)
# Audicion described for 1-3, but data show 1-5
# GDS (0-5): Meaning/Encoding unclear

# categorical features: 
# Escolaridad (0-16), Letrado (1-3), EdoCivil (1-5), Cuidador (1-6), 
# Religion (1-5), Residencia (1-5), Ocupacion(1-4), Economia (1-3),
# Manutencion (1-5), Vision (1-3/1-5), CorreccionVisual (0-5), 
# Audicion (1-3/1-4), CorreccionAuditiva (0-4), MED1, MED2, MED3, MED4,
# MED5, MED6, MED7, MED8, MED9, MED10, Tabaquismo (1-3), Alcoholismo (1-3)
# Drogas (1-3)

# continuous features which are also encoded as categorical/binary versions: => delete 2 versions?
# MMSE/MMSEx/MMSEcodif, Barthel/BarthelX/BarthelAR, Norton/NortonX/NortonAlto, Lawton/LawtonX, MNA/MNAx


# LawtonAR: check relation to LawtonX, maybe some errors?
# (margins is a boolean flag; the "All" label comes from the default margins_name)
print(pd.crosstab(data['LawtonX'], data['LawtonAR'], margins=True))
# MNAAR: check relation to MNAx, maybe some errors?
print(pd.crosstab(data['MNAx'], data['MNAAR'], margins=True))

# row indices for three specific value combinations, to inspect them as possible coding errors
lawton_ar0_x3 = data[(data['LawtonAR']==0) & (data['LawtonX'] == 3)].index
lawton_ar1_x5 = data[(data['LawtonAR']==1) & (data['LawtonX']==5)].index
mna_ar0_x3 = data[(data['MNAAR']==0) & (data['MNAx'] == 3)].index
# one index per line, same text as str(index) joined with newlines
print(lawton_ar0_x3, lawton_ar1_x5, mna_ar0_x3, sep="\n")

Binary features with too low variance:
 ['Folio', 'GASTRITIS', 'CANCER', 'InsfHepatica', 'Ùlceras']
To drop because of low variance or entropy:
 ['Folio', 'GASTRITIS', 'CANCER', 'InsfHepatica', 'Ùlceras', 'Residencia', 'MED10']
LawtonAR    0    1  All
LawtonX                
1           0   31   31
2           0   31   31
3           1   38   39
4          51    0   51
5          96    2   98
All       148  102  250
MNAAR    0    1  All
MNAx                
1      114    0  114
2        0  120  120
3        2   14   16
All    116  134  250
Int64Index([118], dtype='int64')
Int64Index([36, 222], dtype='int64')
Int64Index([28, 214], dtype='int64')


In [10]:
# get correlation matrix
corr = data.corr('pearson')

# get features with strong relation in medicine/psychology
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6107969/
# Walk the strict lower triangle (col < row) so every pair is listed once and
# the diagonal is skipped; keep pairs with |r| in the 0.7 to 1.0 band.
# NOTE(review): corr is built from every column of data, including 'Folio'.
very_high_correlation = [
    (corr.columns[row], corr.columns[col], corr.iloc[row, col])
    for row in range(len(corr))
    for col in range(row)
    if np.abs(corr.iloc[row, col]) >= 0.7
]

pairs_table = pd.DataFrame(very_high_correlation, columns=["Feat1", "Feat2", "pearson"])
print(pairs_table)
#corr.dropna(how='all').T.dropna(how='all').style.background_gradient('RdBu', vmax=1, vmin=-1)

         Feat1       Feat2   pearson
0       Drogas       Folio -0.719860
1        MMSEx        MMSE -0.935184
2    MMSEcodif        MMSE -0.728306
3    MMSEcodif       MMSEx  0.808425
4    BarthelAR    BarthelX -0.822434
5      NortonX      Norton -0.767800
6   NortonAlto      Norton  0.764823
7   NortonAlto     NortonX -0.881799
8      LawtonX      Lawton  0.932580
9     LawtonAR      Lawton -0.821173
10    LawtonAR     LawtonX -0.871083
11        MNAx         MNA -0.872407
12       MNAAR         MNA -0.765359
13       MNAAR        MNAx  0.881624
14         IMM          MM  0.908897
15  Creatinina        Urea  0.801113
16     Glucosa          Hb  0.791283
17     Glucosa        Urea  0.725648
18     Glucosa  Creatinina  0.799406
19       Sodio        Urea  0.727220
