# MQI Feature Selection

In [1]:
"""Module for Baseline Feature Selection of MQI dataset."""

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import functions_features as fn

# Cargar Datos

In [2]:
# Raw dataset ("datos puros")
datos_puros = pd.read_csv(filepath_or_buffer="../data/datos_puros.csv")

# Modified dataset ("datos modificados")
datos_modificados = pd.read_csv(filepath_or_buffer="../data/datos_modificados.csv")

# Show the (rows, columns) shape of each frame, one per line
print(datos_puros.shape, datos_modificados.shape, sep="\n")

(1239, 28)
(1253, 28)


# Crear MQI subconjunto

In [3]:
# Distinct values of the target column in the raw dataset
datos_puros.cut_off_points.unique()

array([3, 2, 1], dtype=int64)

In [4]:
# Build the MQI data subsets by dropping columns tied to the target

# Target variable of each dataset
Y_mqi_pur = datos_puros.cut_off_points
Y_mqi_mod = datos_modificados.cut_off_points

# Distinct class labels found in the raw target
labels_true = Y_mqi_pur.unique()

# Columns to drop: related to the target value, or nominal
mqi_dep_columnas = [
    'mqi',                  # cut_off_points is computed from mqi
    'músculo_relativ',      # used to calculate mqi
    'hand_drch',            # used to calculate mqi
    'sarcopenia_handgrip',  # based on hand_drch
    'sarcewgsop',           # related
    'sexo',                 # nominal
    'altura',
    'músculo',
    'pnts_eq',
]

# Raw data, plus a variant without hand_izq ("nhi")
mqi_datos_puros = datos_puros.drop(columns=mqi_dep_columnas)
mqi_datos_puros_nhi = mqi_datos_puros.drop(columns=['hand_izq'])

# Modified data, plus a variant without hand_izq ("nhi")
mqi_datos_modificados = datos_modificados.drop(columns=mqi_dep_columnas)
mqi_datos_modificados_nhi = mqi_datos_modificados.drop(columns=['hand_izq'])

# Sanity check: the "nhi" variants should have one column fewer
print(mqi_datos_puros.shape, mqi_datos_modificados.shape)
print(mqi_datos_puros_nhi.shape, mqi_datos_modificados_nhi.shape)

(1239, 19) (1253, 19)
(1239, 18) (1253, 18)


# Preprocessing Nominal and Ordinal Data

## Nominal Data with One-Hot Encoding
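One-hot encoding keeps the numeric codes of a nominal variable from being interpreted as ordered magnitudes, which would bias the results.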

`sarcopenia_v`: 1 = sí (mal), 0 = no (bien)

In [6]:
# List the distinct codes stored in the nominal column before encoding it
print(f"Possible values in column 'sarcopenia_v': {mqi_datos_puros['sarcopenia_v'].unique()}")

Possible values in column 'sarcopenia_v': [1 0]


In [7]:
# Raw data: one-hot encode the nominal column sarcopenia_v
enc = OneHotEncoder()

# Dense array with one indicator column per category
transformed = enc.fit_transform(mqi_datos_puros[['sarcopenia_v']]).toarray()
ohe_df = pd.DataFrame(
    transformed,
    columns=['sarcopenia_v_0', 'sarcopenia_v_1'],
)

# Replace the original column with its indicator columns (appended at the end)
mqi_datos_puros = pd.concat(
    [mqi_datos_puros.drop(columns=['sarcopenia_v']), ohe_df],
    axis=1,
)

In [8]:
# Datos Modificados

# Modified data: one-hot encode the nominal column sarcopenia_v
enc = OneHotEncoder()

# Dense array with one indicator column per category
transformed = enc.fit_transform(mqi_datos_modificados[['sarcopenia_v']]).toarray()
ohe_df = pd.DataFrame(
    transformed,
    columns=['sarcopenia_v_0', 'sarcopenia_v_1'],
)

# Replace the original column with its indicator columns (appended at the end)
mqi_datos_modificados = pd.concat(
    [mqi_datos_modificados.drop(columns=['sarcopenia_v']), ohe_df],
    axis=1,
)

## Ordinal Data
Ordinal variables are rescaled to a common 0–10 range so that they carry equal weight and their values are easier to compare.

In [None]:
def scale_the_ordinal(dataset, columns_to_scale=None, feature_range=(0, 10)):
    """Rescale ordinal columns of ``dataset`` to a common range, in place.

    Each selected column is min-max scaled independently to ``feature_range``
    and rounded to two decimals. The scaler is fitted on the data it
    transforms, so the observed minimum and maximum of each column map to the
    ends of the range.

    Args:
        dataset: DataFrame containing the columns to scale. It is modified
            in place and also returned.
        columns_to_scale: Column names to rescale. Defaults to
            ``['id_3', 'fragilidad', 'gruposbuena']``.
        feature_range: ``(min, max)`` of the target scale. Defaults to
            ``(0, 10)``.

    Returns:
        The same ``dataset`` object, with the selected columns rescaled.

    Raises:
        KeyError: If any of the selected columns is missing from ``dataset``.
    """
    if columns_to_scale is None:
        columns_to_scale = ['id_3', 'fragilidad', 'gruposbuena']
    else:
        columns_to_scale = list(columns_to_scale)

    print("---- Change Applied ----")

    # Fit on, and transform, the selected columns only
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled = scaler.fit_transform(dataset[columns_to_scale])

    # Column assignment aligns on index labels, so the scaled frame must carry
    # the dataset's own index. With a fresh RangeIndex, any dataset whose index
    # is not 0..n-1 (e.g. after filtering or shuffling rows) would receive NaN
    # or misplaced values.
    dataset[columns_to_scale] = pd.DataFrame(
        scaled,
        columns=columns_to_scale,
        index=dataset.index,
    ).round(2)

    return dataset

In [None]:
# Rescale the ordinal columns of the raw data first, then of the modified data
mqi_datos_puros, mqi_datos_modificados = (
    scale_the_ordinal(mqi_datos_puros),
    scale_the_ordinal(mqi_datos_modificados),
)

---- Change Applied ----
---- Change Applied ----
---- Change Applied ----


In [None]:
# Persist both preprocessed MQI subsets (the DataFrame index is written too)
mqi_datos_puros.to_csv(path_or_buf='../data/mqi/puros/mqi_datos_puros.csv')
mqi_datos_modificados.to_csv(path_or_buf='../data/mqi/modificados/mqi_datos_modificados.csv')

In [None]:
# Feature matrices: the MQI subsets without the target column
X_mqi_datos_pur = mqi_datos_puros.drop(columns='cut_off_points')
X_mqi_datos_mod = mqi_datos_modificados.drop(columns='cut_off_points')

# Variants that additionally exclude hand_izq ("nhi")
X_mqi_datos_pur_nhi = X_mqi_datos_pur.drop(columns='hand_izq')
X_mqi_datos_mod_nhi = X_mqi_datos_mod.drop(columns='hand_izq')

# Persist the "nhi" variants (note: the file names carry no "_nhi" suffix)
X_mqi_datos_pur_nhi.to_csv(path_or_buf='../data/mqi/puros/X_mqi_datos_pur.csv')
X_mqi_datos_mod_nhi.to_csv(path_or_buf='../data/mqi/modificados/X_mqi_datos_mod.csv')

# Feature names of the "nhi" training subset, and how many there are
attributes_nhi = X_mqi_datos_pur_nhi.columns.tolist()
len(attributes_nhi)

18

## Estandarizar los datos
To prepare data for:
- variance-based feature selection (did not end up using these features for clustering)
- agglomerative clustering
- SVM feature selection

**Note**: Before applying hierarchical clustering (such as **agglomerative**), you should scale and normalize the data to ensure that all the variables have the same range and importance. Scaling and normalizing the data can help reduce the influence of outliers and extreme values, and improve the accuracy and consistency of the distance or similarity measures. [Source](https://www.linkedin.com/advice/3/what-some-best-practices-tips-hierarchical-clustering)

In [None]:
# Data without hand_izq ("nhi"): pass each feature matrix, an output CSV path
# and the feature names to fn.standardize_yo.
# NOTE(review): standardize_yo lives in functions_features and is not visible
# here; presumably it standardizes the columns and writes the result to the
# given path - confirm against its definition.
X_mqi_norm_pur_nhi = fn.standardize_yo(X_mqi_datos_pur_nhi,
                                    'mqi/puros/X_mqi_norm_pur.csv',
                                    attributes_nhi)
X_mqi_norm_mod_nhi = fn.standardize_yo(X_mqi_datos_mod_nhi,
                                    'mqi/modificados/X_mqi_norm_mod.csv',
                                    attributes_nhi)