# Imports, constants and functions

In [None]:
import os
import nrrd
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import radiomics as pr
import SimpleITK as sitk
import matplotlib.pyplot as plt
from radiomics import featureextractor
from sklearn.linear_model import lasso_path
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Constants
# Directory that holds the input NRRD files (images and masks).
INPUT_PATH = 'inputs'
# Directory where the plotting helpers save their figures.
IMAGES_PATH = 'images'

In [None]:
def _is_sequence(value):
    """Return True when *value* is a list or a tuple."""
    return isinstance(value, (list, tuple))


def _as_list(value):
    """Copy a list/tuple into a list; wrap any other value in a one-item list."""
    return list(value) if _is_sequence(value) else [value]


def _expand_into(target_df, prefix, expanded, max_len):
    """Add columns ``prefix_1`` .. ``prefix_<max_len>`` built from lists of items."""
    for i in range(max_len):
        # Rows whose list is shorter than max_len are padded with None.
        target_df[f"{prefix}_{i+1}"] = expanded.apply(
            lambda items, i=i: items[i] if i < len(items) else None
        )


def transform_dataframe(df):
    """
    Flatten list, tuple and dict cells of a DataFrame into scalar columns.

    Columns containing lists or tuples:
    - If no cell holds more than one element, the column is kept and
      single-element sequences are replaced by their only element
      (empty sequences are left untouched).
    - Otherwise the column is replaced by one column per position, named
      ``<col>_1``, ``<col>_2``, ...; rows with fewer elements get ``None``.

    Columns containing dicts:
    - Every key becomes a new column named ``<col>_<key>``. Keys are
      processed in order of first appearance, so the resulting column
      order is the same on every run.
    - If the values of a key are lists or tuples with more than one
      element, they are expanded into ``<col>_<key>_1``, ``<col>_<key>_2``, ...
      When they hold at most one element, a single ``<col>_<key>`` column
      is produced with that element (``None`` for empty sequences).
    - Cells that are not dicts contribute ``None`` to every key column.
    - The original column is dropped afterwards.

    Only ``list`` and ``tuple`` instances are treated as sequences.

    Args:
        df (pd.DataFrame): Original DataFrame. It is not modified.
    Returns:
        pd.DataFrame: Transformed copy of ``df``.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    transformed_df = df.copy()

    # The column Index is captured once here, so the columns added or
    # dropped inside the loop are not visited.
    for col in transformed_df.columns:
        column = transformed_df[col]

        if column.apply(_is_sequence).any():
            expanded = column.apply(_as_list)
            max_len = expanded.apply(len).max()

            if max_len > 1:
                _expand_into(transformed_df, col, expanded, max_len)
                transformed_df.drop(columns=[col], inplace=True)
            else:
                # Unwrap one-element sequences; leave anything else as is.
                transformed_df[col] = expanded.apply(
                    lambda items: items[0] if len(items) == 1 else items
                )

        elif column.apply(lambda x: isinstance(x, dict)).any():
            dict_cells = column.apply(lambda x: x if isinstance(x, dict) else {})
            # dict.fromkeys keeps first-seen order and removes duplicates;
            # a set would make the column order vary between runs.
            keys = list(dict.fromkeys(k for d in dict_cells for k in d))

            for key in keys:
                key_values = dict_cells.apply(lambda d: d.get(key, None))
                new_name = f"{col}_{key}"

                if key_values.apply(_is_sequence).any():
                    expanded = key_values.apply(_as_list)
                    max_len = expanded.apply(len).max()

                    if max_len > 1:
                        _expand_into(transformed_df, new_name, expanded, max_len)
                    else:
                        # At most one element per cell: store it directly,
                        # without a positional suffix.
                        transformed_df[new_name] = expanded.apply(
                            lambda items: items[0] if items else None
                        )
                else:
                    transformed_df[new_name] = key_values

            transformed_df.drop(columns=[col], inplace=True)

    return transformed_df

In [37]:
def same_sizes(image1, image2):
    """
    Check whether two images report the same size.

    Args:
        image1 (SimpleITK.Image): First image.
        image2 (SimpleITK.Image): Second image.
    Returns:
        bool: True when ``GetSize()`` of both images compares equal.
    """
    first_size = image1.GetSize()
    second_size = image2.GetSize()
    return first_size == second_size

In [None]:
def extract_features(image_file_path, mask_file_path):
    """
    Extract radiomic features from an image and its mask with PyRadiomics.

    Args:
        image_file_path (str): Path to the NRRD file with the image to analyse.
        mask_file_path (str): Path to the NRRD file with the matching mask.
    Returns:
        features (dict): Extracted radiomic features.
    Raises:
        ValueError: If the image and the mask do not have the same size.
    """
    # Load the voxel data of the image and the mask.
    # NOTE(review): the NRRD headers are discarded, so the SimpleITK images
    # built below do not carry the spacing/origin stored in the files —
    # confirm this is intended.
    image_data, _ = nrrd.read(image_file_path)
    image = sitk.GetImageFromArray(image_data)

    mask_data, _ = nrrd.read(mask_file_path)
    mask = sitk.GetImageFromArray(mask_data)

    # Fail early with a clear error instead of continuing with an
    # undefined extractor when the sizes do not match.
    if not same_sizes(image, mask):
        raise ValueError(
            f'[ERROR] Sizes are not the same for {image_file_path} and {mask_file_path}'
        )

    # Run the PyRadiomics extractor with its default configuration.
    extractor = featureextractor.RadiomicsFeatureExtractor()
    features = extractor.execute(image, mask)

    return features

In [None]:
def convert_columns_to_numeric(df):
    """
    Convert every column to a numeric dtype, dropping the ones that fail.

    Each column is passed through ``pd.to_numeric``. When the conversion
    raises, a message is printed and the column is removed from the result.

    Args:
        df (pd.DataFrame): Original DataFrame. It is not modified.
    Returns:
        pd.DataFrame: Copy of ``df`` holding only the converted columns.
    """
    # Operate on a copy; the caller keeps the original data.
    result = df.copy()

    for col in result.columns:
        try:
            result[col] = pd.to_numeric(result[col], errors='raise')
        except Exception as e:
            # Report the failure and discard the offending column.
            print(f'[ERROR] al convertir la columna {col} a número. {e}')
            result = result.drop(columns=[col])

    return result


In [None]:
def plot_correlation_matrix(df):
    """
    Plot the correlation matrix of the numeric columns as a heat map.

    The figure is saved to ``IMAGES_PATH/correlation_matrix.png`` and shown.

    Args:
        df (DataFrame): DataFrame with the data. Non-numeric columns are
            ignored.
    """
    # Restrict to numeric columns: DataFrame.corr() raises on string
    # columns in recent pandas versions. Computing the matrix before
    # opening the figure avoids leaving an empty figure if it fails.
    correlation_matrix = df.select_dtypes(include=['number']).corr()

    # savefig does not create missing directories.
    os.makedirs(IMAGES_PATH, exist_ok=True)

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Matriz de Correlación")
    plt.savefig(f'{IMAGES_PATH}/correlation_matrix.png')
    plt.show()

In [None]:
def plot_lasso_path(X, y):
    """
    Plot the Lasso path: one coefficient curve per row of the ``coefs``
    array returned by ``lasso_path``, against ``-log10(alpha)``.

    The figure is saved to ``IMAGES_PATH/lasso_path.png`` and shown.

    Args:
        X (array-like): Independent variables.
        y (array-like): Dependent variable.
    """
    alphas, coefs, _ = lasso_path(X, y)
    # The x axis is shared by every curve; compute it once.
    neg_log_alphas = -np.log10(alphas)

    # savefig does not create missing directories.
    os.makedirs(IMAGES_PATH, exist_ok=True)

    plt.figure(figsize=(10, 6))
    for coef in coefs:
        plt.plot(neg_log_alphas, coef)
    plt.xlabel("-Log10(Alpha)")
    plt.ylabel("Coeficientes")
    plt.title("Camino de Lasso (Lasso Path)")
    plt.savefig(f'{IMAGES_PATH}/lasso_path.png')
    plt.show()

In [None]:
def plot_variable_distribution(df):
    """
    Show a histogram with a KDE curve for every numeric column.

    Args:
        df (DataFrame): DataFrame with the data.
    """
    numeric_names = df.select_dtypes(include=['number']).columns

    # One separate figure per numeric column.
    for name in numeric_names:
        plt.figure(figsize=(8, 4))
        sns.histplot(df[name], kde=True, bins=30)
        plt.title(f"Distribución de {name}")
        plt.xlabel(name)
        plt.ylabel("Frecuencia")
        plt.show()

In [None]:
def detect_missing_values(df):
    """
    Print a per-column report of missing values and plot them as a heat map.

    The report lists the count and percentage of null cells per column,
    sorted by count in descending order. The heat map is saved to
    ``IMAGES_PATH/missing_values.png`` and shown.

    Args:
        df (DataFrame): DataFrame with the data.
    """
    # Compute the null mask once; it feeds both the table and the plot.
    null_mask = df.isnull()

    missing_values = null_mask.sum()
    missing_percentage = (missing_values / len(df)) * 100
    missing_report = pd.DataFrame({
        'Valores Faltantes': missing_values,
        'Porcentaje (%)': missing_percentage
    }).sort_values(by='Valores Faltantes', ascending=False)
    print(missing_report)

    # savefig does not create missing directories.
    os.makedirs(IMAGES_PATH, exist_ok=True)

    plt.figure(figsize=(10, 6))
    sns.heatmap(null_mask, cbar=False, cmap="viridis")
    plt.title("Mapa de Valores Faltantes")
    plt.savefig(f'{IMAGES_PATH}/missing_values.png')
    plt.show()

In [None]:
def plot_feature_importance(X, y, feature_names):
    """
    Plot feature importances obtained from a Random Forest regressor.

    A ``RandomForestRegressor`` with ``random_state=0`` is fitted on the
    data and its ``feature_importances_`` are drawn as horizontal bars.
    The figure is saved to ``IMAGES_PATH/feature_importance_RF.png`` and shown.

    Args:
        X (array-like): Independent variables.
        y (array-like): Dependent variable.
        feature_names (list): Names of the features, one per column of ``X``.
    """
    model = RandomForestRegressor(random_state=0)
    model.fit(X, y)
    importance = model.feature_importances_

    # savefig does not create missing directories.
    os.makedirs(IMAGES_PATH, exist_ok=True)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, importance)
    plt.xlabel("Importancia")
    plt.ylabel("Características")
    plt.title("Importancia de las Características")
    plt.savefig(f'{IMAGES_PATH}/feature_importance_RF.png')
    plt.show()

In [None]:
def plot_outliers(df):
    """
    Show a box plot for every numeric column to help spot outliers.

    Args:
        df (DataFrame): DataFrame with the data.
    """
    numeric_names = df.select_dtypes(include=['number']).columns

    # One separate figure per numeric column.
    for name in numeric_names:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x=df[name])
        plt.title(f"Outliers en {name}")
        plt.show()

In [None]:
def target_correlation_analysis(df, target):
    """
    Print the correlation of every numeric column with the target column.

    Correlations are sorted in descending order. Non-numeric columns are
    ignored.

    Args:
        df (DataFrame): DataFrame with the data.
        target (str): Name of the target column.
    Raises:
        KeyError: If ``target`` is not one of the numeric columns of ``df``.
    """
    # Restrict to numeric columns: DataFrame.corr() raises on string
    # columns in recent pandas versions.
    numeric_df = df.select_dtypes(include=['number'])
    if target not in numeric_df.columns:
        raise KeyError(f'{target!r} is not a numeric column of the DataFrame')

    correlation = numeric_df.corr()[target].sort_values(ascending=False)
    # The newline keeps the first row of the Series aligned with the rest.
    print(f'Correlación con la variable objetivo:\n{correlation}')

In [None]:
def calculate_vif(X):
    """
    Print the Variance Inflation Factor (VIF) of every column of ``X``.

    The VIF is used to detect multicollinearity between the variables.

    Args:
        X (DataFrame): Independent variables.
    """
    design_matrix = X.values
    column_count = X.shape[1]

    # One VIF per column position of the design matrix.
    vif_values = [
        variance_inflation_factor(design_matrix, position)
        for position in range(column_count)
    ]

    vif_table = pd.DataFrame({"Variable": X.columns, "VIF": vif_values})
    print(vif_table)

# Load data

In [39]:
# Extract the radiomic features of every case whose image and mask both
# exist, collecting one feature dict per case.
feature_rows = []

for num in range(1, 20):
    nrrd_path = os.path.join(INPUT_PATH, f'serie{num}oc.nrrd')
    mask_path = os.path.join(INPUT_PATH, f'oc{num}.nrrd')
    if os.path.exists(nrrd_path) and os.path.exists(mask_path):
        feature_rows.append(extract_features(nrrd_path, mask_path))
    else:
        print(f'ERROR: No such file for {nrrd_path} or {mask_path}')

# Build the DataFrame once instead of concatenating inside the loop,
# which copies the accumulated frame on every iteration.
df_features = pd.DataFrame(feature_rows)

ERROR: No such file for inputs/serie1oc.nrrd or inputs/oc1.nrrd


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


ERROR: No such file for inputs/serie3oc.nrrd or inputs/oc3.nrrd


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


ERROR: No such file for inputs/serie5oc.nrrd or inputs/oc5.nrrd
ERROR: No such file for inputs/serie6oc.nrrd or inputs/oc6.nrrd


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


ERROR: No such file for inputs/serie11oc.nrrd or inputs/oc11.nrrd


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


ERROR: No such file for inputs/serie13oc.nrrd or inputs/oc13.nrrd


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated


ERROR: No such file for inputs/serie17oc.nrrd or inputs/oc17.nrrd
ERROR: No such file for inputs/serie18oc.nrrd or inputs/oc18.nrrd
ERROR: No such file for inputs/serie19oc.nrrd or inputs/oc19.nrrd


In [40]:
# Show (rows, columns) of the raw feature table: one row per processed case.
df_features.shape

(10, 222)

In [41]:
# Preview the first rows of the raw feature table.
df_features.head()

Unnamed: 0,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Spacing,...,original_ngtdm 2_Busyness,original_ngtdm 2_Coarseness,original_ngtdm 2_Complexity,original_ngtdm 2_Contrast,original_ngtdm 2_Strength,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},22e18047512aa52054d6739e261bd24e91937a10,3D,"(1.0, 1.0, 1.0)",...,5.720043549154863,0.0032417680604629,12.774190506780394,0.0238380237588044,0.0821792813711147,5.720043549154863,0.0032417680604629,12.774190506780394,0.0238380237588044,0.0821792813711147
1,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},c9bc35ba35cba278e2c3330b7ee7c2aae51bab1b,3D,"(1.0, 1.0, 1.0)",...,13.090148638950009,0.0031384903950389,8.219150136266098,0.0269409068129809,0.0357201179658529,13.090148638950009,0.0031384903950389,8.219150136266098,0.0269409068129809,0.0357201179658529
2,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},fa17a959aa9c16301b5a3802f0efcb4a1507fcd7,3D,"(1.0, 1.0, 1.0)",...,19.66226372818894,0.0008193834471981,12.417841300737862,0.0203713324136059,0.0246423821268632,19.66226372818894,0.0008193834471981,12.417841300737862,0.0203713324136059,0.0246423821268632
3,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},902b4ab29cafefcfac9792ad060f554caf7e2be6,3D,"(1.0, 1.0, 1.0)",...,46.88294030617693,0.0001502189254201,46.274894047530175,0.0130009806759264,0.0119277407340604,46.88294030617693,0.0001502189254201,46.274894047530175,0.0130009806759264,0.0119277407340604
4,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},e9abdb34b8f8392eb01b7983aecd88f3b8d9df72,3D,"(1.0, 1.0, 1.0)",...,44.708241714308144,0.0002287296394863,32.51650755672889,0.0103716536529255,0.0126929023618077,44.708241714308144,0.0002287296394863,32.51650755672889,0.0103716536529255,0.0126929023618077


# Clean dataframe

In [52]:
# Flatten list/tuple/dict cells into separate scalar columns.
df_clean = transform_dataframe(df_features)

In [53]:
# Show (rows, columns) after flattening the nested cells.
df_clean.shape

(10, 253)

In [54]:
# Preview the first rows of the flattened table.
df_clean.head()

Unnamed: 0,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,...,diagnostics_Mask-original_BoundingBox_3,diagnostics_Mask-original_BoundingBox_4,diagnostics_Mask-original_BoundingBox_5,diagnostics_Mask-original_BoundingBox_6,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMassIndex_2,diagnostics_Mask-original_CenterOfMassIndex_3,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Mask-original_CenterOfMass_2,diagnostics_Mask-original_CenterOfMass_3
0,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,22e18047512aa52054d6739e261bd24e91937a10,3D,-507.554195,-1024.0,1467.0,...,373,14,23,22,132.583526,273.373139,383.563348,132.583526,273.373139,383.563348
1,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,c9bc35ba35cba278e2c3330b7ee7c2aae51bab1b,3D,-569.965574,-1024.0,2976.0,...,140,11,19,19,209.714222,325.006261,149.017442,209.714222,325.006261,149.017442
2,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,fa17a959aa9c16301b5a3802f0efcb4a1507fcd7,3D,-663.913123,-1024.0,1420.0,...,159,20,33,35,191.695901,265.530162,176.317618,191.695901,265.530162,176.317618
3,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,902b4ab29cafefcfac9792ad060f554caf7e2be6,3D,-644.35275,-1024.0,1798.0,...,195,37,61,59,50.923152,258.691036,221.965796,50.923152,258.691036,221.965796
4,v3.0.1,1.26.2,2.4.0,1.6.0,3.9.13,e9abdb34b8f8392eb01b7983aecd88f3b8d9df72,3D,-689.992377,-1024.0,2589.0,...,132,16,62,63,12.293919,303.773773,163.999783,12.293919,303.773773,163.999783


In [56]:
# Keep only the columns that can be converted to numbers; each column that
# fails the conversion is reported and dropped.
df_numeric = convert_columns_to_numeric(df_clean)

Error al convertir la columna 'diagnostics_Versions_PyRadiomics' a valores numéricos: Unable to parse string "v3.0.1" at position 0
Error al convertir la columna 'diagnostics_Versions_Numpy' a valores numéricos: Unable to parse string "1.26.2" at position 0
Error al convertir la columna 'diagnostics_Versions_SimpleITK' a valores numéricos: Unable to parse string "2.4.0" at position 0
Error al convertir la columna 'diagnostics_Versions_PyWavelet' a valores numéricos: Unable to parse string "1.6.0" at position 0
Error al convertir la columna 'diagnostics_Versions_Python' a valores numéricos: Unable to parse string "3.9.13" at position 0
Error al convertir la columna 'diagnostics_Image-original_Hash' a valores numéricos: Unable to parse string "22e18047512aa52054d6739e261bd24e91937a10" at position 0
Error al convertir la columna 'diagnostics_Image-original_Dimensionality' a valores numéricos: Unable to parse string "3D" at position 0
Error al convertir la columna 'diagnostics_Mask-origina

In [57]:
# Show (rows, columns) of the numeric-only table.
df_numeric.shape

(10, 50)

In [58]:
# Preview the first rows of the numeric-only table.
df_numeric.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_MinorAxisLength,...,diagnostics_Mask-original_BoundingBox_3,diagnostics_Mask-original_BoundingBox_4,diagnostics_Mask-original_BoundingBox_5,diagnostics_Mask-original_BoundingBox_6,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMassIndex_2,diagnostics_Mask-original_CenterOfMassIndex_3,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Mask-original_CenterOfMass_2,diagnostics_Mask-original_CenterOfMass_3
0,-507.554195,-1024.0,1467.0,3023,1,0.843453,0.553572,11.544899,20.855289,17.590459,...,373,14,23,22,132.583526,273.373139,383.563348,132.583526,273.373139,383.563348
1,-569.965574,-1024.0,2976.0,2236,1,0.946245,0.583899,10.407151,17.82354,16.865433,...,140,11,19,19,209.714222,325.006261,149.017442,209.714222,325.006261,149.017442
2,-663.913123,-1024.0,1420.0,11687,1,0.896497,0.57129,18.025287,31.551924,28.286212,...,159,20,33,35,191.695901,265.530162,176.317618,191.695901,265.530162,176.317618
3,-644.35275,-1024.0,1798.0,60431,1,0.988491,0.540658,29.264449,54.1275,53.504536,...,195,37,61,59,50.923152,258.691036,221.965796,50.923152,258.691036,221.965796
4,-689.992377,-1024.0,2589.0,32264,1,0.951487,0.231236,13.582543,58.73885,55.889265,...,132,16,62,63,12.293919,303.773773,163.999783,12.293919,303.773773,163.999783


# Explore dataframe