In [1]:
import os
import random
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [2]:
#def set_seed(seed: int):
#    random.seed(seed) # Python
#    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
#    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo
#
#set_seed(25)

In [3]:
# tratamento da seed - pytorch
def set_seed(seed: int, deterministic_torch: bool = True):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo
    if torch is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  #multi-GPU
        if deterministic_torch:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

set_seed(25)

In [4]:
# Load dataset
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_test = pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")

In [5]:
# Process the train dataset
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Drop non-numeric columns except for 'Transition'
columns_to_drop = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Apply MinMax scaling to float columns
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Feature selection using ANOVA 
X = radi.drop(columns=["Transition"])
y = radi["Transition"]
selector = SelectKBest(score_func=f_classif, k=100)  # Selecionar as 100 melhores features
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print(f"Selected features: {selected_features.tolist()}")

# Atualizar o dataset com as features selecionadas
radi = pd.concat([pd.DataFrame(X_new, columns=selected_features), y.reset_index(drop=True)], axis=1)

# Save the full train dataset
radi.to_csv("train_full_prep7.csv", index=False)

# Split into 80% train and 20% test
radi_train, radi_val = train_test_split(radi, test_size=0.2, random_state=25,stratify=radi['Transition'])
radi_train.to_csv("train_split_prep7.csv", index=False)
radi_val.to_csv("test_split_prep7.csv", index=False)

# Process the test dataset
radi_test.drop(columns=["Mask", "ID", "Image"], inplace=True)
radi_test.drop(columns=[col for col in radi_test.columns if radi_test[col].dtype == 'object' and col != 'Transition'], inplace=True)

# Apply MinMax scaling to the test dataset
radi_test[float_cols] = scaler.transform(radi_test[float_cols])  # Use the same scaler fit on the train dataset
radi_test.to_csv("test_processed_prep7.csv", index=False)

print("Datasets processed and saved:")
print("- train_full.csv")
print("- train_split.csv")
print("- test_split.csv")
print("- test_processed.csv")


Dropped 16 non-numeric columns.


 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824
 1825 1826 1827 1828 1834 1835 1839 1844 1845 1846 1850 1859 1860 1861
 1862 1866 1867 1868 1872 1875 1876 1877 1878 1879 1883 1886 1890 1891
 1896 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910
 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1927 1928 1932
 1937 1938 1939 1943 1952 1953 1954 1955 1959 1960 1961 1965 1968 1969
 1970 1971 1972 1976 1983 1989 1991 1992 1993 1994 1995 1996 1997 1998
 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
 2013 2014 2020 2021 2025 2030 2031 2032 2036 2045 2046 2047 2048 2052
 2053 2054 2058 2061 2062 2063 2064 2065] are constant.
  f = msb / msw


Selected features: ['original_shape_SurfaceVolumeRatio', 'original_firstorder_Energy', 'original_firstorder_TotalEnergy', 'original_gldm_SmallDependenceEmphasis', 'original_glszm_LargeAreaEmphasis', 'original_glszm_ZoneVariance', 'wavelet-LHL_ngtdm_Coarseness', 'wavelet-LHH_gldm_DependenceEntropy', 'wavelet-LHH_gldm_DependenceNonUniformity', 'wavelet-LHH_gldm_DependenceNonUniformityNormalized', 'wavelet-LHH_gldm_DependenceVariance', 'wavelet-LHH_gldm_SmallDependenceEmphasis', 'wavelet-HLH_gldm_DependenceEntropy', 'wavelet-HLH_gldm_DependenceNonUniformityNormalized', 'wavelet-HLH_gldm_DependenceVariance', 'wavelet-HLH_gldm_SmallDependenceEmphasis', 'wavelet-HHH_gldm_DependenceNonUniformityNormalized', 'wavelet-HHH_gldm_SmallDependenceEmphasis', 'wavelet-HHH_gldm_SmallDependenceHighGrayLevelEmphasis', 'wavelet-HHH_gldm_SmallDependenceLowGrayLevelEmphasis', 'wavelet-HHH_glrlm_RunLengthNonUniformityNormalized', 'wavelet-HHH_glrlm_ShortRunEmphasis', 'wavelet-LLL_firstorder_Energy', 'wavelet

In [6]:
radi_train.head()

Unnamed: 0,original_shape_SurfaceVolumeRatio,original_firstorder_Energy,original_firstorder_TotalEnergy,original_gldm_SmallDependenceEmphasis,original_glszm_LargeAreaEmphasis,original_glszm_ZoneVariance,wavelet-LHL_ngtdm_Coarseness,wavelet-LHH_gldm_DependenceEntropy,wavelet-LHH_gldm_DependenceNonUniformity,wavelet-LHH_gldm_DependenceNonUniformityNormalized,...,lbp-3D-m2_gldm_SmallDependenceLowGrayLevelEmphasis,lbp-3D-m2_glrlm_LongRunEmphasis,lbp-3D-m2_glrlm_LongRunHighGrayLevelEmphasis,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis,lbp-3D-m2_glrlm_RunLengthNonUniformityNormalized,lbp-3D-m2_glrlm_RunPercentage,lbp-3D-m2_glrlm_ShortRunEmphasis,lbp-3D-m2_glrlm_ShortRunHighGrayLevelEmphasis,lbp-3D-m2_glrlm_ShortRunLowGrayLevelEmphasis,Transition
121,0.646722,0.332975,0.332975,0.743187,0.16565,0.167302,0.57782,0.911714,0.259788,0.135969,...,0.528112,0.183653,0.183653,0.183653,0.633024,0.674847,0.658862,0.658862,0.658862,AD-AD
198,0.646124,0.279352,0.279352,0.605184,0.155027,0.152532,0.572456,1.0,0.180384,0.024628,...,0.64087,0.216355,0.216355,0.216355,0.627178,0.663384,0.71096,0.71096,0.71096,AD-AD
60,0.142776,0.710948,0.710948,0.226206,0.453043,0.461957,0.154716,0.231668,0.715294,0.761206,...,0.13245,0.902781,0.902781,0.902781,0.09802,0.161192,0.249003,0.249003,0.249003,CN-CN
183,0.189767,0.634511,0.634511,0.164879,0.743082,0.74857,0.135693,0.124846,0.68103,0.818409,...,0.062202,0.667708,0.667708,0.667708,0.144705,0.201226,0.055939,0.055939,0.055939,MCI-MCI
136,0.464281,0.39072,0.39072,0.475275,0.233301,0.237776,0.410224,0.815456,0.342529,0.214382,...,0.39056,0.455996,0.455996,0.455996,0.414524,0.473195,0.514465,0.514465,0.514465,CN-CN


In [7]:
radi_test.head()

Unnamed: 0,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,...,lbp-3D-k_glszm_ZoneEntropy,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age
0,0.0,0.495054,0.0,0.07874,0.53679,0.0,0.47528,0.316034,0.373173,0.537721,...,0.594669,0.566023,0.280746,0.33531,0.296551,0.468272,0.325807,0.303302,1.0,0.829132
1,0.0,0.737522,0.0,0.401575,0.782512,0.0,0.634503,0.31288,0.307338,0.430676,...,0.489953,0.517394,0.468663,0.476645,0.175973,0.478687,0.350439,0.181719,1.0,0.478992
2,0.0,0.756433,0.0,0.259843,0.711157,0.0,0.571241,0.474246,0.453088,0.349322,...,0.424795,0.458065,0.458581,0.501734,0.17075,0.53937,0.419945,0.171175,1.0,0.456583
3,0.0,0.798346,0.0,0.291339,0.835729,0.0,0.50371,0.421881,0.488026,0.50594,...,0.414088,0.31117,0.64863,0.88364,0.032452,0.780213,0.719552,0.033423,1.0,0.59944
4,0.0,0.291812,0.0,0.377953,0.62423,0.0,0.663701,0.354026,0.312766,0.354822,...,0.599878,0.391127,0.449245,0.502824,0.178915,0.593427,0.470218,0.181981,1.0,0.431373
