In [1]:
import os
import random
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [2]:
#def set_seed(seed: int):
#    random.seed(seed) # Python
#    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
#    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo
#
#set_seed(25)

In [3]:
# tratamento da seed - pytorch
def set_seed(seed: int, deterministic_torch: bool = True):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo
    if torch is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  #multi-GPU
        if deterministic_torch:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

set_seed(25)

In [4]:
# Load dataset
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_test = pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")

In [5]:
# Process the train dataset
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Drop non-numeric columns except for 'Transition'
columns_to_drop = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Apply MinMax scaling to float columns
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Apply PCA for feature reduction
X = radi.drop(columns=["Transition"])
y = radi["Transition"]

#pca = PCA(n_components=100)  # Reduzir para 100 componentes principais
pca = PCA(n_components=0.95)  # Manter 95% da variância
X_new = pca.fit_transform(X)
print(f"Explained variance ratio by the selected components: {pca.explained_variance_ratio_}")

# Atualizar o dataset com as features selecionadas por PCA
radi = pd.concat([pd.DataFrame(X_new), y.reset_index(drop=True)], axis=1)

# Save the full train dataset
radi.to_csv("train_full_prep8.csv", index=False)

# Split into 80% train and 20% test
radi_train, radi_val = train_test_split(radi, test_size=0.2, random_state=25, stratify=radi['Transition'])
radi_train.to_csv("train_split_prep8.csv", index=False)
radi_val.to_csv("test_split_prep8.csv", index=False)

# Process the test dataset
radi_test.drop(columns=["Mask", "ID", "Image"], inplace=True)
radi_test.drop(columns=[col for col in radi_test.columns if radi_test[col].dtype == 'object' and col != 'Transition'], inplace=True)

# Apply MinMax scaling to the test dataset
radi_test[float_cols] = scaler.transform(radi_test[float_cols])  # Use the same scaler fit on the train dataset
radi_test.to_csv("test_processed_prep8.csv", index=False)

print("Datasets processed and saved:")
print("- train_full.csv")
print("- train_split.csv")
print("- test_split.csv")
print("- test_processed.csv")

Dropped 16 non-numeric columns.
Explained variance ratio by the selected components: [0.24427861 0.09200595 0.06917477 0.04761493 0.03808485 0.03116146
 0.02753408 0.02605698 0.02431753 0.02244226 0.01937008 0.01855864
 0.01747804 0.01597171 0.01394937 0.01312707 0.01223367 0.01148411
 0.0104861  0.00988995 0.00912912 0.00876786 0.00861957 0.00718737
 0.00631522 0.00611298 0.00549291 0.00543674 0.00520475 0.00497446
 0.00460528 0.00433789 0.00409521 0.00382897 0.00370376 0.00367592
 0.00354274 0.00330666 0.00319163 0.00310061 0.00294144 0.00286235
 0.00280893 0.00276648 0.00264579 0.00254348 0.00246812 0.00240065
 0.00225993 0.00223323 0.00220251 0.00218165 0.00210505 0.0020621
 0.00203664 0.00193069 0.00188494 0.00185776 0.00181575 0.00171035
 0.00169408 0.00162887 0.00161066 0.0015783  0.00154158 0.00150837
 0.00146784 0.00144462 0.00140627 0.00137553 0.00136166 0.00131372
 0.00129875 0.00123775 0.00122286 0.00118792 0.00115772 0.00113762
 0.00111194 0.0011047  0.00109086 0.00107131 

In [6]:
radi_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,Transition
121,-8.080423,1.276555,-2.334277,-0.833947,-1.182073,2.674713,-2.904571,-0.254947,0.590182,0.836974,...,-0.213215,0.866165,-0.593363,-0.267484,-0.098924,-0.128475,-0.08186,-0.086579,0.294547,AD-AD
198,-7.060199,0.315946,2.118766,-3.234942,1.258259,0.11614,0.437605,-0.930076,-0.006035,1.304745,...,0.272014,-0.177198,0.201195,0.074073,-0.209389,0.230735,0.100742,0.2033,0.196581,AD-AD
60,3.04589,3.10928,1.857784,2.620811,-0.620561,-1.246016,-0.340885,0.803081,2.365815,-0.089576,...,0.254726,-0.223767,-0.002233,-0.133534,-0.224957,0.427002,0.021854,-0.470093,0.047177,CN-CN
183,7.149649,-4.249797,-1.699928,0.859942,1.162599,0.258648,-2.540134,0.302058,0.611576,-0.318842,...,0.012654,-0.102776,0.704213,-1.051156,0.179303,-0.03941,0.251523,0.108722,0.45783,MCI-MCI
136,-1.119899,-1.14413,4.345195,-0.10208,-0.670097,-0.144158,-0.066341,2.179653,0.279401,-0.976311,...,-0.166623,0.469709,0.382697,-0.375414,0.09809,-0.099255,0.173318,-0.073045,0.145152,CN-CN


In [7]:
radi_test.head()

Unnamed: 0,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,...,lbp-3D-k_glszm_ZoneEntropy,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age
0,0.0,0.495054,0.0,0.07874,0.53679,0.0,0.47528,0.316034,0.373173,0.537721,...,0.594669,0.566023,0.280746,0.33531,0.296551,0.468272,0.325807,0.303302,1.0,0.829132
1,0.0,0.737522,0.0,0.401575,0.782512,0.0,0.634503,0.31288,0.307338,0.430676,...,0.489953,0.517394,0.468663,0.476645,0.175973,0.478687,0.350439,0.181719,1.0,0.478992
2,0.0,0.756433,0.0,0.259843,0.711157,0.0,0.571241,0.474246,0.453088,0.349322,...,0.424795,0.458065,0.458581,0.501734,0.17075,0.53937,0.419945,0.171175,1.0,0.456583
3,0.0,0.798346,0.0,0.291339,0.835729,0.0,0.50371,0.421881,0.488026,0.50594,...,0.414088,0.31117,0.64863,0.88364,0.032452,0.780213,0.719552,0.033423,1.0,0.59944
4,0.0,0.291812,0.0,0.377953,0.62423,0.0,0.663701,0.354026,0.312766,0.354822,...,0.599878,0.391127,0.449245,0.502824,0.178915,0.593427,0.470218,0.181981,1.0,0.431373
