In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int):
    """Seed the global random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, then stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python's built-in generator first, then NumPy's global one (the original
    # author notes that this is the generator scikit-learn uses).
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    # Exported through the process environment.
    # NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so
    # this presumably only affects child processes, not the running kernel -
    # confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

# Seed the generators once, before any data is loaded or split; 25 is the same
# value passed as random_state to train_test_split further down.
set_seed(25)

In [3]:
# Load datasets
# Read the hippocampus radiomics training and test tables from CSV. The paths
# are relative, so they resolve against the kernel's working directory.
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_test = pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")


In [4]:

# Preprocess the hippocampus radiomics data: drop identifier, non-numeric and
# constant columns, min-max scale the numeric features, then write the full
# training set, the test set and a stratified train/validation split to CSV.

# Output files, named once so the log messages cannot drift from the paths.
train_full_csv = "train_full_prep3.csv"
test_processed_csv = "test_processed_prep3.csv"
train_split_csv = "train_split_prep3.csv"
val_split_csv = "test_split_prep3.csv"  # holds the validation split despite its name

# Drop unique identifier columns from training data
radi = radi.drop(columns=["Mask", "ID", "Image"])
print("Dropped unique identifier columns from training dataset.")

# Drop non-numeric columns except for 'Transition' (the stratification label)
columns_to_drop_train = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi = radi.drop(columns=columns_to_drop_train)
print(f"Dropped {len(columns_to_drop_train)} non-numeric columns from training dataset.")

# Drop columns where all entries are the same
same_value_cols_train = [col for col in radi.columns if radi[col].nunique() == 1]
radi = radi.drop(columns=same_value_cols_train)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Apply MinMax scaling to the float and int columns of the training data
# (float_cols covers both dtypes despite its name).
# NOTE(review): the scaler is fit on the full training frame before the
# train/validation split below, so validation rows contribute to the fitted
# min/max - confirm this is acceptable for how the split files are used.
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Save the full processed training dataset
radi.to_csv(train_full_csv, index=False)
print(f"Full processed training dataset saved as '{train_full_csv}'.")

# Drop the same columns in the test dataset. errors="ignore" lets the last two
# drops skip columns that are absent from the test file, which is what the
# original comments intended; without it DataFrame.drop raises KeyError.
radi_test = radi_test.drop(columns=["Mask", "ID", "Image"])
radi_test = radi_test.drop(columns=columns_to_drop_train, errors="ignore")
radi_test = radi_test.drop(columns=same_value_cols_train, errors="ignore")

# Apply MinMax scaling to the test data with the scaler fit on the training
# data; test values outside the training range are not clipped to [0, 1].
radi_test[float_cols] = scaler.transform(radi_test[float_cols])

# Save the processed test dataset
radi_test.to_csv(test_processed_csv, index=False)
print(f"Processed test dataset saved as '{test_processed_csv}'.")

# Split the training dataset into train and validation (80/20), stratified on
# the 'Transition' label so both parts keep the same class proportions.
radi_train, radi_val = train_test_split(radi, test_size=0.2, random_state=25, stratify=radi['Transition'])

# Save the split datasets
radi_train.to_csv(train_split_csv, index=False)
radi_val.to_csv(val_split_csv, index=False)
print(f"Split datasets saved as '{train_split_csv}' and '{val_split_csv}'.")


Dropped unique identifier columns from training dataset.
Dropped 16 non-numeric columns from training dataset.
Dropped 148 constant-value columns from training dataset.
Full processed training dataset saved as 'train_full.csv'.
Processed test dataset saved as 'test_processed.csv'.
Split datasets saved as 'train_split.csv' and 'test_split.csv'.


In [5]:
# Preview the first rows of the training split.
radi_train.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
121,0.21362,0.314961,0.335558,0.508006,0.341425,0.340244,0.427038,0.436834,0.499171,0.675933,...,0.025589,0.601761,0.526909,0.221443,0.813066,0.758702,0.227083,0.0,0.560224,AD-AD
198,0.307972,0.433071,0.253251,0.848413,0.422372,0.28106,0.172539,0.232358,0.250244,0.566272,...,0.550955,0.128763,0.086476,0.731297,0.2773,0.175126,0.742073,0.0,0.481793,AD-AD
60,0.557907,0.637795,0.726899,0.266324,0.205972,0.222178,0.517104,0.619771,0.666109,0.574898,...,0.685323,0.332006,0.34512,0.267696,0.373086,0.246672,0.271671,1.0,0.577031,CN-CN
183,0.879452,0.031496,0.668891,0.726613,0.238604,0.165218,0.337588,0.363567,0.379887,0.821212,...,0.502064,0.396552,0.52326,0.165003,0.583905,0.464665,0.169135,1.0,0.210084,MCI-MCI
136,0.353786,0.519685,0.426762,0.638138,0.238391,0.180202,0.36545,0.371797,0.333667,0.469002,...,0.387723,0.343192,0.211267,0.453709,0.366279,0.240928,0.471903,1.0,0.708683,CN-CN


In [6]:
# Preview the first rows of the processed test frame.
radi_test.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZoneEntropy,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age
0,0.495054,0.07874,0.53679,0.47528,0.316034,0.373173,0.537721,0.486557,0.469768,0.862239,...,0.594669,0.566023,0.280746,0.33531,0.296551,0.468272,0.325807,0.303302,1.0,0.829132
1,0.737522,0.401575,0.782512,0.634503,0.31288,0.307338,0.430676,0.514912,0.540905,0.875525,...,0.489953,0.517394,0.468663,0.476645,0.175973,0.478687,0.350439,0.181719,1.0,0.478992
2,0.756433,0.259843,0.711157,0.571241,0.474246,0.453088,0.349322,0.470691,0.629847,0.875525,...,0.424795,0.458065,0.458581,0.501734,0.17075,0.53937,0.419945,0.171175,1.0,0.456583
3,0.798346,0.291339,0.835729,0.50371,0.421881,0.488026,0.50594,0.672928,0.707844,1.03584,...,0.414088,0.31117,0.64863,0.88364,0.032452,0.780213,0.719552,0.033423,1.0,0.59944
4,0.291812,0.377953,0.62423,0.663701,0.354026,0.312766,0.354822,0.365214,0.250244,0.589231,...,0.599878,0.391127,0.449245,0.502824,0.178915,0.593427,0.470218,0.181981,1.0,0.431373


In [7]:
# Preview the first rows of the full processed training frame.
radi.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,0.468027,0.393701,0.598392,0.555087,0.455539,0.39272,0.28956,0.31062,0.25494,0.562431,...,0.184251,0.633393,0.646173,0.125643,0.735632,0.658619,0.127238,0.0,0.610644,CN-CN
1,0.281221,0.488189,0.713552,0.58897,0.397306,0.338081,0.311579,0.359443,0.335667,0.61962,...,0.651481,0.323882,0.458297,0.195642,0.492265,0.37379,0.198729,1.0,0.787115,CN-CN
2,0.76498,0.212598,0.530116,0.727218,0.876712,1.0,0.431089,0.622046,0.58396,0.932585,...,0.49244,0.258168,0.679235,0.134842,0.783349,0.776907,0.132002,1.0,0.728291,AD-AD
3,0.651078,0.338583,0.799452,0.547963,0.355153,0.372946,0.453707,0.55516,0.498358,0.760667,...,0.569706,0.433476,0.511718,0.154368,0.506896,0.375681,0.156912,1.0,0.680672,CN-MCI
4,0.382225,0.173228,0.406742,0.504096,0.080905,0.07806,0.545945,0.447539,0.424594,0.566272,...,0.450216,0.239867,0.441321,0.243251,0.680762,0.556234,0.249892,0.0,0.694678,CN-CN


In [8]:
# Load datasets
# Re-read the raw hippocampus training table (rebinding `radi`, which until
# now held the scaled frame) and load the occipital control table.
# NOTE(review): in the recorded run this cell raised FileNotFoundError for the
# control CSV, so `radi_control` was never created and the later cells show no
# execution count - confirm the file name and location.
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_control = pd.read_csv("../sbsppdaa24/train_radiomics_occipital_CONTROL.csv")
# Print a column/dtype summary of each frame.
radi_control.info()
radi.info()

FileNotFoundError: [Errno 2] No such file or directory: '../sbsppdaa24/train_radiomics_occipital_CONTROL.csv'

In [None]:
# Print the column index of each frame, presumably to compare the two schemas.
print(radi.columns)
print(radi_control.columns)

In [None]:
# Preprocess the hippocampus training data again, then apply the same column
# drops and the same fitted scaler to the control dataset and write it to CSV.

# Drop unique identifier columns from training data
radi = radi.drop(columns=["Mask", "ID", "Image"])
print("Dropped unique identifier columns from training dataset.")

# Drop non-numeric columns except for 'Transition'
columns_to_drop_train = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi = radi.drop(columns=columns_to_drop_train)
print(f"Dropped {len(columns_to_drop_train)} non-numeric columns from training dataset.")

# Drop columns where all entries are the same
same_value_cols_train = [col for col in radi.columns if radi[col].nunique() == 1]
radi = radi.drop(columns=same_value_cols_train)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Apply MinMax scaling to the float and int columns of the training data
# (float_cols covers both dtypes despite its name).
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# The processed training frame is not written to disk in this cell.
print("Full processed training dataset")

# Drop the same columns in the control dataset. errors="ignore" lets the last
# two drops skip columns that are absent from the control file, which is what
# the original comments intended; without it DataFrame.drop raises KeyError.
radi_control = radi_control.drop(columns=["Mask", "ID", "Image"])
radi_control = radi_control.drop(columns=columns_to_drop_train, errors="ignore")
radi_control = radi_control.drop(columns=same_value_cols_train, errors="ignore")

# Apply MinMax scaling to the control data with the scaler fit on the
# training data. float_cols is reused as computed above: scaling only turns
# those columns into floats, so re-selecting them would give the same set.
radi_control[float_cols] = scaler.transform(radi_control[float_cols])

# Save the processed control dataset
control_processed_csv = "control_processed_prep3.csv"
radi_control.to_csv(control_processed_csv, index=False)
print(f"Processed control dataset saved as '{control_processed_csv}'.")



In [None]:
# Preview the first rows of the training frame.
radi.head()

In [None]:
# Preview the first rows of the control frame.
radi_control.head()

In [None]:
# Print the column/dtype summary of the training frame.
radi.info()

In [None]:
# Print the column/dtype summary of the control frame.
radi_control.info()