In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

In [3]:
# Load dataset
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_test = pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")

Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


In [4]:
# Process the train dataset
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Drop non-numeric columns except for 'Transition'
columns_to_drop = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Apply MinMax scaling to float columns
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Save the full train dataset
radi.to_csv("train_full_prep1.csv", index=False)

# Split into 80% train and 20% test
radi_train, radi_val = train_test_split(radi, test_size=0.2, random_state=25,stratify=radi['Transition'])
radi_train.to_csv("train_split_prep1.csv", index=False)
radi_val.to_csv("test_split_prep1.csv", index=False)

# Process the test dataset
radi_test.drop(columns=["Mask", "ID", "Image"], inplace=True)
radi_test.drop(columns=[col for col in radi_test.columns if radi_test[col].dtype == 'object' and col != 'Transition'], inplace=True)

# Apply MinMax scaling to the test dataset
radi_test[float_cols] = scaler.transform(radi_test[float_cols])  # Use the same scaler fit on the train dataset
radi_test.to_csv("test_processed_prep1.csv", index=False)

print("Datasets processed and saved:")
print("- train_full.csv")
print("- train_split.csv")
print("- test_split.csv")
print("- test_processed.csv")


Dropped 16 non-numeric columns.
Datasets processed and saved:
- train_full.csv
- train_split.csv
- test_split.csv
- test_processed.csv


In [5]:
radi_train.head()

Unnamed: 0,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
121,0.0,0.21362,0.0,0.314961,0.335558,0.0,0.508006,0.341425,0.340244,0.427038,...,0.025589,0.601761,0.526909,0.221443,0.813066,0.758702,0.227083,0.0,0.560224,AD-AD
198,0.0,0.307972,0.0,0.433071,0.253251,0.0,0.848413,0.422372,0.28106,0.172539,...,0.550955,0.128763,0.086476,0.731297,0.2773,0.175126,0.742073,0.0,0.481793,AD-AD
60,0.0,0.557907,0.0,0.637795,0.726899,0.0,0.266324,0.205972,0.222178,0.517104,...,0.685323,0.332006,0.34512,0.267696,0.373086,0.246672,0.271671,1.0,0.577031,CN-CN
183,0.0,0.879452,0.0,0.031496,0.668891,0.0,0.726613,0.238604,0.165218,0.337588,...,0.502064,0.396552,0.52326,0.165003,0.583905,0.464665,0.169135,1.0,0.210084,MCI-MCI
136,0.0,0.353786,0.0,0.519685,0.426762,0.0,0.638138,0.238391,0.180202,0.36545,...,0.387723,0.343192,0.211267,0.453709,0.366279,0.240928,0.471903,1.0,0.708683,CN-CN


In [1]:
radi_test.head()

NameError: name 'radi_test' is not defined