In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
def set_seed(seed: int) -> None:
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    then stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        Per the Python documentation, ``PYTHONHASHSEED`` is read at interpreter
        start-up, so the assignment below only affects processes launched from
        this kernel, not the hashing of the kernel that is already running.
    """
    # random.seed -> Python's built-in generator.
    # np.random.seed -> NumPy's global generator, which scikit-learn uses
    # whenever an estimator is given random_state=None.
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)
    # Operating-system level setting, inherited by child processes.
    os.environ["PYTHONHASHSEED"] = f"{seed}"


set_seed(25)

In [3]:
# Load datasets.
# This step must start from the outputs of preprocessing step 5 (train and test).
train_csv, test_csv = "train_full_prep5.csv", "test_processed_prep5.csv"
radi = pd.read_csv(train_csv)
radi_test = pd.read_csv(test_csv)


In [4]:
# Target labels of the training table, and the number of rows per class
# (printed to show how imbalanced the classes are before resampling).
y = radi.loc[:, "Transition"]
class_counts = y.value_counts()
print(class_counts)

Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64


In [5]:

# Separate features and target
X = radi.drop(columns=["Transition"])
y = radi["Transition"]

# SMOTE builds each synthetic row from a real row and one of its k nearest
# neighbours of the same class, so every class needs more than k members.
# Cap k by the rarest class instead of letting fit_resample fail with an
# opaque nearest-neighbours error. With the current data the rarest class
# (CN-MCI) has 10 rows, so the library default of 5 is kept unchanged.
original_counts = y.value_counts()
smallest_class = int(original_counts.min()) if len(original_counts) else 0
if smallest_class < 2:
    raise ValueError(
        "SMOTE needs at least 2 samples in every class; "
        f"the rarest class has {smallest_class}."
    )
k_neighbors = min(5, smallest_class - 1)

# Apply SMOTE
# NOTE(review): the resampled table mixes real and synthetic rows. If it is
# later split or cross-validated, held-out folds will contain synthetic
# points derived from training rows — confirm how downstream steps use it.
smote = SMOTE(random_state=25, k_neighbors=k_neighbors)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Wrapping in pd.Series is a no-op when imblearn already returns a Series.
resampled_counts = pd.Series(y_resampled).value_counts()

# Sanity checks: same feature columns, no row lost, every class brought up
# to the size of the majority class.
assert X_resampled.shape[1] == X.shape[1], "SMOTE changed the number of features"
assert len(X_resampled) == len(y_resampled) >= len(y), "unexpected resampled size"
assert resampled_counts.nunique() == 1, "classes are not balanced after SMOTE"

# Check the class distribution after applying SMOTE
print("Original class distribution:")
print(original_counts)

print("\nResampled class distribution:")
print(resampled_counts)


Original class distribution:
Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64

Resampled class distribution:
Transition
CN-CN      96
AD-AD      96
CN-MCI     96
MCI-AD     96
MCI-MCI    96
Name: count, dtype: int64


In [6]:
# Rebuild one training table: the resampled feature columns (in the original
# column order) followed by the resampled target as the "Transition" column.
features_frame = pd.DataFrame(X_resampled, columns=X.columns)
target_column = pd.Series(y_resampled, name="Transition")
resampled_data = pd.concat([features_frame, target_column], axis=1)


In [7]:
# Preview the first five rows of the balanced training table.
resampled_data.head(n=5)

Unnamed: 0,Age,wavelet-HHH_glcm_SumSquares,wavelet-HHH_gldm_DependenceEntropy,wavelet-HHH_gldm_DependenceNonUniformity,wavelet-HHH_gldm_DependenceVariance,wavelet-HHH_gldm_GrayLevelVariance,wavelet-HHH_gldm_HighGrayLevelEmphasis,wavelet-HHH_gldm_LargeDependenceEmphasis,wavelet-HHH_gldm_LowGrayLevelEmphasis,wavelet-HHH_gldm_SmallDependenceEmphasis,...,log-sigma-5-0-mm-3D_glcm_SumSquares,wavelet-HLH_glcm_Idn,logarithm_firstorder_Variance,wavelet-HLH_glrlm_LongRunHighGrayLevelEmphasis,wavelet-HLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glcm_JointEntropy,square_gldm_LargeDependenceEmphasis,wavelet-HHH_firstorder_Variance,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis,Transition
0,0.610644,0.610901,0.480943,0.580378,0.500774,0.614496,0.802767,0.883296,0.197233,0.181651,...,0.463114,0.070429,0.098761,0.114143,0.234412,0.382456,0.401727,0.118693,0.588281,CN-CN
1,0.787115,0.997724,0.149889,0.735681,0.130872,0.994018,0.519986,0.731747,0.480014,0.099057,...,0.338657,0.021656,0.064365,0.083615,0.455309,0.221619,0.261642,0.165755,0.910061,CN-CN
2,0.728291,0.969676,0.615477,0.478993,0.526055,0.977648,0.401968,0.325272,0.598032,0.553046,...,0.939358,0.131325,1.0,0.063101,0.230525,0.950604,0.306903,0.371362,0.167578,AD-AD
3,0.680672,0.993088,0.209982,0.808283,0.224884,0.982898,0.411712,0.778498,0.588288,0.14226,...,0.611031,0.045084,0.052615,0.089171,0.370317,0.305336,0.811043,0.250621,0.701622,CN-MCI
4,0.694678,0.973374,0.260323,0.43002,0.229915,0.959592,0.37517,0.616315,0.62483,0.235611,...,0.616082,0.069653,0.015879,0.070012,0.6155,0.101869,0.948325,0.039205,0.431275,CN-CN


In [8]:
# Preview the first five rows of the test table (it has no "Transition" column).
radi_test.head(n=5)

Unnamed: 0,Age,wavelet-HHH_glcm_SumSquares,wavelet-HHH_gldm_DependenceEntropy,wavelet-HHH_gldm_DependenceNonUniformity,wavelet-HHH_gldm_DependenceVariance,wavelet-HHH_gldm_GrayLevelVariance,wavelet-HHH_gldm_HighGrayLevelEmphasis,wavelet-HHH_gldm_LargeDependenceEmphasis,wavelet-HHH_gldm_LowGrayLevelEmphasis,wavelet-HHH_gldm_SmallDependenceEmphasis,...,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformityNormalized,log-sigma-5-0-mm-3D_glcm_SumSquares,wavelet-HLH_glcm_Idn,logarithm_firstorder_Variance,wavelet-HLH_glrlm_LongRunHighGrayLevelEmphasis,wavelet-HLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glcm_JointEntropy,square_gldm_LargeDependenceEmphasis,wavelet-HHH_firstorder_Variance,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis
0,0.829132,0.987814,0.243898,0.556744,0.232205,0.999434,0.492122,0.605012,0.507878,0.191339,...,0.016754,0.573948,0.044963,0.057979,0.068132,0.581859,0.460535,0.540937,0.295796,0.470842
1,0.478992,0.85101,0.207867,0.77441,0.178449,0.759688,0.224713,0.717771,0.775287,0.112027,...,0.343457,0.334026,0.047994,0.068466,0.097288,0.170989,0.370086,0.363712,0.095451,0.854966
2,0.456583,0.894222,0.49318,0.672719,0.473686,0.992289,0.525433,0.856318,0.474567,0.292658,...,0.20642,0.617251,0.074171,0.094331,0.099454,0.379891,0.432248,0.697082,0.105147,0.602285
3,0.59944,0.895957,0.45403,0.783722,0.398621,0.988201,0.536259,0.765404,0.463741,0.270362,...,0.234556,0.628901,0.060898,0.069565,0.110873,0.287961,0.248202,0.810975,0.29116,0.670455
4,0.431373,0.61183,0.172753,0.646559,0.151513,0.870936,0.666652,0.666384,0.333348,0.171687,...,0.385135,0.724864,0.012155,0.072304,0.073087,0.573106,0.370051,0.676229,0.26448,0.625845


In [9]:
# Export to CSV as the prep-6 outputs, without the index column.
# The test table is written exactly as loaded; only the training table was resampled.
for frame, out_path in (
    (resampled_data, "train_full_prep6.csv"),
    (radi_test, "test_processed_prep6.csv"),
):
    frame.to_csv(out_path, index=False)