In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, then
    stores the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed handed to every generator.
    """
    # Python's own generator first, then NumPy's global generator
    # (the one scikit-learn falls back to when no random_state is given).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only influences processes spawned from here - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(25)

In [3]:
# Load the training and test CSV files (paths are relative to the notebook).
train_csv, test_csv = (
    "../sbsppdaa24/train_radiomics_hipocamp.csv",
    "../sbsppdaa24/test_radiomics_hipocamp.csv",
)
radi = pd.read_csv(train_csv)
radi_test = pd.read_csv(test_csv)


In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Preprocessing pipeline: every step is fitted on the training set and only
# re-applied to the test set.
#   1. drop identifier, uninformative (constant / empty) and non-numeric columns
#   2. MinMax-scale the remaining numeric features
#   3. project onto the principal components that keep 95% of the variance
# NOTE: `radi` and `radi_test` are rebound to their preprocessed versions below,
# so re-run the loading cell before executing this cell a second time
# (otherwise the identifier / target columns are already gone and it fails).

# ---------------------------------------------------------------- training set
# Drop unique identifier columns
radi = radi.drop(columns=["Mask", "ID", "Image"])
print("Dropped unique identifier columns from training dataset.")

# Drop columns that carry no information. nunique() ignores NaN, so `<= 1`
# catches both single-valued columns and columns that are entirely NaN
# (the latter would otherwise reach PCA, which rejects NaN input).
same_value_cols_train = radi.columns[radi.nunique() <= 1].tolist()
radi = radi.drop(columns=same_value_cols_train)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Separate target feature (Transition) from the dataset
target = radi["Transition"]
radi = radi.drop(columns=["Transition"])

# Identify non-numeric columns: anything that is not a numeric dtype. Using the
# complement of 'number' guarantees every column left afterwards gets scaled.
non_numeric_cols = radi.select_dtypes(exclude="number").columns
print(f"Non-numeric columns identified: {list(non_numeric_cols)}")

# Drop (rather than encode) the non-numeric columns
radi = radi.drop(columns=non_numeric_cols)

# Apply MinMax scaling to the numeric columns (now: all remaining columns).
# These columns, in this order, are the feature set the scaler and PCA expect.
numeric_cols = radi.select_dtypes(include="number").columns
scaler = MinMaxScaler()
radi = pd.DataFrame(
    scaler.fit_transform(radi[numeric_cols]),
    columns=numeric_cols,
    index=radi.index,
)
print("Scaled numeric columns using MinMaxScaler.")

# PCA transformation; a float n_components keeps as many components as needed
# to explain that fraction of the variance.
pca = PCA(n_components=0.95)
radi_pca = pca.fit_transform(radi)

# Name each component after the three original features with the largest
# absolute loadings (strongest contributor first).
original_feature_names = radi.columns
components_with_original_features = [
    "+".join(original_feature_names[np.argsort(-np.abs(loadings))[:3]])
    for loadings in pca.components_
]

# Convert PCA result to DataFrame using the generated names and re-attach the
# target (index reset so it lines up with the fresh RangeIndex of the PCA frame)
radi_pca_df = pd.DataFrame(radi_pca, columns=components_with_original_features)
radi_pca_df = pd.concat([radi_pca_df, target.reset_index(drop=True)], axis=1)

# Check final data structure
print("Final dataset structure:")
radi_pca_df.info()

# Save the processed training dataset
radi_pca_df.to_csv("train_full_prep4.csv", index=False)

# -------------------------------------------------------------------- test set
# Keep exactly the training features, selected by name and in training order.
# This replaces the separate identifier / constant / non-numeric drops and does
# not depend on the column order of the test CSV.
missing_in_test = numeric_cols.difference(radi_test.columns)
if len(missing_in_test) > 0:
    raise KeyError(f"Test dataset is missing training features: {list(missing_in_test)}")
radi_test = radi_test[numeric_cols]

# Apply MinMax scaling using the scaler fitted on the training data
radi_test = pd.DataFrame(
    scaler.transform(radi_test),
    columns=numeric_cols,
    index=radi_test.index,
)

# Apply PCA transformation to the test data using the previously fitted PCA
radi_test_pca = pca.transform(radi_test)

# Convert PCA-transformed test data to DataFrame using the same column names as for the training data
radi_test_pca_df = pd.DataFrame(radi_test_pca, columns=components_with_original_features)

# Check final test dataset structure
print("Final test dataset structure:")
radi_test_pca_df.info()

# Save the processed test dataset
radi_test_pca_df.to_csv("test_processed_prep4.csv", index=False)


Dropped unique identifier columns from training dataset.
Dropped 159 constant-value columns from training dataset.
Non-numeric columns identified: ['diagnostics_Image-original_Hash', 'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_CenterOfMassIndex', 'diagnostics_Mask-original_CenterOfMass']
Scaled numeric columns using MinMaxScaler.
Final dataset structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 87 columns):
 #   Column                                                                                                                                                           Non-Null Count  Dtype  
---  ------                                                                                                                                                           --------------  -----  
 0   lbp-3D-m1_firstorder_Median+log-sigma-2-0-mm-3D_glcm_ClusterShade+log-sigma-2-0-mm-3D_glrlm_Sho