In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats


In [2]:
def set_seed(seed: int):
    """Seed the random sources this notebook relies on.

    Seeds, in this order, Python's ``random`` module and NumPy's legacy
    global generator, then stores the seed as a string in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to each seeding call.
    """
    # Python's built-in generator first, then NumPy's global one
    # (the original note says this is the generator sklearn uses).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Operating-system level: exported through the process environment.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(25)

In [3]:
# Read the train and test radiomics tables from their CSV files.
train_csv_path = "../sbsppdaa24/train_radiomics_hipocamp.csv"
test_csv_path = "../sbsppdaa24/test_radiomics_hipocamp.csv"

radi = pd.read_csv(train_csv_path)
radi_test = pd.read_csv(test_csv_path)

In [4]:

# ---------------------------------------------------------------------------
# Training frame: prune columns
# ---------------------------------------------------------------------------
# Remove the identifier columns (the same three are removed from the test
# frame further down).
radi = radi.drop(columns=["Mask", "ID", "Image"])

# Discard every object-dtype column, keeping only the target 'Transition'.
columns_to_drop = [
    name
    for name, dtype in radi.dtypes.items()
    if dtype == 'object' and name != 'Transition'
]
radi = radi.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Discard columns that hold a single distinct value across all rows.
distinct_counts = radi.nunique()
same_value_cols = distinct_counts[distinct_counts == 1].index.tolist()
radi = radi.drop(columns=same_value_cols)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Feature matrix = everything except the target variable.
features = radi.drop(columns=['Transition'])

radi.info()

# ---------------------------------------------------------------------------
# Training frame: Z-score outlier filter
# ---------------------------------------------------------------------------
# Column-wise Z-scores of the feature matrix.
z_scores = stats.zscore(features)

# Cut-off on the absolute Z-score.
threshold = 7

# A row is kept only when every one of its features has an absolute
# Z-score strictly below the cut-off.
mask = (np.abs(z_scores) < threshold).all(axis=1)

radi_no_outliers = radi.loc[mask]
print(f"Removed {len(radi) - len(radi_no_outliers)} outliers using Z-Score method.")

# Continue with an independent copy of the filtered rows.
radi = radi_no_outliers.copy()
radi.info()

# ---------------------------------------------------------------------------
# Training frame: standardisation and export
# ---------------------------------------------------------------------------
# The scaler is fitted on the training features here; the same fitted
# instance is reused on the test frame below.
scaler = StandardScaler()
feature_cols = features.columns
radi[feature_cols] = scaler.fit_transform(radi[feature_cols])

radi.info()
radi.to_csv("train_full_prep2.csv", index=False)

# ---------------------------------------------------------------------------
# Test frame: replay the same column drops and scaling, then export
# ---------------------------------------------------------------------------
radi_test = (
    radi_test
    .drop(columns=["Mask", "ID", "Image"])
    .drop(columns=columns_to_drop)
    .drop(columns=same_value_cols)
)
# transform only -- the scaler is not refitted on the test rows.
radi_test[feature_cols] = scaler.transform(radi_test[feature_cols])
radi_test.info()
radi_test.to_csv("test_processed_prep2.csv", index=False)


Dropped 16 non-numeric columns.
Dropped 148 columns with the same value for every entry.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(1994), int64(19), object(1)
memory usage: 4.7+ MB
Removed 27 outliers using Z-Score method.
<class 'pandas.core.frame.DataFrame'>
Index: 278 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(1994), int64(19), object(1)
memory usage: 4.3+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 278 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(2013), object(1)
memory usage: 4.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 2013 entries, diagnostics_Image-original_Mean to Age
dtypes: float64(2013)
memory usage: 1.5 MB
