In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats


# Load dataset
radi = pd.read_csv("sbsppdaa24/train_radiomics_hipocamp.csv")

# Drop unique identifier columns.
# Reassigning (instead of inplace=True) keeps each step a plain
# "new frame from old frame" expression.
radi = radi.drop(columns=["Mask", "ID", "Image"])

# Drop non-numeric columns except for 'Transition'.
# Testing for "not numeric" rather than dtype == 'object' also catches text
# columns when pandas infers a dedicated string dtype instead of object.
columns_to_drop = [
    col
    for col in radi.columns
    if col != 'Transition' and not pd.api.types.is_numeric_dtype(radi[col])
]
radi = radi.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Drop columns where all entries are the same.
# nunique() ignores NaN, so `<= 1` additionally removes columns that are
# entirely NaN (count 0), which carry no information either.
unique_counts = radi.nunique()
same_value_cols = unique_counts[unique_counts <= 1].index.tolist()
radi = radi.drop(columns=same_value_cols)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Define the features (excluding the target variable).
# drop() returns a new frame, so `features` is a snapshot that is not
# affected by later reassignments of `radi`.
features = radi.drop(columns=['Transition'])

radi.info()

Dropped 16 non-numeric columns.
Dropped 148 columns with the same value for every entry.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(1994), int64(19), object(1)
memory usage: 4.7+ MB


In [24]:


# Calculate Z-scores for numeric features only.
# nan_policy='omit' computes each column's mean/std from its non-NaN values,
# so a single missing entry no longer turns the whole column's scores into NaN.
z_scores = np.asarray(
    stats.zscore(features.to_numpy(dtype=float), axis=0, nan_policy='omit')
)

# Set a threshold for identifying outliers
threshold = 7

# A row is an outlier when any feature lies at or beyond the threshold.
# NaN scores compare False here, so missing values are not counted as outliers.
row_is_outlier = (np.abs(z_scores) >= threshold).any(axis=1)

# Create a mask of rows to keep, labelled with the index of `features`, then
# align it to the current index of `radi`. The explicit alignment keeps the
# cell working when it is re-run after `radi` has already been filtered below.
mask = pd.Series(~row_is_outlier, index=features.index)
mask = mask.reindex(radi.index, fill_value=False)

# Filter the DataFrame
radi_no_outliers = radi.loc[mask]
print(f"Removed {len(radi) - len(radi_no_outliers)} outliers using Z-Score method.")

# Final DataFrame info
radi = radi_no_outliers.copy()
radi.info()

Removed 27 outliers using Z-Score method.
<class 'pandas.core.frame.DataFrame'>
Index: 278 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(1994), int64(19), object(1)
memory usage: 4.3+ MB


In [22]:
## Normalization

# Columns to standardise: every feature column; the target is left as-is.
feature_cols = list(features.columns)

# Learn the per-column scaling parameters from the current rows of `radi`,
# then write the rescaled values back into the same columns.
scaler = StandardScaler()
scaler.fit(radi[feature_cols])
radi[feature_cols] = scaler.transform(radi[feature_cols])

# Show the resulting frame summary
radi.info()


Unnamed: 0,mean,std,min,50%,max
diagnostics_Image-original_Mean,5.810469,0.613858,4.321602,5.811449,7.583210
diagnostics_Image-original_Maximum,173.504918,24.108871,128.000000,172.000000,255.000000
diagnostics_Mask-original_VoxelNum,6714.790164,1066.815670,3609.000000,6723.000000,9453.000000
original_shape_Elongation,0.411819,0.028993,0.330890,0.412479,0.491496
original_shape_Flatness,0.096916,0.009739,0.075142,0.096609,0.144082
...,...,...,...,...,...
original_glszm_LargeAreaLowGrayLevelEmphasis,14307.840225,6655.077498,3481.176680,13200.374011,62858.129774
original_glszm_LowGrayLevelZoneEmphasis,0.256322,0.088366,0.167330,0.236614,0.738095
original_glszm_SizeZoneNonUniformity,22.814045,5.360075,11.033333,22.282051,46.500000
original_glszm_SizeZoneNonUniformityNormalized,0.268504,0.046213,0.155584,0.262518,0.438965


In [None]:
## Feature Selection (TODO: not implemented yet)