In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats


# Load the training radiomics table
radi = pd.read_csv("sbsppdaa24/train_radiomics_hipocamp.csv")

# Drop the per-row identifier columns
radi = radi.drop(columns=["Mask", "ID", "Image"])

# Drop every non-numeric column except the target 'Transition'.
# The check is "is not numeric" rather than `dtype == 'object'` so that text
# columns loaded with pandas' dedicated string dtype (which does not compare
# equal to 'object') are dropped as well.
columns_to_drop = [
    col for col in radi.columns
    if col != 'Transition' and not pd.api.types.is_numeric_dtype(radi[col])
]
radi = radi.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Drop columns that carry no information: a single distinct value, or no
# non-missing value at all. nunique() ignores NaN, so an all-NaN column
# reports 0 and would slip through an `== 1` test.
same_value_cols = [col for col in radi.columns if radi[col].nunique() <= 1]
radi = radi.drop(columns=same_value_cols)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Feature matrix: every remaining column except the target variable
features = radi.drop(columns=['Transition'])

radi.info()



# Z-score every feature column (scipy standardises along axis 0 by default).
# nan_policy='omit' leaves missing values out of each column's mean/std; with
# the default 'propagate', a single NaN turns that whole column's z-scores
# into NaN and the filter below would then discard every row.
z_scores = stats.zscore(features, nan_policy='omit')

# A value is treated as an outlier once its |z| reaches this threshold
threshold = 7

# Keep a row unless at least one of its features is an outlier. Expressed as
# "not any(|z| >= threshold)" so that NaN z-scores (missing values) compare
# False and do not, by themselves, cause a row to be dropped. On data without
# NaN this is identical to "all(|z| < threshold)".
mask = ~(abs(z_scores) >= threshold).any(axis=1)

# Filter the DataFrame
radi_no_outliers = radi[mask]
print(f"Removed {len(radi) - len(radi_no_outliers)} outliers using Z-Score method.")

# Continue with an independent copy of the filtered rows
radi = radi_no_outliers.copy()
radi.info()

## Normalization
# Standardise the feature columns with scikit-learn's StandardScaler.
# The target column 'Transition' is not part of `features`, so it is left
# untouched.
scaler = StandardScaler()

# Learn the per-column statistics first, then overwrite the feature columns
# with their scaled values
scaler.fit(radi[features.columns])
radi[features.columns] = scaler.transform(radi[features.columns])

# Summarise the frame after scaling
radi.info()


In [None]:
## Feature Selection?