<a href="https://colab.research.google.com/github/crystaljwang/tm10007_group_3/blob/main/main_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

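The cells below install the `tm10007_ml` package, clone the GitHub repository that contains the GIST radiomic-feature data, import the required libraries, and load the data.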

In [60]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [61]:
# Run this to use from colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

fatal: destination path 'tm10007_ml' already exists and is not an empty directory.


In [62]:
# Switch to the dataset folder of the cloned repository so that the relative path
# used when reading the CSV resolves. The absolute path assumes a Colab runtime.
%cd /content/tm10007_ml/worcgist

/content/tm10007_ml/worcgist


In [63]:
# ----- Import necessary libraries -----

from pathlib import Path
import pandas as pd
import math

from scipy.stats import shapiro 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA

In [64]:
# ---- Import data -----

# Location of the radiomic feature table, relative to the current working directory.
# Named `data_path` instead of `dir` so the built-in `dir()` is not shadowed.
data_path = Path('.') / 'GIST_radiomicFeatures.csv'

# The first column of the CSV is used as the row index
data = pd.read_csv(data_path, index_col=0)

# Note: the column count still includes the 'label' column
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 246
The number of columns: 494


# Splitting the data

In [65]:
# Encode the string class labels as integers: 'GIST' -> 1, 'non-GIST' -> 0
label_encoding = {'GIST': 1, 'non-GIST': 0}
data['label'] = data['label'].replace(label_encoding)

# The label column is the target; every other column is a feature
y = data['label']
X = data.drop(columns=['label'])

# Hold out 20% of all samples as the test set (fixed seed for reproducibility)
X_train_tot, X_test, y_train_tot, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Set aside 15% of the remaining samples as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tot, y_train_tot, test_size=0.15, random_state=42
)
for subset in (X_train, X_val):
    print(subset.shape)

(166, 493)
(30, 493)


# Exploring the data

In [66]:
# Percentage GIST vs Non-GIST (labels were encoded as 1 = GIST, 0 = non-GIST)
counts = y_train.value_counts(normalize=True)
percentage_nongist = counts[0] * 100
percentage_gist = counts[1] * 100

print(f'Percentage of non-GIST in training set: {percentage_nongist:.2f}%')
print(f'Percentage of GIST in training set: {percentage_gist:.2f}%')

# Check for missing data.
# An exception is raised instead of calling exit(): exit() shuts down the
# notebook kernel, whereas an exception only stops the execution of this run.
if X_train.isnull().sum().sum() > 0:
    raise ValueError('Missing data found.')
print('No missing data found.')

# Check for categorical values (columns stored with dtype 'object')
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
if len(categorical_cols) > 0:
    raise ValueError(f'Categorical columns found: {categorical_cols}')
print('No categorical columns found.')

# Perform the Shapiro-Wilk test for normality on every feature separately.
# The test must receive a single column; passing the complete DataFrame tests
# all values of all features as one sample and yields the same p-value each time.
p_values_above_threshold = []
for col in X_train.columns:
    feature = X_train[col]
    if feature.nunique() <= 1:
        # Shapiro-Wilk is not meaningful for constant (zero-range) data,
        # so a constant feature is counted as not normally distributed
        p_values_above_threshold.append(False)
        continue
    stat, p = shapiro(feature)
    # p > 0.05: normality is not rejected at the 5% significance level
    p_values_above_threshold.append(p > 0.05)

percent_above_threshold = sum(p_values_above_threshold) / len(p_values_above_threshold) * 100
print(f'{percent_above_threshold:.1f} percent of the data is normally distributed.')

Percentage of non-GIST in training set: 51.20%
Percentage of GIST in training set: 48.80%
No missing data found.
No categorical columns found.




0.0 percent of the data is normally distributed.


# Preprocessing

In [67]:
# ----- Outliers -----

def replace_outliers(data):
    """
    Clips the outliers of every column of a DataFrame to a lower and upper bound.

    The bounds are computed per column from the quartiles: Q1 - 1.5 * IQR and
    Q3 + 1.5 * IQR, where Q1 and Q3 are the 25th and 75th percentiles and
    IQR = Q3 - Q1. Values below the lower bound are replaced by the lower bound,
    values above the upper bound by the upper bound; other values are unchanged.

    The DataFrame is modified in place. The same object is returned so that the
    call can also be used as an expression.

    :param data: The DataFrame to be filtered (columns are assumed to be numeric)
    :return: The same DataFrame, with the outliers replaced by the lower or upper bound for each column
    """
    # Calculate the lower and upper bounds based on each column's quartiles and interquartile range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Vectorised clipping per column instead of an element-wise Python lambda
    for col in data.columns:
        data[col] = data[col].clip(lower=lower_bound[col], upper=upper_bound[col])

    return data

# Clip the outliers of the training features to the per-column bounds.
# replace_outliers returns the frame it was given, so the rebinding only makes
# the update of X_train explicit; the frame is then displayed as the cell output.
X_train = replace_outliers(X_train)
X_train

Unnamed: 0_level_0,PREDICT_original_sf_compactness_avg_2.5D,PREDICT_original_sf_compactness_std_2.5D,PREDICT_original_sf_rad_dist_avg_2.5D,PREDICT_original_sf_rad_dist_std_2.5D,PREDICT_original_sf_roughness_avg_2.5D,PREDICT_original_sf_roughness_std_2.5D,PREDICT_original_sf_convexity_avg_2.5D,PREDICT_original_sf_convexity_std_2.5D,PREDICT_original_sf_cvar_avg_2.5D,PREDICT_original_sf_cvar_std_2.5D,...,PREDICT_original_phasef_phasesym_median_WL3_N5,PREDICT_original_phasef_phasesym_std_WL3_N5,PREDICT_original_phasef_phasesym_skewness_WL3_N5,PREDICT_original_phasef_phasesym_kurtosis_WL3_N5,PREDICT_original_phasef_phasesym_peak_WL3_N5,PREDICT_original_phasef_phasesym_peak_position_WL3_N5,PREDICT_original_phasef_phasesym_range_WL3_N5,PREDICT_original_phasef_phasesym_energy_WL3_N5,PREDICT_original_phasef_phasesym_quartile_range_WL3_N5,PREDICT_original_phasef_phasesym_entropy_WL3_N5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GIST-241_0,0.919711,0.026795,13.872294,1.531448,1.844892,0.561529,0.986238,0.013349,0.012755,0.004150,...,0.0,0.060505,3.479130,13.489012,0.0,0,0.244214,16.075934,0.0,9.110960
GIST-134_0,0.875846,0.046112,18.666025,1.878233,4.182247,1.093752,0.967428,0.015162,0.012247,0.011375,...,0.0,0.030719,6.827330,52.495437,0.0,0,0.119416,7.777367,0.0,8.269621
GIST-075_0,0.632667,0.205854,37.421093,8.675805,8.813201,8.584728,0.876926,0.143398,0.052784,0.027349,...,0.0,0.112046,2.807397,8.440284,0.0,0,0.426226,929.025179,0.0,13.471238
GIST-030_0,0.890580,0.045457,35.839782,3.276323,5.932870,2.562711,0.973856,0.019543,0.009805,0.006012,...,0.0,0.025844,8.598213,86.667692,0.0,0,0.080184,85.701057,0.0,11.719822
GIST-014_0,0.847617,0.070240,21.654106,2.512771,5.214407,1.648799,0.965204,0.021454,0.016304,0.010681,...,0.0,0.036298,4.665801,24.509730,0.0,0,0.156680,37.248314,0.0,10.983970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GIST-082_0,0.659215,0.218467,29.190690,6.532695,11.847639,6.067955,0.878442,0.129774,0.055453,0.043879,...,0.0,0.072068,3.053260,8.762275,0.0,0,0.300532,485.962430,0.0,13.362151
GIST-127_0,0.719563,0.156002,58.697289,10.151337,8.665013,7.582687,0.909120,0.107587,0.034021,0.017337,...,0.0,0.051366,4.993405,28.079970,0.0,0,0.224431,866.211849,0.0,14.384736
GIST-136_0,0.815825,0.111567,11.809252,1.804329,5.776660,1.047962,0.961775,0.043180,0.027073,0.016079,...,0.0,0.143213,1.272648,0.426010,0.0,0,0.468745,122.985385,0.0,10.516433
GIST-100_0,0.842369,0.084674,25.574073,2.370240,6.023072,2.500956,0.957884,0.027907,0.010805,0.006488,...,0.0,0.060656,4.100240,18.881629,0.0,0,0.256453,80.430861,0.0,11.084788


In [68]:
# Data scaling
# The min-max scaler is fitted on the training set only; the fitted scaler is
# then applied to the training, validation and test sets alike.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(subset) for subset in (X_train, X_val, X_test)
)

In [69]:
# Remove all constant (zero-variance) features

# Wrap the training data in a DataFrame (default integer column labels) so the
# rejected features can be listed by their column label
X_train = pd.DataFrame(X_train)

# Determine on the training data which features have zero variance
zero_var_filter = VarianceThreshold(threshold=0)
zero_var_filter.fit(X_train)

# Column labels of the features that are rejected by the filter
zero_var_columns = X_train.columns[~zero_var_filter.get_support()].tolist()

# Drop the rejected features from the training, validation and test data
X_train, X_val, X_test = (
    zero_var_filter.transform(subset) for subset in (X_train, X_val, X_test)
)

In [70]:
# ----- Feature selection -----

# Fit a Lasso model (with a very small alpha) on the training data and let
# SelectFromModel keep the features according to the 'median' threshold
lasso_selector = SelectFromModel(
    estimator=Lasso(alpha=10**(-10), max_iter=1000),
    threshold='median',
)
lasso_selector.fit(X_train, y_train)

# Column positions of the features that are kept by the selector
lasso_list = pd.DataFrame(X_train).columns[lasso_selector.get_support()].tolist()

# Reduce the training data and report how many features remain
n_original = X_train.shape[1]
X_train = lasso_selector.transform(X_train)
n_selected = X_train.shape[1]
print(f"Selected {n_selected} from {n_original} features.")

# NOTE(review): only X_train is reduced in this cell; X_val and X_test still
# have to be passed through lasso_selector.transform before they can be used
# with a model trained on the selected features - confirm this happens later.

Selected 228 from 455 features.


In [71]:
# Sanity check: shape of the training data (rows, columns) after feature selection
X_train.shape

(166, 228)

In [72]:
# ----- Feature extraction -----

# PCA is already imported in the imports cell at the top of the notebook.
# n_components=0.95: keep the principal components needed for 95% of the variance.
# The projection is learned from the training data and then applied to it.
pca = PCA(n_components=0.95).fit(X_train)
X_train = pca.transform(X_train)

print(f"Selected {X_train.shape[1]} features to be used for classification.")

# NOTE(review): X_val and X_test are not projected in this cell; they need the
# same fitted pca.transform before evaluation - confirm this happens later.

Selected 50 features to be used for classification.
