<a href="https://colab.research.google.com/github/crystaljwang/tm10007_group_3/blob/feature_extraction/feature_extraction_Crystal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The code below loads the GIST data from GitHub.

In [121]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [122]:
# Run this to use from colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

fatal: destination path 'tm10007_ml' already exists and is not an empty directory.


In [123]:
%cd /content/tm10007_ml/worcgist

/content/tm10007_ml/worcgist


In [124]:
# Import necessary libraries
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [125]:
dir = Path('.') / 'GIST_radiomicFeatures.csv'
data = pd.read_csv(dir, index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
data.info()

The number of samples: 246
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>
Index: 246 entries, GIST-001_0 to GIST-246_0
Columns: 494 entries, label to PREDICT_original_phasef_phasesym_entropy_WL3_N5
dtypes: float64(468), int64(25), object(1)
memory usage: 951.3+ KB


In [126]:
# ----- PREPROCESSING -----

# Replace label values from string to binary
data['label'] = data['label'].replace({'GIST': 1, 'non-GIST': 0})

# Separate the features and labels
X = data.drop(['label'], axis=1)
y = data['label']

# Data scaling
X = StandardScaler().fit_transform(X)

In [127]:
# ----- SPLITTING THE DATA -----

# Split the data into random train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(196, 493)
(50, 493)


In [128]:
# ----- FEATURE SELECTION -----

# Remove all constant (zero-variance) features
X_train = pd.DataFrame(X_train)

zero_var_filter = VarianceThreshold(threshold=0)
zero_var_filter.fit(X_train)
zero_var_columns = [column for column in X_train.columns if column not in X_train.columns[zero_var_filter.get_support()]]
X_new = zero_var_filter.transform(X_train)

removed_features = [data.columns[index] for index in zero_var_columns]
print('The following constant features were removed:')
for feature in removed_features:
  print(f'- {feature}')

# Select features according to the k highest scores
X_kbest = SelectKBest(f_classif, k=10).fit_transform(X_new, y_train)

The following constant features were removed:
- PREDICT_original_tf_LBP_min_R3_P12
- PREDICT_original_tf_LBP_kurtosis_R3_P12
- PREDICT_original_tf_LBP_peak_R3_P12
- PREDICT_original_tf_LBP_min_R8_P24
- PREDICT_original_tf_LBP_peak_R8_P24
- PREDICT_original_tf_LBP_min_R15_P36
- PREDICT_original_tf_LBP_peak_R15_P36
- PREDICT_original_phasef_monogenic_entropy_WL3_N5
- PREDICT_original_phasef_phasecong_kurtosis_WL3_N5
- PREDICT_original_phasef_phasecong_peak_WL3_N5
- PREDICT_original_phasef_phasecong_entropy_WL3_N5
- PREDICT_original_phasef_phasesym_kurtosis_WL3_N5
- PREDICT_original_phasef_phasesym_peak_WL3_N5


In [129]:
X_kbest.shape

(196, 10)