<a href="https://colab.research.google.com/github/crystaljwang/tm10007_group_3/blob/preprocessing/ML_Julia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The code below loads the GIST data from GitHub. Some imports and settings.

In [38]:
## Import data

# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

# Run this to use from colab environment
!git clone https://github.com/jveenland/tm10007_ml.git 

%cd /content/tm10007_ml/worcgist   

  Preparing metadata (setup.py) ... [?25l[?25hdone
Cloning into 'tm10007_ml'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 83 (delta 13), reused 12 (delta 12), pack-reused 61[K
Unpacking objects: 100% (83/83), 67.93 MiB | 15.59 MiB/s, done.
/content/tm10007_ml/worcgist


General imports, settings and calculations

In [39]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import metrics

# Save directory
dir = Path('.') / 'GIST_radiomicFeatures.csv'
data = pd.read_csv(dir, index_col=0)

# Size dataset
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 246
The number of columns: 494


Preprocessing the data

In [40]:
##PREPROCESSING

# Replace label values from string to binary
data['label'] = data['label'].replace({'GIST': 1, 'non-GIST': 0})

# Separate the features and labels
X = data.drop(['label'], axis=1)
y = data['label']

# Data scaling
X = StandardScaler().fit_transform(X)

Splitting the data

In [41]:
# ----- SPLITTING THE DATA -----

# Split the data into random train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

# Make dataframe
X_train = pd.DataFrame(X_train)

(196, 493)
(50, 493)


Testing

In [42]:
# Percentage Gist vs Non-Gist
counts = y_train.value_counts(normalize=True)
percentage_nongist = counts[0] * 100
percentage_gist = counts[1] * 100

print(f'Percentage of non-GIST in training set: {percentage_nongist:.2f}%')
print(f'Percentage of GIST in training set: {percentage_gist:.2f}%')

# Check for missing data
if X_train.isnull().sum().sum() > 0:
    print('Missing data found.')
    exit()
else:
    print('No missing data found.')

# Check for categorial values
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
if len(categorical_cols) > 0:
    print(f'Categorical columns found: {categorical_cols}')
    exit()
else:
    print('No categorical columns found.')

import math
from scipy.stats import shapiro 

# Perform Shapiro-Wilk test for normality
p_values_above_threshold = []
for col in X_train.columns:
  stat, p = shapiro((X_train))
  p_values_above_threshold.append(p > 0.05)
  #print(f'Shapiro test for column {col}: statistic = {stat:.3f}, p-value = {p:.3f}')

percent_above_threshold = sum(p_values_above_threshold) / len(p_values_above_threshold) * 100
print(f'{percent_above_threshold:.1f} percent of the data is normally distributed.')

#print('stat=%.3f, p=%.3f\n' % (stat, p))


Percentage of non-GIST in training set: 51.02%
Percentage of GIST in training set: 48.98%
No missing data found.
No categorical columns found.




0.0 percent of the data is normally distributed.


In [None]:
# Visualisation outliers

import seaborn as sns
sns.boxplot(X_train[5])

Outliers

In [None]:
def replace_outliers(data):
    """
    Replaces the outliers in a DataFrame with the lower or upper bound.

    :param data: The DataFrame to be filtered
    :return: A new DataFrame with the outliers replaced by the lower or upper bound for each column
    """
    # Calculate the lower and upper bounds based on each column's median and interquartile range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Replace the outliers with the lower or upper bound
    for col in data.columns:
        data[col] = data[col].apply(lambda x: upper_bound[col] if x > upper_bound[col] else x)
        data[col] = data[col].apply(lambda x: lower_bound[col] if x < lower_bound[col] else x)

    return data

# Replace the outliers in each column with the lower or upper bound
replace_outliers(X_train)

#sns.boxplot(replaced_df[5])
