In [1]:
!pip install pyradiomics SimpleITK

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyradiomics
  Downloading pyradiomics-3.0.1.tar.gz (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting SimpleITK
  Downloading SimpleITK-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting pykwalify>=1.6.0
  Downloading pykwalify-1.8.0-py2.py3-none-any.whl (24 kB)
Collecting docopt>=0.6.2
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ruamel.yaml>=0.16.0
  Downloading ruamel.yaml-0.17.24-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m2.0 MB/s[0m eta [36m0:

In [32]:
import sys
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv

import radiomics
from radiomics import featureextractor
import SimpleITK as sitk

from google.colab import drive
drive.mount('/content/drive')
%matplotlib inline

radiomics.setVerbosity(40)

seed = 42
DATA_PATH = "drive/MyDrive/Colab Notebooks/CT-PET Classifier/data"
# DATA_PATH = "drive/MyDrive/Colab Notebooks/CT-PET Classifier"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
def load_haralick_features(extractor):
  if os.path.exists(f"{DATA_PATH}/haralick.csv"):
    data = pd.read_csv(f"{DATA_PATH}/haralick.csv", index_col=0)
    labels = data.pop("y")
    return data, labels

  features, labels = [], []
  with open(f"{DATA_PATH}/file_paths.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)
    for imagepath, maskpath in reader:
      image = sitk.ReadImage(imagepath)
      mask = sitk.ReadImage(maskpath)
      result = extractor.execute(image, mask)

      features.append(np.array(list(result.values())))
      labels.append(0 if "BENIGN" in imagepath else 1)
  
  return pd.DataFrame(features, columns=result.keys()), pd.Series(labels)

### Classical Features

In [None]:
def load_features():
  data, labels = [], []
  for sbj in range(100):
    if sbj % 10 == 9:
      print(f"{sbj + 1} / 100 patients loaded")

    if os.path.exists(f"{DATA_PATH}/CLASS1_MALIGNANT/LCp{sbj:04}_biobank"):
      label = "CLASS1_MALIGNANT"
    elif os.path.exists(f"{DATA_PATH}/CLASS2_BENIGN/LCp{sbj:04}_biobank"):
      label = "CLASS2_BENIGN"
    else:
      continue

    print(f"{DATA_PATH}/{label}/LCp{sbj:04}_biobank/LCp{sbj:04}")
    continue

    pet = nib.load(f"{DATA_PATH}/{label}/LCp{sbj:04}_biobank/LCp{sbj:04}_PT_partition.nii")
    ct = nib.load(f"{DATA_PATH}/{label}/LCp{sbj:04}_biobank/LCp{sbj:04}_CT_partition.nii")

    pet_res = pet.header["pixdim"][1:4]
    ct_res = ct.header["pixdim"][1:4]

    pet_scan = np.array(pet.dataobj) 
    ct_scan = np.array(ct.dataobj)

    pet_mask = np.array(nib.load(f"{DATA_PATH}/{label}/LCp{sbj:04}_biobank/LCp{sbj:04}_PT_mask.nii").dataobj)
    ct_mask = np.array(nib.load(f"{DATA_PATH}/{label}/LCp{sbj:04}_biobank/LCp{sbj:04}_CT_mask.nii").dataobj)

    # Calculate patient features
    pet_vol = pet_res.prod() * pet_mask.sum()
    ct_vol = ct_res.prod() * ct_mask.sum()
    mean_glucose = pet_scan[pet_mask != 0].mean()
    data.append((pet_vol, ct_vol, mean_glucose))
    labels.append(1 if label == "CLASS1_MALIGNANT" else 0)
  
  return np.array(data), np.array(labels)

## Classical ML approaches

In [53]:
extractor = featureextractor.RadiomicsFeatureExtractor()
extractor.disableAllFeatures()
extractor.enableFeatureClassByName("glcm")
extractor.addProvenance(False)
data, labels = load_haralick_features(extractor)

INFO:radiomics.featureextractor:No valid config parameter, using defaults: {'minimumROIDimensions': 2, 'minimumROISize': None, 'normalize': False, 'normalizeScale': 1, 'removeOutliers': None, 'resampledPixelSpacing': None, 'interpolator': 'sitkBSpline', 'preCrop': False, 'padDistance': 5, 'distances': [1], 'force2D': False, 'force2Ddimension': 0, 'resegmentRange': None, 'label': 1, 'additionalInfo': True}
INFO:radiomics.featureextractor:Enabled image types: {'Original': {}}
INFO:radiomics.featureextractor:Enabled features: {'firstorder': [], 'glcm': [], 'gldm': [], 'glrlm': [], 'glszm': [], 'ngtdm': [], 'shape': []}


In [52]:
data.dtypes

original_glcm_Autocorrelation       float64
original_glcm_ClusterProminence     float64
original_glcm_ClusterShade          float64
original_glcm_ClusterTendency       float64
original_glcm_Contrast              float64
original_glcm_Correlation           float64
original_glcm_DifferenceAverage     float64
original_glcm_DifferenceEntropy     float64
original_glcm_DifferenceVariance    float64
original_glcm_Id                    float64
original_glcm_Idm                   float64
original_glcm_Idmn                  float64
original_glcm_Idn                   float64
original_glcm_Imc1                  float64
original_glcm_Imc2                  float64
original_glcm_InverseVariance       float64
original_glcm_JointAverage          float64
original_glcm_JointEnergy           float64
original_glcm_JointEntropy          float64
original_glcm_MCC                   float64
original_glcm_MaximumProbability    float64
original_glcm_SumAverage            float64
original_glcm_SumEntropy        

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

In [39]:
def print_metrics(y_true, y_pred, model_name) -> None:
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print(model_name)
    print(f"Sensitivity: {tp / (tp + fn) * 100:.1f}%")
    print(f"Specificity: {tn / (tn + fp) * 100:.1f}%")
    print(f"Accuracy: {accuracy_score(y_true, y_pred) * 100:.1f}%")
    print(f"ROC-AUC: {roc_auc_score(y_true, y_pred) * 100:.1f}%")
    print("Confusion Matrix:")
    print(tn, fp)
    print(fn, tp)
    print()

In [54]:
svc = SVC(kernel="linear", random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=seed)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print_metrics(y_test, y_pred, "Linear SVC")

KeyboardInterrupt: ignored

In [41]:
rf = RandomForestClassifier(criterion="entropy", random_state=seed)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print_metrics(y_test, y_pred, "Random Forest")

Random Forest
Sensitivity: 92.9%
Specificity: 100.0%
Accuracy: 95.8%
ROC-AUC: 96.4%
Confusion Matrix:
10 0
1 13



In [47]:
from sklearn.decomposition import PCA,KernelPCA
pca=PCA(0.99)
pca.fit(data)

In [51]:
data

Unnamed: 0,original_glcm_Autocorrelation,original_glcm_ClusterProminence,original_glcm_ClusterShade,original_glcm_ClusterTendency,original_glcm_Contrast,original_glcm_Correlation,original_glcm_DifferenceAverage,original_glcm_DifferenceEntropy,original_glcm_DifferenceVariance,original_glcm_Id,...,original_glcm_InverseVariance,original_glcm_JointAverage,original_glcm_JointEnergy,original_glcm_JointEntropy,original_glcm_MCC,original_glcm_MaximumProbability,original_glcm_SumAverage,original_glcm_SumEntropy,original_glcm_SumSquares,y
0,1.000000,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,-3.203427e-16,0.000000,1.000000,...,0.000000,1.000000,1.000000,-3.203427e-16,1.000000,1.000000,2.000000,-3.203427e-16,0.000000,0
1,276.389664,8.519291e+03,-236.394185,57.277441,23.209608,0.411545,3.588747,3.219321e+00,8.666041,0.364567,...,0.268240,16.363306,0.008151,7.478332e+00,0.476557,0.026394,32.726612,4.767019e+00,20.121762,0
2,1.000000,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,-3.203427e-16,0.000000,1.000000,...,0.000000,1.000000,1.000000,-3.203427e-16,1.000000,1.000000,2.000000,-3.203427e-16,0.000000,0
3,106.206222,2.391478e+02,-22.500014,6.424266,2.893222,0.371838,1.163500,2.034986e+00,1.520185,0.602747,...,0.447635,10.262562,0.069975,4.655313e+00,0.411430,0.180815,20.525125,3.167033e+00,2.329372,0
4,1.000000,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,-3.203427e-16,0.000000,1.000000,...,0.000000,1.000000,1.000000,-3.203427e-16,1.000000,1.000000,2.000000,-3.203427e-16,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,669.515907,2.816160e+04,-936.035214,67.098914,20.692854,0.515699,2.883921,3.066245e+00,11.507501,0.461716,...,0.334384,25.648240,0.024774,6.808630e+00,0.591413,0.078901,51.296480,4.515170e+00,21.947942,1
116,1.000000,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,-3.203427e-16,0.000000,1.000000,...,0.000000,1.000000,1.000000,-3.203427e-16,1.000000,1.000000,2.000000,-3.203427e-16,0.000000,1
117,255.284665,4.104076e+02,-25.504637,6.656902,3.348914,0.318779,1.164550,1.979078e+00,1.868679,0.612233,...,0.451702,15.951690,0.069955,4.684613e+00,0.391474,0.146813,31.903381,3.187460e+00,2.501454,1
118,1.000000,0.000000e+00,0.000000,0.000000,0.000000,1.000000,0.000000,-3.203427e-16,0.000000,1.000000,...,0.000000,1.000000,1.000000,-3.203427e-16,1.000000,1.000000,2.000000,-3.203427e-16,0.000000,1


In [45]:
pca.feature_names_in_

array(['original_glcm_Autocorrelation', 'original_glcm_ClusterProminence',
       'original_glcm_ClusterShade', 'original_glcm_ClusterTendency',
       'original_glcm_Contrast', 'original_glcm_Correlation',
       'original_glcm_DifferenceAverage',
       'original_glcm_DifferenceEntropy',
       'original_glcm_DifferenceVariance', 'original_glcm_Id',
       'original_glcm_Idm', 'original_glcm_Idmn', 'original_glcm_Idn',
       'original_glcm_Imc1', 'original_glcm_Imc2',
       'original_glcm_InverseVariance', 'original_glcm_JointAverage',
       'original_glcm_JointEnergy', 'original_glcm_JointEntropy',
       'original_glcm_MCC', 'original_glcm_MaximumProbability',
       'original_glcm_SumAverage', 'original_glcm_SumEntropy',
       'original_glcm_SumSquares', 'y'], dtype=object)

In [35]:
haralick = data
haralick["y"] = labels
haralick.to_csv(f"{DATA_PATH}/haralick.csv")