<a href="https://colab.research.google.com/github/ebbilge/Lung-CA-CNN/blob/main/Lung_CA_Classification_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used later
# in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install python-gdcm, the PyPI package behind the `gdcm` module imported in the imports cell.
!pip install python-gdcm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-gdcm
  Downloading python_gdcm-3.0.21-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-gdcm
Successfully installed python-gdcm-3.0.21


In [None]:
# Install pydicom, which this notebook uses to read the DICOM files.
!pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

import pydicom
import scipy.ndimage
import gdcm

import glob

from skimage import measure 
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from skimage.morphology import disk, opening, closing
from tqdm import tqdm
from pathlib import Path
from IPython.display import HTML
from PIL import Image

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from os import listdir, mkdir

In [None]:
def load_scan(path):
    """Read every ``.dcm`` file below ``path`` and return the slices sorted by z.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that is walked recursively with ``os.walk``. Only files whose
        suffix is exactly ``".dcm"`` are read.

    Returns
    -------
    slices : list
        The objects returned by ``pydicom.dcmread(..., force=True)``, sorted in
        ascending order of ``float(ImagePositionPatient[2])``.
    count : int
        Number of files that were read successfully (``len(slices)``).

    Notes
    -----
    Files that raise ``IOError`` while being read are reported with ``print``
    and skipped. A dataset without ``ImagePositionPatient`` makes the final
    sort raise; because ``force=True`` is used this is presumably possible for
    non-DICOM files carrying a ``.dcm`` suffix -- TODO confirm.
    """
    slices = []
    for root, _, filenames in os.walk(path):
        for filename in filenames:
            dcm_path = Path(root, filename)
            if dcm_path.suffix != ".dcm":
                continue
            try:
                dicom = pydicom.dcmread(dcm_path, force=True)
            except IOError:
                print(f"Can't import {dcm_path.stem}")
            else:
                slices.append(dicom)

    # Sort once, after the whole tree has been read. Sorting inside the walk
    # loop re-sorted the full list after every visited directory.
    slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))

    return slices, len(slices)

In [None]:
def set_outside_scanner_to_air(raw_pixelarrays):
    """Overwrite every value ``<= -1000`` with ``0``, in place.

    The input array itself is modified and then returned, so the caller's
    reference and the return value are the same object.
    """
    outside_mask = raw_pixelarrays <= -1000
    raw_pixelarrays[outside_mask] = 0
    return raw_pixelarrays

In [None]:
def transform_to_hu(slices):
    """Build an int16 volume from ``slices`` and apply each slice's rescale.

    Steps, in order:
      1. stack every ``pixel_array`` and cast the stack to ``int16``;
      2. pass the stack through ``set_outside_scanner_to_air``;
      3. per slice, multiply by ``RescaleSlope`` (only when it is not 1) and
         add ``RescaleIntercept``.

    Returns a new ``int16`` array with one entry per element of ``slices``.
    """
    volume = np.stack([dcm.pixel_array for dcm in slices]).astype(np.int16)
    volume = set_outside_scanner_to_air(volume)

    # convert to HU
    for idx, dcm in enumerate(slices):
        intercept = dcm.RescaleIntercept
        slope = dcm.RescaleSlope

        if slope != 1:
            # The product is float64; storing it into the int16 volume casts
            # it back to int16.
            volume[idx] = slope * volume[idx].astype(np.float64)

        volume[idx] += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

In [None]:
def preprocess_to_hu_scans(scan_properties, my_shape, output_dir, n_patients=5):
    """Convert patient scans to resized HU volumes and save each as ``.npy``.

    Parameters
    ----------
    scan_properties : pandas.DataFrame
        Needs a ``patient_pth`` column. Rows are looked up with ``.loc[i]`` for
        ``i = 0, 1, ...`` -- assumes the default integer index; TODO confirm if
        the frame is ever re-indexed.
    my_shape : tuple of int
        ``(my_shape[0], my_shape[1])`` is the per-slice shape of the saved
        array; the same tuple is handed to ``resize_scan``.
    output_dir : str
        Existing directory that receives ``<i>_hu_scans_512.npy``. The
        ``_512`` suffix is fixed and does not follow ``my_shape``.
    n_patients : int or None, default 5
        Maximum number of rows to process. ``None`` means every row. The value
        is clamped to ``len(scan_properties)``, so a frame with fewer rows no
        longer raises ``KeyError``.

    Returns
    -------
    None
        Results are written to disk; the index and slice count of each patient
        are printed as progress.
    """
    available = len(scan_properties)
    limit = available if n_patients is None else min(n_patients, available)

    for i in range(limit):
        pth = scan_properties.loc[i].patient_pth
        scans, count = load_scan(pth)
        print(i, count)

        # resize_scan opens each slice as a 32-bit integer image, hence int32.
        hu_scans = transform_to_hu(scans).astype(np.int32)

        prepared_scans = np.zeros(
            (hu_scans.shape[0], my_shape[0], my_shape[1]), dtype=np.int16
        )
        for s in range(hu_scans.shape[0]):
            prepared_scans[s] = resize_scan(hu_scans[s, :, :], my_shape)

        # np.save appends the ".npy" extension.
        np.save(output_dir + "/" + str(i) + '_hu_scans_512', prepared_scans)

In [None]:
# Root of the validation split on the mounted Drive, and its "normal" class folder.
base_path = "/content/drive/MyDrive/data/val"
train_path = f"{base_path}/normal"

In [None]:
# Module-level lists; only `path` is filled in this cell.
path = []
train_data = []
scan = []

# Full path of every entry in train_path, in sorted name order, skipping the
# ".DS_Store" entry.
path.extend(
    train_path + "/" + entry
    for entry in sorted(os.listdir(train_path))
    if entry != ".DS_Store"
)

In [None]:
def get_window_value(feature):
    """Return a window width/center header value as a built-in ``int``.

    Parameters
    ----------
    feature : number, numeric string, or sequence of those
        A single value, or a multi-valued element (for example a pydicom
        ``MultiValue``), in which case the first entry is used.

    Returns
    -------
    int
        ``int(feature)`` for a single value, ``int(feature[0])`` for a sequence.

    Raises
    ------
    IndexError
        If ``feature`` is an empty sequence.
    """
    # Local import keeps this cell self-contained.
    from collections.abc import Sequence

    # np.int was only an alias of the built-in int and was removed in
    # NumPy 1.24, so the built-in is used directly. Strings and bytes are
    # sequences too, but must be converted as a whole.
    if isinstance(feature, Sequence) and not isinstance(feature, (str, bytes)):
        return int(feature[0])
    return int(feature)

# --- Collect header metadata from one example file per patient folder --------
# One list per field; entry k belongs to the k-th inspected folder.
pixelspacing_r = []
pixelspacing_c = []
slice_thicknesses = []
patient_id = []
patient_pth = []
row_values = []
column_values = []
window_widths = []
window_levels = []
name = []
slice_number = []  # never filled in this cell

# At most the first five folders are inspected. Slicing (instead of indexing
# with range(5)) avoids an IndexError when fewer than five folders exist.
for patient, pathx in enumerate(path[:5]):
    patient_id.append(patient + 1)
    patient_pth.append(pathx)

    # Prefer the first entry with a ".dcm" suffix so a stray non-DICOM entry
    # listed first does not break dcmread. If nothing matches, fall back to
    # the first entry, as before.
    entries = listdir(pathx)
    example_dcm = next(
        (entry for entry in entries if Path(entry).suffix == ".dcm"),
        entries[0],
    )
    dataset = pydicom.dcmread(pathx + "/" + example_dcm)

    window_widths.append(get_window_value(dataset.WindowWidth))
    window_levels.append(get_window_value(dataset.WindowCenter))

    spacing = dataset.PixelSpacing
    slice_thicknesses.append(dataset.SliceThickness)

    name.append(dataset.PatientName)

    row_values.append(dataset.Rows)
    column_values.append(dataset.Columns)
    pixelspacing_r.append(spacing[0])
    pixelspacing_c.append(spacing[1])

# --- Assemble the table (column order matters for the CSV) -------------------
scan_properties = pd.DataFrame(data=patient_id, columns=["patient"])
scan_properties.loc[:, "rows"] = row_values
scan_properties.loc[:, "columns"] = column_values
scan_properties.loc[:, "area"] = scan_properties["rows"] * scan_properties["columns"]
scan_properties.loc[:, "pixelspacing_r"] = pixelspacing_r
scan_properties.loc[:, "pixelspacing_c"] = pixelspacing_c
scan_properties.loc[:, "pixelspacing_area"] = scan_properties.pixelspacing_r * scan_properties.pixelspacing_c
scan_properties.loc[:, "slice_thickness"] = slice_thicknesses
scan_properties.loc[:, "patient_pth"] = patient_pth
scan_properties.loc[:, "window_width"] = window_widths
scan_properties.loc[:, "window_level"] = window_levels
scan_properties.loc[:, "name"] = name

# NOTE(review): the "name" column holds the DICOM PatientName and is written
# to the CSV as-is -- confirm the data is de-identified before sharing it.
scan_properties.to_csv("val_normal_512.csv")

In [None]:
def resize_scan(scan, new_shape):
    """Resize one 2-D slice with Lanczos resampling and return it as int16.

    Parameters
    ----------
    scan : numpy.ndarray
        2-D array of 32-bit signed integers (it is opened as a PIL mode "I"
        image).
    new_shape : tuple of int
        Target shape as ``(rows, cols)``, i.e. the shape of the returned
        array. This matches how ``preprocess_to_hu_scans`` allocates its
        output buffer.

    Returns
    -------
    numpy.ndarray
        ``int16`` array of shape ``(new_shape[0], new_shape[1])``.
    """
    # read slice as 32 bit signed integers
    img = Image.fromarray(scan, mode="I")
    # PIL's resize takes (width, height) = (cols, rows), so the numpy-style
    # shape is swapped here. For square targets this changes nothing.
    target_rows, target_cols = new_shape
    img = img.resize((target_cols, target_rows), resample=Image.LANCZOS)
    # convert back to 16 bit integers
    resized_scan = np.array(img, dtype=np.int16)
    return resized_scan

In [None]:
output_dir = "/content/drive/MyDrive/val_normal_512"
# exist_ok=True keeps this cell re-runnable: a plain mkdir raises
# FileExistsError once the folder exists from an earlier run.
os.makedirs(output_dir, exist_ok=True)
my_shape = (512, 512)
preprocess_to_hu_scans(scan_properties, my_shape, output_dir)

0 386
1 487
2 237
3 485
4 249
