## feature extraction

brain 폴더 안에 각 환자별로 brain image가 있고, mask 폴더 안에 각 환자별로 mask가 있다. \
brain image m개와 mask n개를 조합하여 각 brain image-mask 쌍에 대해 feature extraction을 수행한다 => itertools의 product 이용

feature를 추출하여 각 환자가 row에, 각 feature가 column에 오도록 csv 파일로 저장한다.

In [7]:
!pip install SimpleITK
!pip install pyradiomics



In [1]:
from glob import glob
import SimpleITK as sitk
import os
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from shutil import move
from itertools import product
import pandas as pd

In [2]:
# Root folder that is walked with os.walk to find the brain images.
in_path = '/media/sc-mlsg/DATA2/Perfusion_study_radiomics_preprocessed/05_Stripped_znormalized/'
# Folder the resulting feature CSV files are written into.
outpath = '/media/sc-mlsg/DATA2/Perfusion_study_radiomics_preprocessed/'
# Root folder of the masks; a per-folder name taken from the walk is joined onto it when globbing.
maskDir = '/media/sc-mlsg/DATA2/Perfusion_study_radiomics_preprocessed/00_Tumor_masks_resampled/'

In [3]:
# Print each directory under in_path together with its sub-directories and files.
for walk_entry in os.walk(in_path):
    print(*walk_entry)

/media/sc-mlsg/DATA2/Perfusion_study_radiomics_preprocessed/05_Stripped_znormalized/ ['2034571', '2803895', '2826490', '2917442', '3387356', '3441152', '3579055', '3621229', '4199579', '4331755', '4413737', '5096558', '5588560', '5792723', '5843358', '6291624', '6645678', '6659190', '6673952', '6715199', '6776630', '6847146', '7471985', '7522152', '7547026', '7553272', '7644780', '7707348', '7711873', '7716992', '7741219', '7746738', '7753977', '7760652', '7767219', '7777703', '7781243', '7783083', '7808217', '7809569', '7816559', '7820534', '7826770', '7842502', '7845251', '7865431', '7871704', '7892450', '7898233', '7921035', '7922825', '7926232', '7938576', '7946503', '7947384', '7955082', '7968300', '7971571', '7973273', '8002014', '8011358', '8011880', '8018918', '8044835', '8051568', '8052811', '8060512', '8064434', '8072260', '8074420', '8077298', '8078358', '8082756', '8083396', '8089351', '8097929', '8101370', '8125034', '8170673', '8190372', '8224960', '8229739', '8235696', '

In [4]:
from __future__ import print_function
from radiomics import featureextractor
import radiomics
import logging
radiomics.logger.setLevel(logging.ERROR)
import SimpleITK as sitk
from radiomics import firstorder, glcm, shape, glrlm, glszm, ngtdm, gldm
import numpy as np


def Numpy2Itk(array):
    '''
    Build a SimpleITK image from a numpy array.

    :param array: numpy array to convert
    :return: the image object produced by sitk.GetImageFromArray
    '''
    itk_image = sitk.GetImageFromArray(array)
    return itk_image

def feature_extract(image_origin, image_mask, features=('firstorder', 'glcm', 'glszm', 'glrlm', 'ngtdm', 'shape'), binWidth=32, binCount=None):
    '''
    Run the PyRadiomics feature extractor on one image/mask pair.

    :param image_origin: image_array (numpy array)
    :param image_mask: mask_array (numpy array)
    :param features: iterable of feature class names to enable and return;
        each name is lower-cased before it is used
    :param binWidth: value stored under the 'binWidth' setting; skipped when falsy
    :param binCount: value stored under the 'binCount' setting; skipped when falsy
    :return: (feats, cols) - the feature values and their feature names, in the
        order of ``features`` and sorted by feature name within each class
    '''
    # NOTE(review): both inputs are rebuilt from bare arrays, so spacing/origin
    # of the source files is presumably not carried over and the resampling
    # setting below then acts on default geometry - confirm this is intended.
    image = Numpy2Itk(image_origin)
    mask = Numpy2Itk(image_mask)

    settings = {}
    if binWidth:
        settings['binWidth'] = binWidth
    if binCount:
        settings['binCount'] = binCount
    settings['resampledPixelSpacing'] = (1, 1, 1)
    settings['interpolator'] = 'sitkBSpline'
    settings['verbose'] = True

    extractor = featureextractor.RadiomicsFeatureExtractor(**settings)
    extractor.settings['enableCExtensions'] = True

    # Normalise once so enabling and column filtering use the same spelling
    # (previously 'GLCM' was enabled but then matched no result key).
    class_names = [feature.lower() for feature in features]

    # Clear the extractor's default selection first so only the requested
    # classes are computed; results of any other class would be dropped by
    # the name filter below anyway.
    extractor.disableAllFeatures()
    for class_name in class_names:
        extractor.enableFeatureClassByName(class_name)

    featureVector = extractor.execute(image, mask)

    # Sort the result keys once instead of once per requested class.
    feature_names = sorted(featureVector.keys())

    cols = []
    feats = []
    for class_name in class_names:
        for featureName in feature_names:
            # Substring match keeps only keys that mention the class name.
            if class_name in featureName:
                cols.append(featureName)
                feats.append(featureVector[featureName])
    return feats, cols

In [6]:
# Kept as a module-level name; nothing in this cell fills it.
error_dict = {}

# For each size, extract features once with binWidth=size and once with
# binCount=size, then write one CSV per variant (patients as rows).
for size in [8, 32, 128]:
    # One single-row DataFrame per patient folder; concatenated once after the
    # walk instead of re-copying a growing DataFrame on every patient.
    width_rows = []
    count_rows = []
    total = 0
    for (path, dirs, files) in os.walk(in_path):
        # Only folders that directly contain files are treated as patients.
        if not files:
            continue

        total += 1
        pt = os.path.basename(path)
        print(total, pt)
        brain_paths = glob(os.path.join(path, '*'))
        mask_paths = glob(os.path.join(maskDir, pt, '*'))

        bw_features_lst, bw_cols_lst, bc_features_lst, bc_cols_lst = [], [], [], []

        # NOTE(review): column names carry only the sequence token, so a
        # patient with more than one mask would yield repeated column names -
        # confirm each patient has a single mask, or add the mask name to the
        # column prefix.
        for (img_path, mask_path) in product(brain_paths, mask_paths):
            # Second "_"-separated token of the image file name.
            seq = os.path.basename(img_path).split("_")[1]

            brain_img = sitk.ReadImage(img_path)
            mask = sitk.ReadImage(mask_path)
            mask_arr = sitk.GetArrayFromImage(mask)

            # Scale
            brain_arr = sitk.GetArrayFromImage(brain_img) * 100

            # Change binWidth
            bw_features, bw_cols = feature_extract(brain_arr, mask_arr, binWidth=size)
            bw_cols = [col.replace('original', seq) for col in bw_cols]

            # Change binCount
            bc_features, bc_cols = feature_extract(brain_arr, mask_arr, binWidth=None, binCount=size)
            bc_cols = [col.replace('original', seq) for col in bc_cols]

            bw_features_lst += bw_features
            bw_cols_lst += bw_cols
            bc_features_lst += bc_features
            bc_cols_lst += bc_cols

        # Build a one-column frame (features as index) and transpose it so the
        # patient becomes a single row.
        width_rows.append(pd.DataFrame(np.array(bw_features_lst), index=bw_cols_lst, columns=[pt]).T)
        count_rows.append(pd.DataFrame(np.array(bc_features_lst), index=bc_cols_lst, columns=[pt]).T)

    # pd.concat rejects an empty list, so fall back to an empty frame when no
    # patient folder was found.
    feat_df_width = pd.concat(width_rows) if width_rows else pd.DataFrame()
    feat_df_count = pd.concat(count_rows) if count_rows else pd.DataFrame()

    feat_df_width.to_csv(os.path.join(outpath, 'Perfusion_study_radiomics_bin_width_%s_scale_100.csv' % size))
    feat_df_count.to_csv(os.path.join(outpath, 'Perfusion_study_radiomics_bin_count_%s_scale_100.csv' % size))
    print("Finished %d size" % size)

1 2034571


MemoryError: 

In [8]:
# Read the "Unnamed: 0" column of the error file into a list and display it.
error_lst = (
    pd.read_csv("/media/sc-mlsg/Samsung_T5/error_file.csv")
    .loc[:, "Unnamed: 0"]
    .tolist()
)
error_lst

['/media/user/Samsung_T5/Stripped_znormed_421pts_images/10012748',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/10067721',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/10100220',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/1952345',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/3928044',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8303154',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8581917',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8807253',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/6509113',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/7697438',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8103623',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8106159',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8120969',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8177745',
 '/media/user/Samsung_T5/Stripped_znormed_421pts_images/8256945',
 '/medi