# Radiomic Feature Post-processing 

In [1]:
import sys
sys.path.append('./../utils/')

import os
import re

import numpy as np
import pandas as pd

import ioutil

from sklearn.preprocessing import StandardScaler

In [2]:
hassan_transforms = {
    # Nv: Number of voxels in ROI.
    'firstorder_Entropy': lambda f, Nv: f * np.log(Nv), 
    # Ng: Numer of gray levels in image.
    'glcm_DifferenceEntropy': lambda f, Ng: f / np.log(Ng ** 2),
    'glcm_JointEntropy': lambda f, Ng: f / np.log(Ng ** 2),
    'glcm_SumEntropy': lambda f, Ng: f / np.log(Ng ** 2),
    'glcm_Contrast': lambda f, Ng: f / (Ng ** 2),
    'glcm_DifferenceVariance': lambda f, Ng: f / (Ng ** 2),
    'glcm_SumAverage': lambda f, Ng: f / Ng,
    'glcm_DifferenceAverage': lambda f, Ng: f / Ng, 
    # Ng: Numer of gray levels in image.
    'glrlm_GrayLevelNonUniformity': lambda f, Ng: f * Ng,
    'glrlm_HighGrayLevelRunEmphasis': lambda f, Ng: f / (Ng ** 2),
    'glrlm_ShortRunHighGrayLevelEmphasis': lambda f, Ng: f / (Ng ** 2),
    # Ng: Numer of gray levels in image.
    'ngtdm_Contrast': lambda f, Ng: f / Ng,
    'ngtdm_Complexity': lambda f, Ng: f / (Ng ** 3),
    'ngtdm_Strength': lambda f, Ng: f / (Ng ** 2),
     
}

In [3]:
# Bin widths used in gray level discretization.
ct_bin_widths = {
    '30': 114.71, 
    '50': 68.82,
    '70': 49.16,
    '90': 38.24,
    '110': 31.28,
    '130': 26.47
}
pet_bin_widths = {
    '30': 0.47,
    '50': 0.28,
    '70': 0.2,
    '90': 0.155,
    '110': 0.13,
    '130': 0.10
}

In [4]:
# Globals
THRESH = 1e-8
verbose = 1
hassan_transform = False
FILTER_TYPE = 'original'

# References to disk locations.
raw_source = './../../../data_source/radiomic_features/no_filter/'
prep_source = './../../../data_source/to_analysis/no_filter/'

path_ct_concat = './../../../data_source/to_analysis/ct_no_filter_concat.csv'
path_pet_concat = './../../../data_source/to_analysis/pet_no_filter_concat.csv'

In [5]:
true_index = np.load('./../../../data_source/patient_id.npy')
true_index[:5]

array([ 2,  4,  5,  8, 10])

In [6]:
# Handling shape features.
#ct_shape = pd.read_csv('./../../../data_source/radiomic_features/shape/ct_shape.csv', index_col=0)
#pet_shape = pd.read_csv('./../../../data_source/radiomic_features/shape/pet_shape.csv', index_col=0)

#ct_shape.shape, pet_shape.shape

In [7]:
#ct_shape = ct_shape[ct_shape.filter(regex='original').columns]
#pet_shape = pet_shape[pet_shape.filter(regex='original').columns]

#ct_shape.shape, pet_shape.shape

#np.array_equal(ct_shape.values, pet_shape.values)

#ct_shape.index = true_index

# Write to disk.
#ct_shape.to_csv('./../../../data_source/to_analysis/radiomics_shape.csv')

In [8]:
shape_feats = pd.read_csv('./../../../data_source/to_analysis/radiomics_shape.csv', index_col=0)
shape_feats.head()

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxis,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Volume
2,0.738882,0.723925,27.060529,37.380273,41.976184,44.598206,42.720019,45.617979,27.619612,0.661532,4860.055715,0.283517,17142.0
4,0.7969,0.629917,19.845151,31.504408,38.587563,35.468296,29.410882,38.704005,25.105855,0.701721,3126.087371,0.323578,9661.0
5,0.600926,0.53514,22.515072,42.073251,46.065171,43.011626,32.015621,46.454279,25.282894,0.762365,4063.633046,0.250625,16214.0
8,0.784571,0.414247,30.263897,73.057649,74.1485,80.956779,65.764732,83.4386,57.318945,0.520001,15698.615155,0.226355,69354.0
10,0.69032,0.539743,19.449801,36.035312,33.286634,38.013156,33.015148,43.150898,24.875896,0.643822,3658.448414,0.340352,10749.0


In [9]:
shape_feats.columns

Index(['original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxis', 'original_shape_MajorAxis',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MinorAxis',
       'original_shape_Sphericity', 'original_shape_SurfaceArea',
       'original_shape_SurfaceVolumeRatio', 'original_shape_Volume'],
      dtype='object')

In [10]:
shape_feats.shape

(198, 13)

In [11]:
def get_file_name(path_to_file):
    name, _ = path_to_file.split('.')
    return name


def get_discr_level(path_to_file):
    return re.findall('\d+', path_to_file)[0]

In [12]:
# Target feature category.
target_class = 'original'

error_dir = './../../../data_source/radiomic_features/no_filter/errors/'

# Individual file names.
ct_labels = [
    'ct_no_filter30.csv',
    'ct_no_filter50.csv',
    'ct_no_filter70.csv',
    'ct_no_filter90.csv',
    'ct_no_filter110.csv',
    'ct_no_filter130.csv'
]
pet_labels = [
    'pet_no_filter30.csv',
    'pet_no_filter50.csv',
    'pet_no_filter70.csv',
    'pet_no_filter90.csv',
    'pet_no_filter110.csv',
    'pet_no_filter130.csv'
]

In [13]:
# Filter out redundant columns.
ct_data, pet_data = [], []
for ct_label, pet_label in zip(ct_labels, pet_labels):
    
    ct_path = os.path.join(raw_source, ct_label)
    pet_path = os.path.join(raw_source, pet_label)

    ct_data.append(
        pd.read_csv(ct_path, index_col=3).filter(regex=target_class)
    )
    pet_data.append(
        pd.read_csv(pet_path, index_col=3).filter(regex=target_class)
    )

In [14]:
# Sanity check on number of extracted features.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    print(np.shape(ct_dset), np.shape(pet_dset))

(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)


In [15]:
# Sanity check on shape features.
np.array_equal(ct_data[0].values, pet_data[0].values)

False

In [16]:
# Sanity check on indexing.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    print(np.array_equal(ct_dset.index.values, true_index)) 
    print(np.array_equal(pet_dset.index.values, true_index))

True
True
True
True
True
True
True
True
True
True
True
True


In [17]:
# Check which CT features are affected by varying gray levels.
ct_feats = ct_data[0].columns.values
pet_feats = pet_data[0].columns.values

ct_concat = pd.concat(ct_data, axis=1)
pet_concat = pd.concat(pet_data, axis=1)

np.shape(ct_concat), np.shape(pet_concat)

ct_aff = {}
for ct_feat in ct_feats:
    sample_var = sum(ct_concat.filter(regex=ct_feat).var(axis=1))
    if sample_var > THRESH:
        ct_aff[ct_feat] = sample_var    
        
pet_aff = {}
for pet_feat in pet_feats:
    sample_var = sum(pet_concat.filter(regex=pet_feat).var(axis=1))
    if sample_var > THRESH:
        pet_aff[pet_feat] = sample_var

In [18]:
ct_aff

{'original_firstorder_Mean': 2.8434588974201125,
 'original_glcm_Id': 0.03170531213380771,
 'original_glcm_Idm': 0.0471028680265421,
 'original_glrlm_GrayLevelNonUniformity': 297672277.9242542,
 'original_glrlm_RunLengthNonUniformity': 7864905.922775112,
 'original_glszm_GrayLevelNonUniformity': 21257.570027487858,
 'original_glszm_SizeZoneNonUniformity': 848.3131422365024,
 'original_gldm_DependenceNonUniformity': 37279309154.86914}

In [19]:
pet_aff

{'original_firstorder_Entropy': 56.82753748170258,
 'original_firstorder_Mean': 96.4146715750742,
 'original_firstorder_Uniformity': 0.33373759320493135,
 'original_glcm_Autocorrelation': 11422132.521787163,
 'original_glcm_ClusterProminence': 2054238143588.8792,
 'original_glcm_ClusterShade': 546294328.7873932,
 'original_glcm_ClusterTendency': 2733240.7672368553,
 'original_glcm_Contrast': 1417.7355479008154,
 'original_glcm_Correlation': 0.008335539213400703,
 'original_glcm_DifferenceAverage': 42.46276857030805,
 'original_glcm_DifferenceEntropy': 43.57517833842609,
 'original_glcm_DifferenceVariance': 537.916550203315,
 'original_glcm_Id': 4.4598762084188595,
 'original_glcm_Idm': 5.759670903720736,
 'original_glcm_Idmn': 3.1267298062091885e-05,
 'original_glcm_Idn': 0.0008525450625065627,
 'original_glcm_Imc1': 0.4030534210879458,
 'original_glcm_Imc2': 0.02274239628271493,
 'original_glcm_InverseVariance': 0.329584096969585,
 'original_glcm_JointAverage': 8777.295889660025,
 'or

In [20]:
Nv = shape_feats['original_shape_Volume'].values

In [21]:
# Save data to disk.
for num, (ct_set, pet_set) in enumerate(zip(ct_data, pet_data)):
    
    # Drop redundant columns.
    ct_redundant = ct_set.columns[ct_set.var() == 0.0].values
    pet_redundant = pet_set.columns[pet_set.var() == 0.0].values
    
    if len(ct_redundant) > 0:
        if verbose > 0:
            print('CT redundant: ', ct_redundant)

        ct_red_path = os.path.join(error_dir, 'redundant_{}'.format(ct_labels[num]))
        pd.Series(ct_redundant).to_csv(ct_red_path)
        ct_set.drop(ct_redundant, axis=1, inplace=True)
        
    if len(pet_redundant) > 0:        
        if verbose > 0:
            print('PET redundant: ', pet_redundant)
            
        pet_red_path = os.path.join(error_dir, 'redundant_{}'.format(pet_labels[num]))
        pd.Series(pet_redundant).to_csv(pet_red_path)
        pet_set.drop(pet_redundant, axis=1, inplace=True)
    
    # Drop columns with missing values.
    ct_miss = ct_set.columns[ct_set.isnull().any()].values
    pet_miss = pet_set.columns[pet_set.isnull().any()].values
        
    if len(ct_miss) > 0:
        if verbose > 0:
            print('CT missing: ', ct_miss)
        
        ct_miss_path = os.path.join(error_dir, 'missing_{}'.format(ct_labels[num]))
        pd.Series(ct_miss).to_csv(ct_miss_path)
        ct_set.drop(ct_miss, axis=1, inplace=True)
        
    if len(pet_miss) > 0:
        if verbose > 0:
            print('PET missing: ', pet_miss)
        
        pet_miss_path = os.path.join(error_dir, 'missing_{}'.format(pet_labels[num]))
        pd.Series(pet_miss).to_csv(ct_miss_path)
        pet_set.drop(pet_miss, axis=1, inplace=True)
        
    # Replace the former featue deffinition with the modified versions.
    if hassan_transform:
        for key, transf in hassan_transforms.items():
            if key in ct_miss:
                pass
            else:
                target_feat = ct_set.filter(regex=key).columns.values
                # TEMP: Hack to remove add GLNUM when searching for GLNU.
                if len(target_feat) > 1:
                    target_feat = target_feat[0]
                    
                if len(target_feat) == 1:
                    data = np.squeeze(ct_set[target_feat].values)
                    if 'firstorder_Entropy' in target_feat[0]:
                        ct_set.loc[:, target_feat] = transf(data, Nv)[:, np.newaxis]
                    else:
                        Ng = int(list(ct_bin_widths.keys())[num])
                        ct_set.loc[:, target_feat] = transf(data, Ng)[:, np.newaxis]
                        
    # Update column labels.
    ct_set.columns = [
        'CT{}_{}'.format(get_discr_level(ct_labels[num]), label) 
        for label in ct_set.columns
    ]
    pet_set.columns = [
        'PET{}_{}'.format(get_discr_level(pet_labels[num]), label) 
        for label in pet_set.columns
    ]
    # Save data sets to disk.
    if hassan_transform:
        ct_set.to_csv(os.path.join(prep_source, 'hassan_{}'.format(ct_labels[num])))
        pet_set.to_csv(os.path.join(prep_source, 'hassan_{}'.format(pet_labels[num])))
    else:
        ct_set.to_csv(os.path.join(prep_source, ct_labels[num]))
        pet_set.to_csv(os.path.join(prep_source, pet_labels[num]))

CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']


In [22]:
ct_data[0].shape, pet_data[0].shape

((198, 91), (198, 92))

In [23]:
ct_concat = pd.concat(ct_data, axis=1)
pet_concat = pd.concat(pet_data, axis=1)

ct_concat.shape, pet_concat.shape

((198, 546), (198, 552))

In [24]:
# Save to disk.
ct_concat.to_csv(path_ct_concat)
pet_concat.to_csv(path_pet_concat)