# Radiomic Feature Post-processing 

In [None]:
# Intention of the Hassan transform: render features independent of the gray level discretization.

In [84]:
import sys
sys.path.append('./../utils/')

import os
import re

import numpy as np
import pandas as pd

import ioutil
import transforms

from sklearn.preprocessing import StandardScaler

In [85]:
# Load Hassan transformation functions from the project-local `transforms` module.
# The result is iterated with `.items()` further down, so it is presumably a
# mapping from a feature-name pattern to a callable -- TODO confirm.
# NOTE(review): the meaning of the `None` argument is not visible from here.
hassan_transf = transforms.hassan_transforms(None)

In [86]:
# Bin widths used in gray level discretization, keyed by the string labels
# '30' ... '130'. Both dicts share the same keys in the same insertion order.
ct_bin_widths = dict(zip(
    ('30', '50', '70', '90', '110', '130'),
    (114.71, 68.82, 49.16, 38.24, 31.28, 26.47),
))
pet_bin_widths = dict(zip(
    ('30', '50', '70', '90', '110', '130'),
    (0.47, 0.28, 0.2, 0.155, 0.13, 0.10),
))

In [87]:
# Globals
THRESH = 1e-8
FILTER_TYPE = 'original'
# Verbosity level read by the column-dropping cell further down; values > 0
# print the names of the columns that are dropped. It was previously never
# assigned, so that cell failed with a NameError on a fresh kernel.
verbose = 1

In [88]:
# References to disk locations.
# `raw_source` is the directory the raw per-discretization feature CSVs are
# read from; `prep_source` is the directory the post-processed CSVs are
# written to.
raw_source = './../../../data_source/radiomic_features/no_filter/'
prep_source = './../../../data_source/to_analysis/no_filter/'

In [98]:
# Directory receiving the logs of dropped (redundant / missing) columns.
error_dir = './../../../data_source/radiomic_features/no_filter/errors/'

# Individual file names, one per discretization label (30 ... 130).
ct_labels = [
    'ct_no_filter{}.csv'.format(level)
    for level in (30, 50, 70, 90, 110, 130)
]
pet_labels = [
    'pet_no_filter{}.csv'.format(level)
    for level in (30, 50, 70, 90, 110, 130)
]

In [89]:
# Reference index array loaded from `patient_id.npy`. Later cells assign it as
# the index of the shape features and compare it against the index of every
# texture feature matrix.
true_index = np.load('./../../../data_source/patient_id.npy')

## Shape Features

In [90]:
# Handling shape features.
# The shape features are stored separately for CT and PET; the first CSV
# column of each file becomes the row index.
ct_shape = pd.read_csv('./../../../data_source/radiomic_features/shape/ct_shape.csv', index_col=0)
pet_shape = pd.read_csv('./../../../data_source/radiomic_features/shape/pet_shape.csv', index_col=0)

# Display both (rows, columns) shapes as a quick check.
ct_shape.shape, pet_shape.shape

((198, 18), (198, 18))

In [96]:
# Keep only the columns whose label matches 'original'.
ct_shape = ct_shape.filter(regex='original')
pet_shape = pet_shape.filter(regex='original')

# Sanity check: the CT and PET shape values must be identical, which is why
# only the CT copy is carried forward from here on.
assert np.array_equal(ct_shape.values, pet_shape.values)

# Re-index the shape features by the reference patient IDs.
ct_shape.index = true_index

# Display the shape of the frame built in this cell. (`shape_feats` is first
# assigned by a later cell, so referencing it here fails on a fresh kernel.)
ct_shape.shape

(198, 13)

In [93]:
# Load the stored shape features from disk (first CSV column as index) and
# preview the first rows.
# NOTE(review): this reads the same file that the following "Write to disk"
# cell produces; on a fresh kernel that cell has to run first.
shape_feats = pd.read_csv('./../../../data_source/to_analysis/radiomics_shape.csv', index_col=0)
shape_feats.head()

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxis,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Volume
2,0.738882,0.723925,27.060529,37.380273,41.976184,44.598206,42.720019,45.617979,27.619612,0.661532,4860.055715,0.283517,17142.0
4,0.7969,0.629917,19.845151,31.504408,38.587563,35.468296,29.410882,38.704005,25.105855,0.701721,3126.087371,0.323578,9661.0
5,0.600926,0.53514,22.515072,42.073251,46.065171,43.011626,32.015621,46.454279,25.282894,0.762365,4063.633046,0.250625,16214.0
8,0.784571,0.414247,30.263897,73.057649,74.1485,80.956779,65.764732,83.4386,57.318945,0.520001,15698.615155,0.226355,69354.0
10,0.69032,0.539743,19.449801,36.035312,33.286634,38.013156,33.015148,43.150898,24.875896,0.643822,3658.448414,0.340352,10749.0


In [95]:
# Write to disk.
# Only the CT copy of the shape features is written; the earlier sanity check
# asserts that the PET values are equal.
ct_shape.to_csv('./../../../data_source/to_analysis/radiomics_shape.csv')

In [99]:
def get_file_name(path_to_file):
    """Return `path_to_file` with its file extension removed.

    Only the final extension is stripped and any directory part is kept
    unchanged. Unlike a plain `split('.')`, this also works for relative
    paths that contain dots (e.g. './../data/file.csv') and for names
    that have no extension at all.

    Args:
        path_to_file (str): File name or path.

    Returns:
        str: The input without its trailing extension.
    """
    name, _ = os.path.splitext(path_to_file)
    return name

In [66]:
# Load the raw feature matrices, keeping only the columns whose label matches
# the configured FILTER_TYPE pattern.
# NOTE(review): `index_col=3` uses the fourth CSV column as row index --
# presumably the patient ID; confirm against the raw files.
ct_data, pet_data = [], []
for ct_label, pet_label in zip(ct_labels, pet_labels):

    ct_path = os.path.join(raw_source, ct_label)
    pet_path = os.path.join(raw_source, pet_label)

    # The original referenced `target_class`, which is never assigned in this
    # notebook; FILTER_TYPE is the column pattern defined in the globals cell.
    ct_data.append(
        pd.read_csv(ct_path, index_col=3).filter(regex=FILTER_TYPE)
    )
    pet_data.append(
        pd.read_csv(pet_path, index_col=3).filter(regex=FILTER_TYPE)
    )

In [67]:
# Sanity check on number of extracted features: print the (rows, columns)
# shape of every CT/PET pair.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    print(ct_dset.shape, pet_dset.shape)

(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)


In [68]:
# Sanity check on indexing: each feature matrix is expected to be indexed by
# the reference patient IDs. Prints the CT result first, then the PET result,
# for every discretization level.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    for dset in (ct_dset, pet_dset):
        print(np.array_equal(dset.index.values, true_index))

True
True
True
True
True
True
True
True
True
True
True
True


## CT Hassan Transforms

In [None]:
# Values of the `original_shape_Volume` column as a NumPy array; passed as the
# second argument to the entropy transform in the next cell.
# NOTE(review): assumed to be row-aligned with the texture feature matrices --
# confirm.
Nv = shape_feats['original_shape_Volume'].values

In [None]:
# Replace the former CT feature definitions with the Hassan modified versions.
# NOTE(review): this cell reads `ct_set`, `ct_miss` and `num`, which are only
# assigned inside the column-dropping loop; they must already exist (and refer
# to the intended discretization level) when this cell runs.
for key, transf in hassan_transf.items():
    # Skip features that were recorded as containing missing values.
    if key in ct_miss:
        continue

    target_feat = ct_set.filter(regex=key).columns.values
    # The pattern can match more than one column (the original note mentions
    # GLNUM being picked up when searching for GLNU); keep the first match
    # only. Slicing instead of indexing keeps `target_feat` a length-1 array:
    # with `target_feat[0]` it became a plain string, the length check below
    # failed, and the feature was silently left untransformed.
    if len(target_feat) > 1:
        target_feat = target_feat[:1]

    if len(target_feat) == 1:
        data = np.squeeze(ct_set[target_feat].values)
        if 'firstorder_Entropy' in target_feat[0]:
            # The entropy transform receives the volume values.
            ct_set.loc[:, target_feat] = transf(data, Nv)[:, np.newaxis]
        else:
            # All other transforms receive the discretization label (as int),
            # looked up by position among the bin-width keys -- presumably the
            # number of gray levels; confirm.
            Ng = int(list(ct_bin_widths.keys())[num])
            ct_set.loc[:, target_feat] = transf(data, Ng)[:, np.newaxis]
                        

In [73]:
# Drop uninformative columns (zero variance or missing values) from every
# feature matrix and log the names of the dropped columns in `error_dir`.
# `verbose` must be defined before this cell runs; values > 0 print the
# dropped column names.
def _drop_columns(dset, columns, log_path, message):
    """Log `columns` to `log_path` and drop them from `dset` in place.

    Does nothing when `columns` is empty, so no empty log file is written.
    """
    if len(columns) == 0:
        return

    if verbose > 0:
        print(message, columns)

    pd.Series(columns).to_csv(log_path)
    dset.drop(columns, axis=1, inplace=True)


# Make sure the log directory exists before anything is written to it.
os.makedirs(error_dir, exist_ok=True)

for num, (ct_set, pet_set) in enumerate(zip(ct_data, pet_data)):

    # Drop redundant columns, i.e. columns with zero variance.
    ct_redundant = ct_set.columns[ct_set.var() == 0.0].values
    pet_redundant = pet_set.columns[pet_set.var() == 0.0].values

    _drop_columns(
        ct_set, ct_redundant,
        os.path.join(error_dir, 'redundant_{}'.format(ct_labels[num])),
        'CT redundant: '
    )
    _drop_columns(
        pet_set, pet_redundant,
        os.path.join(error_dir, 'redundant_{}'.format(pet_labels[num])),
        'PET redundant: '
    )

    # Drop columns with missing values. These are computed after the redundant
    # columns have been removed, as in the original cell.
    ct_miss = ct_set.columns[ct_set.isnull().any()].values
    pet_miss = pet_set.columns[pet_set.isnull().any()].values

    _drop_columns(
        ct_set, ct_miss,
        os.path.join(error_dir, 'missing_{}'.format(ct_labels[num])),
        'CT missing: '
    )
    # The PET log now goes to its own file; it was previously written to the
    # CT path, overwriting the CT log (or raising a NameError when the CT set
    # had no missing columns).
    _drop_columns(
        pet_set, pet_miss,
        os.path.join(error_dir, 'missing_{}'.format(pet_labels[num])),
        'PET missing: '
    )

CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']
CT missing:  ['original_ngtdm_Contrast']


In [None]:
def get_discr_level(path_to_file):
    """Return the first run of digits found in `path_to_file`.

    For the file names used in this notebook (e.g. 'ct_no_filter30.csv')
    this is the discretization label ('30').

    Args:
        path_to_file (str): File name (or path) containing the label.

    Returns:
        str: The first sequence of consecutive digits, as a string.

    Raises:
        IndexError: If `path_to_file` contains no digits.
    """
    # A raw string is required: a backslash-d inside a plain string literal
    # is an invalid escape sequence.
    match = re.search(r'\d+', path_to_file)
    if match is None:
        raise IndexError(
            'No discretization level found in {!r}'.format(path_to_file)
        )
    return match.group(0)


# Prefix every column with its modality and discretization label, then save
# the post-processed feature matrices to disk.
for num, (ct_set, pet_set) in enumerate(zip(ct_data, pet_data)):
    # Update column labels. Labels that already carry the prefix are left
    # untouched so that re-running this cell does not prefix them twice.
    ct_prefix = 'CT{}_'.format(get_discr_level(ct_labels[num]))
    pet_prefix = 'PET{}_'.format(get_discr_level(pet_labels[num]))

    ct_set.columns = [
        label if label.startswith(ct_prefix) else ct_prefix + label
        for label in ct_set.columns
    ]
    pet_set.columns = [
        label if label.startswith(pet_prefix) else pet_prefix + label
        for label in pet_set.columns
    ]

    # The original had a stray `else:` here with no matching `if`, which is a
    # SyntaxError; the writes are unconditional.
    ct_set.to_csv(os.path.join(prep_source, ct_labels[num]))
    pet_set.to_csv(os.path.join(prep_source, pet_labels[num]))

## Combine Feature Sets

In [75]:
# Join the per-discretization feature matrices column-wise, one frame per
# modality.
ct_concat = pd.concat(ct_data, axis=1)
pet_concat = pd.concat(pet_data, axis=1)

# Display both (rows, columns) shapes as a quick check.
ct_concat.shape, pet_concat.shape

((198, 546), (198, 552))

In [54]:
# Save to disk.
#ct_concat.to_csv(path_ct_concat)
#pet_concat.to_csv(path_pet_concat)

In [77]:
# Load the stored shape features (first CSV column as index) and preview the
# first rows.
shape_feats = pd.read_csv('./../../../data_source/to_analysis/radiomics_shape.csv', index_col=0)
shape_feats.head()

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxis,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Volume
2,0.738882,0.723925,27.060529,37.380273,41.976184,44.598206,42.720019,45.617979,27.619612,0.661532,4860.055715,0.283517,17142.0
4,0.7969,0.629917,19.845151,31.504408,38.587563,35.468296,29.410882,38.704005,25.105855,0.701721,3126.087371,0.323578,9661.0
5,0.600926,0.53514,22.515072,42.073251,46.065171,43.011626,32.015621,46.454279,25.282894,0.762365,4063.633046,0.250625,16214.0
8,0.784571,0.414247,30.263897,73.057649,74.1485,80.956779,65.764732,83.4386,57.318945,0.520001,15698.615155,0.226355,69354.0
10,0.69032,0.539743,19.449801,36.035312,33.286634,38.013156,33.015148,43.150898,24.875896,0.643822,3658.448414,0.340352,10749.0


In [78]:
# Load the clinical parameters (first CSV column as index) and preview the
# first rows.
clinical = pd.read_csv('./../../../data_source/to_analysis/clinical_params.csv', index_col=0)
clinical.head()

Unnamed: 0_level_0,Age,Years Smoking,Naxogin Days,Sex_M,ICD-10_C02,ICD-10_C03,ICD-10_C04,ICD-10_C05,ICD-10_C06,ICD-10_C09,...,Cisplatin_1,Cisplatin_2,Cisplatin_3,Cisplatin_4,Cisplatin_5,Cisplatin_6,Stage 1,Stage 2,Stage 3,Stage 4
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,60.832877,0.0,39.0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,49.906849,4.786027,33.0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
5,54.238356,0.0,42.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,54.687671,18.343836,0.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
10,61.728767,0.0,35.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [76]:
# Load the PET parameters (first CSV column as index) and preview the first
# rows.
pet_params = pd.read_csv('./../../../data_source/to_analysis/pet_params.csv', index_col=0)
pet_params.head()

Unnamed: 0_level_0,SUVpeak,MTV,TLG
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,21.616549,7.384,124.870726
4,15.296275,3.406,41.554406
5,14.473272,7.934,86.22842
8,10.510859,26.926,205.413389
10,7.21319,6.041,32.10377


In [81]:
# Preview the first rows of the concatenated CT features.
ct_concat.head()

Unnamed: 0_level_0,CT30_original_firstorder_10Percentile,CT30_original_firstorder_90Percentile,CT30_original_firstorder_Energy,CT30_original_firstorder_Entropy,CT30_original_firstorder_InterquartileRange,CT30_original_firstorder_Kurtosis,CT30_original_firstorder_Maximum,CT30_original_firstorder_MeanAbsoluteDeviation,CT30_original_firstorder_Mean,CT30_original_firstorder_Median,...,CT130_original_gldm_LargeDependenceHighGrayLevelEmphasis,CT130_original_gldm_LargeDependenceLowGrayLevelEmphasis,CT130_original_gldm_LowGrayLevelEmphasis,CT130_original_gldm_SmallDependenceEmphasis,CT130_original_gldm_SmallDependenceHighGrayLevelEmphasis,CT130_original_gldm_SmallDependenceLowGrayLevelEmphasis,CT130_original_ngtdm_Busyness,CT130_original_ngtdm_Coarseness,CT130_original_ngtdm_Complexity,CT130_original_ngtdm_Strength
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.097696,0.221526,22205540000.0,0.183969,0.056668,59.779383,1.025223,0.049673,0.15111,0.158412,...,2402.945339,152.423157,0.270957,0.003096,0.008828,0.001663,52.686,0.004953,0.023557,0.004306
4,-0.31512,0.171638,12510280000.0,0.990405,0.182336,8.933352,0.705242,0.152791,-0.051784,-0.014157,...,999.433288,289.314253,0.668202,0.005037,0.012243,0.003236,1350.718517,0.001131,0.182981,0.001133
5,-0.084255,0.278018,21001470000.0,0.627742,0.11094,14.831983,1.713988,0.118381,0.098319,0.107412,...,2053.242136,181.769644,0.367954,0.003778,0.009121,0.002442,219.449851,0.001491,0.082733,0.001439
8,0.02216,0.187135,89831270000.0,0.415386,0.064696,55.850257,1.476205,0.091661,0.093632,0.138433,...,2302.861594,164.953849,0.312841,0.003261,0.008661,0.001911,555.455131,0.000515,0.056019,0.000487
10,-0.180616,-0.068998,13917110000.0,0.063316,0.044584,46.402736,0.59851,0.063094,-0.135706,-0.107931,...,597.001861,594.547214,0.994418,0.002294,0.002973,0.002124,19.483858,0.026248,0.007089,0.024128


In [82]:
# Preview the first rows of the concatenated PET features.
pet_concat.head()

Unnamed: 0_level_0,PET30_original_firstorder_10Percentile,PET30_original_firstorder_90Percentile,PET30_original_firstorder_Energy,PET30_original_firstorder_Entropy,PET30_original_firstorder_InterquartileRange,PET30_original_firstorder_Kurtosis,PET30_original_firstorder_Maximum,PET30_original_firstorder_MeanAbsoluteDeviation,PET30_original_firstorder_Mean,PET30_original_firstorder_Median,...,PET130_original_gldm_LargeDependenceLowGrayLevelEmphasis,PET130_original_gldm_LowGrayLevelEmphasis,PET130_original_gldm_SmallDependenceEmphasis,PET130_original_gldm_SmallDependenceHighGrayLevelEmphasis,PET130_original_gldm_SmallDependenceLowGrayLevelEmphasis,PET130_original_ngtdm_Busyness,PET130_original_ngtdm_Coarseness,PET130_original_ngtdm_Complexity,PET130_original_ngtdm_Contrast,PET130_original_ngtdm_Strength
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.351937,3.0,458556.5,2.354821,2.023307,1.79767,3.0,1.008945,2.051788,2.835852,...,0.902999,0.009915,0.097207,48.282946,0.00068,1.115818,0.000807,294.886685,0.123679,1.641894
4,0.033082,3.0,221526.9,2.576549,2.633144,1.301798,3.0,1.164925,1.623906,1.577296,...,0.991736,0.011562,0.110287,53.940093,0.00062,0.655065,0.001705,341.035907,0.153371,2.825279
5,0.468723,3.0,459986.9,2.230801,1.583687,2.458731,3.0,0.900359,2.225005,3.0,...,0.913507,0.009036,0.102838,50.652811,0.000613,0.922104,0.00086,285.052486,0.104444,1.864715
8,-0.015165,3.0,1509075.0,2.897816,2.389781,1.542488,3.0,1.025935,1.520806,1.451224,...,1.005031,0.009364,0.084595,44.768908,0.000393,2.787597,0.000505,259.730843,0.099641,0.460083
10,0.42693,3.0,251117.1,2.840108,1.823018,1.770913,3.0,0.855992,1.733723,1.703617,...,0.449098,0.007895,0.110149,53.518013,0.000585,0.494405,0.002808,315.993654,0.099745,1.937278


In [80]:
# Assemble the complete feature matrix by joining all sources column-wise;
# with `axis=1` pandas aligns the rows on the index of each frame.
X = pd.concat(
    (clinical, shape_feats, pet_params, ct_concat, pet_concat), 
    axis=1
)
# Display the (rows, columns) shape of the combined matrix.
X.shape

(198, 1156)

In [83]:
# Write the complete feature matrix to disk.
X.to_csv('./../../../data_source/to_analysis/no_filter_complete.csv')

In [None]:
# Output paths for the concatenated CT and PET feature matrices.
# NOTE(review): these names are only referenced by the commented-out
# "Save to disk" cell earlier in the notebook; they would have to be defined
# before that cell if it is re-enabled.
path_ct_concat = './../../../data_source/to_analysis/ct_no_filter_concat.csv'
path_pet_concat = './../../../data_source/to_analysis/pet_no_filter_concat.csv'