# Radiomic Feature Post-processing 

In [1]:
# Purpose of the Hassan transform: render features independent of the gray-level discretization.

In [2]:
import sys
sys.path.append('./../utils/')

import os
import re

import numpy as np
import pandas as pd

import ioutil
import transforms

from sklearn.preprocessing import StandardScaler

In [3]:
# Load Hassan transformation functions.
# NOTE(review): `hassan_transf` is not referenced again in the visible part of
# this notebook, and the meaning of the `None` argument is defined in the
# project module `transforms` (not shown here) — confirm this cell is still needed.
hassan_transf = transforms.hassan_transforms(None)

In [44]:
# Globals
# THRESH: presumably a numeric tolerance.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
THRESH = 1e-8
# FILTER_TYPE: regex handed to `DataFrame.filter(regex=...)` when the raw feature
# files are loaded; only columns whose name matches it are kept.
FILTER_TYPE = 'sigma'

In [45]:
# References to disk locations.
# raw_source: directory the raw CT/PET feature CSV files are read from.
raw_source = './../../../data_source/radiomic_features/gauss/'
# prep_source: NOTE(review): not referenced in the visible cells — presumably the
# destination for processed features; confirm.
prep_source = './../../../data_source/to_analysis/gauss/'
# error_dir: directory where the names of dropped feature columns are logged.
error_dir = './../../../data_source/radiomic_features/gauss/errors'
# Individual file names.
# `ct_labels` and `pet_labels` are paired by position: they are zipped together
# when the files are loaded, so both lists must have the same length and order.
ct_labels = [
    'ct_05gauss_128.csv'
]
pet_labels = [
    'pet_05gauss_16.csv'
]

In [46]:
# Reference index (presumably the patient IDs, judging by the file name) that the
# row index of every loaded feature table is compared against further down.
true_index = np.load('./../../../data_source/patient_id.npy')

## Shape Features

In [47]:
# Shape features, indexed by the first CSV column.
# NOTE(review): the same file is read into `shape_feats` again before the final
# concatenation — one of the two loads is redundant.
shape_feats = pd.read_csv('./../../../data_source/to_analysis/radiomics_shape.csv', index_col=0)

In [48]:
def get_file_name(path_to_file):
    """Return `path_to_file` with its file extension removed.

    Only the last extension is stripped, and dots that belong to directory
    components (e.g. './../data/ct.csv') are left alone. A path without an
    extension is returned unchanged.

    Args:
        path_to_file: File name or path as a string.

    Returns:
        The path without its trailing extension.
    """
    # os.path.splitext() splits on the final extension only; a plain
    # `split('.')` breaks on relative paths and on names with several dots.
    name, _ = os.path.splitext(path_to_file)
    return name

In [49]:
# Load each CT/PET pair of raw feature tables. The fourth CSV column becomes the
# row index, and only columns whose name matches FILTER_TYPE are retained.
ct_data, pet_data = [], []
for ct_label, pet_label in zip(ct_labels, pet_labels):

    ct_path = os.path.join(raw_source, ct_label)
    ct_raw = pd.read_csv(ct_path, index_col=3)
    ct_data.append(ct_raw.filter(regex=FILTER_TYPE))

    pet_path = os.path.join(raw_source, pet_label)
    pet_raw = pd.read_csv(pet_path, index_col=3)
    pet_data.append(pet_raw.filter(regex=FILTER_TYPE))

In [50]:
# Sanity check: show the (rows, columns) shape of every CT/PET pair.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    print(ct_dset.shape, pet_dset.shape)

(198, 92) (198, 92)


In [51]:
# Sanity check: the row index of every table must equal the reference index.
# Prints one boolean per table, CT first and then PET for each pair.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    for dset in (ct_dset, pet_dset):
        print(np.array_equal(dset.index.values, true_index))

True
True


In [52]:
# Drop uninformative feature columns and log what was removed.
#
# For every CT/PET pair, columns with zero variance ("redundant") and columns
# containing at least one missing value ("missing") are removed in place from
# the tables held in `ct_data` / `pet_data`. The names of the removed columns
# are written to `error_dir` as '<reason>_<source file name>'.

# `verbose` is not defined in any visible cell; keep a value that is already
# set in the kernel and otherwise fall back to 1, so that the cell also runs
# after Restart & Run All.
verbose = globals().get('verbose', 1)


def _drop_and_log(dset, to_drop, reason, modality, label):
    """Remove `to_drop` columns from `dset` in place and log their names.

    Args:
        dset: DataFrame to modify in place.
        to_drop: Array of column labels to remove; nothing happens if empty.
        reason: 'redundant' or 'missing'; used in the message and log file name.
        modality: 'CT' or 'PET'; used in the printed message only.
        label: Source file name; becomes the suffix of the log file name.
    """
    if len(to_drop) == 0:
        return

    if verbose > 0:
        print('{} {}: '.format(modality, reason), to_drop)

    # Make sure the log directory exists before writing to it.
    os.makedirs(error_dir, exist_ok=True)
    log_path = os.path.join(error_dir, '{}_{}'.format(reason, label))
    pd.Series(to_drop).to_csv(log_path)
    dset.drop(to_drop, axis=1, inplace=True)


for num, (ct_set, pet_set) in enumerate(zip(ct_data, pet_data)):

    # Drop redundant (zero-variance) columns.
    ct_redundant = ct_set.columns[ct_set.var() == 0.0].values
    pet_redundant = pet_set.columns[pet_set.var() == 0.0].values
    _drop_and_log(ct_set, ct_redundant, 'redundant', 'CT', ct_labels[num])
    _drop_and_log(pet_set, pet_redundant, 'redundant', 'PET', pet_labels[num])

    # Drop columns with missing values (evaluated after the redundant columns
    # are gone). Each modality is logged to its own file; the PET log used to
    # be written to the CT path.
    ct_miss = ct_set.columns[ct_set.isnull().any()].values
    pet_miss = pet_set.columns[pet_set.isnull().any()].values
    _drop_and_log(ct_set, ct_miss, 'missing', 'CT', ct_labels[num])
    _drop_and_log(pet_set, pet_miss, 'missing', 'PET', pet_labels[num])

In [53]:
# Preview the CT table. `ct_set` is the loop variable left over from the previous
# cell, i.e. the last entry of `ct_data`.
ct_set.head()

Unnamed: 0_level_0,log-sigma-0-5-mm-3D_firstorder_10Percentile,log-sigma-0-5-mm-3D_firstorder_90Percentile,log-sigma-0-5-mm-3D_firstorder_Energy,log-sigma-0-5-mm-3D_firstorder_Entropy,log-sigma-0-5-mm-3D_firstorder_InterquartileRange,log-sigma-0-5-mm-3D_firstorder_Kurtosis,log-sigma-0-5-mm-3D_firstorder_Maximum,log-sigma-0-5-mm-3D_firstorder_MeanAbsoluteDeviation,log-sigma-0-5-mm-3D_firstorder_Mean,log-sigma-0-5-mm-3D_firstorder_Median,...,log-sigma-0-5-mm-3D_gldm_LargeDependenceLowGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_LowGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceHighGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceLowGrayLevelEmphasis,log-sigma-0-5-mm-3D_ngtdm_Busyness,log-sigma-0-5-mm-3D_ngtdm_Coarseness,log-sigma-0-5-mm-3D_ngtdm_Complexity,log-sigma-0-5-mm-3D_ngtdm_Contrast,log-sigma-0-5-mm-3D_ngtdm_Strength
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-10.867335,6.713109,3137016.0,3.917042,7.852367,21.217156,143.99588,7.286667,-2.567012,-0.629916,...,0.006026,0.000339,0.210894,770.157925,0.00013,0.219461,0.000435,10065.103269,0.017962,4.597939
4,-25.554184,13.654629,4276549.0,4.703048,16.711516,12.21194,184.351196,13.346134,-4.280451,-1.735418,...,0.004974,0.00046,0.300616,941.230457,0.000236,0.116814,0.000935,15701.707571,0.041703,6.552782
5,-20.930377,13.019361,6717461.0,3.955658,12.817359,22.296222,163.418579,11.94266,-3.354531,-0.593964,...,0.004353,0.000206,0.193397,1326.986154,9.3e-05,0.172558,0.000507,7332.547865,0.01987,3.86293
8,-11.870222,5.952274,21429670.0,3.724265,7.178367,29.13214,218.409042,8.178796,-2.373754,-1.02042,...,0.018435,0.000617,0.166599,363.83579,0.000169,0.852706,0.000111,17062.359872,0.011276,2.991231
10,-17.778821,5.646324,3738893.0,3.903639,7.508914,17.384516,157.152328,9.810979,-4.877076,-1.080263,...,0.007251,0.000407,0.209395,752.473391,0.000203,0.136476,0.000691,10752.234328,0.02817,7.918559


In [54]:
# Preview the PET table. `pet_set` is the loop variable left over from the
# cleaning loop, i.e. the last entry of `pet_data`.
pet_set.head()

Unnamed: 0_level_0,log-sigma-0-5-mm-3D_firstorder_10Percentile,log-sigma-0-5-mm-3D_firstorder_90Percentile,log-sigma-0-5-mm-3D_firstorder_Energy,log-sigma-0-5-mm-3D_firstorder_Entropy,log-sigma-0-5-mm-3D_firstorder_InterquartileRange,log-sigma-0-5-mm-3D_firstorder_Kurtosis,log-sigma-0-5-mm-3D_firstorder_Maximum,log-sigma-0-5-mm-3D_firstorder_MeanAbsoluteDeviation,log-sigma-0-5-mm-3D_firstorder_Mean,log-sigma-0-5-mm-3D_firstorder_Median,...,log-sigma-0-5-mm-3D_gldm_LargeDependenceLowGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_LowGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceHighGrayLevelEmphasis,log-sigma-0-5-mm-3D_gldm_SmallDependenceLowGrayLevelEmphasis,log-sigma-0-5-mm-3D_ngtdm_Busyness,log-sigma-0-5-mm-3D_ngtdm_Coarseness,log-sigma-0-5-mm-3D_ngtdm_Complexity,log-sigma-0-5-mm-3D_ngtdm_Contrast,log-sigma-0-5-mm-3D_ngtdm_Strength
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.270766,0.136896,563.146209,2.664327,0.186781,5.588601,0.545189,0.129748,-0.044361,-0.010923,...,0.90449,0.009436,0.037081,3.974749,0.000709,7.003705,0.000516,98.77597,0.015018,0.083278
4,-0.21894,0.092327,193.046578,2.562333,0.113566,7.028793,0.29017,0.097107,-0.032394,0.001884,...,0.892657,0.007685,0.037656,4.619659,0.000573,3.588207,0.00087,102.951453,0.017162,0.187017
5,-0.159781,0.058853,155.739985,2.802598,0.107254,4.469384,0.258431,0.069037,-0.038243,-0.023795,...,0.824312,0.010241,0.039801,3.895226,0.000638,7.476947,0.000517,106.243797,0.019085,0.077944
8,-0.101083,0.059834,389.882734,2.298582,0.067569,8.013142,0.267151,0.051535,-0.012471,1.6e-05,...,1.157896,0.008249,0.027284,3.13838,0.000388,27.122302,0.000119,86.134552,0.008275,0.025806
10,-0.094853,0.034457,39.139103,2.479768,0.059845,6.403073,0.144485,0.041363,-0.022057,-0.011239,...,0.743133,0.007387,0.038802,5.139316,0.00046,4.229215,0.000744,102.92669,0.011452,0.163202


In [55]:
def get_discr_level(path_to_file):
    """Return the first run of digits in `path_to_file` as a string.

    The result is used as the numeric tag in feature column prefixes, e.g.
    'pet_05gauss_16.csv' -> '05'.

    NOTE(review): for the file names used in this notebook the first digit run
    is the '05' of '05gauss', not the trailing 128/16 — confirm that this is
    the intended tag given the function name.

    Args:
        path_to_file: File name or path as a string.

    Returns:
        The first sequence of consecutive digits, leading zeros included.

    Raises:
        IndexError: If `path_to_file` contains no digits.
    """
    # Raw string literal: a backslash-d in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.findall(r'\d+', path_to_file)[0]


# Prefix every feature column with its modality and file tag (e.g. 'CT05_...'),
# so CT and PET features stay distinguishable once the tables are combined.
# The tables in `ct_data` / `pet_data` are relabelled in place; nothing is
# written to disk here. Columns that already carry the prefix are left as they
# are, so re-running this cell does not stack prefixes ('CT05_CT05_...').
for num, (ct_set, pet_set) in enumerate(zip(ct_data, pet_data)):
    ct_prefix = 'CT{}_'.format(get_discr_level(ct_labels[num]))
    pet_prefix = 'PET{}_'.format(get_discr_level(pet_labels[num]))

    ct_set.columns = [
        label if str(label).startswith(ct_prefix) else '{}{}'.format(ct_prefix, label)
        for label in ct_set.columns
    ]
    pet_set.columns = [
        label if str(label).startswith(pet_prefix) else '{}{}'.format(pet_prefix, label)
        for label in pet_set.columns
    ]

In [56]:
# Load the remaining feature tables; each uses its first CSV column as row index.
# NOTE(review): `shape_feats` is also read from the same file in the
# "Shape Features" section above — one of the two loads is redundant.
shape_feats = pd.read_csv('./../../../data_source/to_analysis/radiomics_shape.csv', index_col=0)
clinical = pd.read_csv('./../../../data_source/to_analysis/clinical_params.csv', index_col=0)
pet_params = pd.read_csv('./../../../data_source/to_analysis/pet_params.csv', index_col=0)

In [57]:
# Combine all feature groups column-wise into one table, aligned on the row index.
# `ct_set` / `pet_set` are the variables left over from the preceding loops, i.e.
# the last entries of `ct_data` / `pet_data`; with several files per modality
# only the last pair would end up in `X`.
X = pd.concat(
    (clinical, shape_feats, pet_params, ct_set, pet_set), 
    axis=1
)
X.shape

(198, 242)

In [58]:
# Preview the first rows of the combined feature table.
X.head()

Unnamed: 0_level_0,Age,Years Smoking,Naxogin Days,Sex_M,ICD-10_C02,ICD-10_C03,ICD-10_C04,ICD-10_C05,ICD-10_C06,ICD-10_C09,...,PET05_log-sigma-0-5-mm-3D_gldm_LargeDependenceLowGrayLevelEmphasis,PET05_log-sigma-0-5-mm-3D_gldm_LowGrayLevelEmphasis,PET05_log-sigma-0-5-mm-3D_gldm_SmallDependenceEmphasis,PET05_log-sigma-0-5-mm-3D_gldm_SmallDependenceHighGrayLevelEmphasis,PET05_log-sigma-0-5-mm-3D_gldm_SmallDependenceLowGrayLevelEmphasis,PET05_log-sigma-0-5-mm-3D_ngtdm_Busyness,PET05_log-sigma-0-5-mm-3D_ngtdm_Coarseness,PET05_log-sigma-0-5-mm-3D_ngtdm_Complexity,PET05_log-sigma-0-5-mm-3D_ngtdm_Contrast,PET05_log-sigma-0-5-mm-3D_ngtdm_Strength
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,60.832877,0.0,39.0,1,0,0,0,0,0,1,...,0.90449,0.009436,0.037081,3.974749,0.000709,7.003705,0.000516,98.77597,0.015018,0.083278
4,49.906849,4.786027,33.0,0,0,0,0,0,0,1,...,0.892657,0.007685,0.037656,4.619659,0.000573,3.588207,0.00087,102.951453,0.017162,0.187017
5,54.238356,0.0,42.0,0,0,0,0,0,0,1,...,0.824312,0.010241,0.039801,3.895226,0.000638,7.476947,0.000517,106.243797,0.019085,0.077944
8,54.687671,18.343836,0.0,1,0,0,0,0,0,0,...,1.157896,0.008249,0.027284,3.13838,0.000388,27.122302,0.000119,86.134552,0.008275,0.025806
10,61.728767,0.0,35.0,1,0,0,0,0,0,0,...,0.743133,0.007387,0.038802,5.139316,0.00046,4.229215,0.000744,102.92669,0.011452,0.163202


In [59]:
# Write the combined feature table to disk (an existing file is overwritten).
X.to_csv('./../../../data_source/to_analysis/gauss05_complete.csv')

## Combine Feature Sets

In [None]:
# Join all tables of one modality column-wise and show the resulting shapes.
# With a single file per modality this is just that one table.
ct_concat = pd.concat(ct_data, axis=1)
pet_concat = pd.concat(pet_data, axis=1)

ct_concat.shape, pet_concat.shape

In [None]:
# Save to disk.
#ct_concat.to_csv(path_ct_concat)
#pet_concat.to_csv(path_pet_concat)

In [None]:

# Preview the PET parameter table.
pet_params.head()

In [None]:
# Preview the concatenated CT features.
ct_concat.head()

In [None]:
# Preview the concatenated PET features.
pet_concat.head()

In [None]:
# Output paths for the concatenated CT/PET tables.
# NOTE(review): the file names say 'no_filter', while the data loaded in this
# notebook comes from the 'gauss' directory — looks like a leftover; confirm
# before using these paths.
path_ct_concat = './../../../data_source/to_analysis/ct_no_filter_concat.csv'
path_pet_concat = './../../../data_source/to_analysis/pet_no_filter_concat.csv'

In [None]:
# Rebuild `X` from the clinical, CT, PET and PET-parameter tables.
# NOTE(review): this overwrites the `X` built earlier in the notebook, uses a
# different column order and leaves out `shape_feats` — confirm this is intended.
X = pd.concat(
    (clinical, ct_set, pet_set, pet_params), axis=1
)

In [None]:
# Preview the rebuilt feature table.
X.head()

In [None]:
# NOTE(review): the target is named 'squareroot', but the features loaded in this
# notebook come from the 'gauss' directory; running this cell would overwrite
# that file with gauss data — looks like a leftover from another notebook; confirm.
X.to_csv('./../../../data_source/to_analysis/squareroot_complete.csv')

In [None]:
# NOTE(review): writes the CT table to a 'sqroot' path although the features were
# loaded from the 'gauss' directory — likely a leftover; confirm before running.
ct_set.to_csv('./../../../data_source/to_analysis/sqroot/ct_128_sqroot.csv')

In [None]:
# NOTE(review): writes the PET table to a 'sqroot' path although the features were
# loaded from the 'gauss' directory — likely a leftover; confirm before running.
pet_set.to_csv('./../../../data_source/to_analysis/sqroot/pet_16_sqroot.csv')