# Post processing of Extracted Radiomic Features 

* Uniquely label features.
* Remove features constant across gray level discretizations.

In [1]:
import sys
sys.path.append('./../utils/')

import os
import re
import glob

from scipy.stats import friedmanchisquare

import numpy as np
import pandas as pd

In [2]:
def check_gl_redudancy(df, thresh, gl_bins):
    
    output = {}
    to_check = []
    for col in df.columns:
        if 'bins' in col:
            to_check.append(('_').join(col.split('_')[:-1]))
        else:
            output[col] = df.loc[:, col]

    reduced = []
    for col in np.unique(to_check):
        f = df.filter(regex=col)
        f = f / np.max(f, axis=0)
    
        if len(f.columns) == len(gl_bins):
            if sum(f.var(axis=1)) <= thresh:
                output[col] = f.iloc[:, 0]
                reduced.append(col)
            else:
                for num, num_bins in enumerate(gl_bins):
                    label = f'{col}_{num_bins}bins'
                    output[label] = f.iloc[:, num]
                    
    return pd.DataFrame(output), reduced

In [3]:
#path_to_dir = './../../data_source/images/ct_nrrd/'
path_to_dir = './../../data_source/images/ct_removed_broken_slices/'

#path_to_lrr = './../../data_source/to_analysis/original_images/lrr_original_images.csv'
path_to_lrr = './../../data_source/to_analysis/removed_broken_slices/lrr_removed_broken_slices.csv'

#path_to_dfs = './../../data_source/to_analysis/original_images/dfs_original_images.csv'
path_to_dfs = './../../data_source/to_analysis/removed_broken_slices/dfs_removed_broken_slices.csv'

#path_no_ct = './../../data_source/to_analysis/original_images/no_ct_original_images.csv'
path_no_ct = './../../data_source/to_analysis/removed_broken_slices/no_ct_removed_broken_slices.csv'

#path_pet_only = './../../data_source/to_analysis/original_images/pet_only_original_images.csv'
path_pet_only = './../../data_source/to_analysis/removed_broken_slices/pet_only_removed_broken_slices.csv'

#path_ct_only = './../../data_source/to_analysis/original_images/ct_only_original_images.csv'
path_ct_only = './../../data_source/to_analysis/removed_broken_slices/ct_only_removed_broken_slices.csv'

#path_to_feature_matrix = './../../data_source/to_analysis/original_images/all_features_original_images.csv'
path_to_feature_matrix = './../../data_source/to_analysis/removed_broken_slices/all_features_removed_broken_slices.csv'

#path_to_texture_features = './../../data_source/to_analysis/original_images/texture_original_images.csv'
path_to_texture_features = './../../data_source/to_analysis/removed_broken_slices/texture_removed_broken_slices.csv'

#path_to_firstorder_features = './../../data_source/to_analysis/original_images/firstorder_original_images.csv'
path_to_firstorder_features = './../../data_source/to_analysis/removed_broken_slices/firstorder_removed_broken_slices.csv'

#path_to_shape_features = './../../data_source/radiomic_features/shape/shape_original_masks.csv'
path_to_shape_features = './../../data_source/radiomic_features/shape/shape_removed_broken_slices.csv'

#path_category_idx = './../../data_source/original_images_feature_group_idx.npy'
path_category_idx = './../../data_source/removed_broken_slices_feature_group_idx.npy'

#path_ct_firstorder_32 = './../../data_source/radiomic_features/firstorder_original_images/ct32_firstorder_original_images.csv'
#path_ct_firstorder_64 = './../../data_source/radiomic_features/firstorder_original_images/ct64_firstorder_original_images.csv'
#path_ct_firstorder_128 = './../../data_source/radiomic_features/firstorder_original_images/ct128_firstorder_original_images.csv'
path_ct_firstorder_32 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/ct32_firstorder_removed_broken_slices.csv'
path_ct_firstorder_64 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/ct64_firstorder_removed_broken_slices.csv'
path_ct_firstorder_128 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/ct128_firstorder_removed_broken_slices.csv'

#path_pet_firstorder_32 = './../../data_source/radiomic_features/firstorder_original_images/pet32_firstorder_original_images.csv'
#path_pet_firstorder_64 = './../../data_source/radiomic_features/firstorder_original_images/pet64_firstorder_original_images.csv'
#path_pet_firstorder_128 = './../../data_source/radiomic_features/firstorder_original_images/pet128_firstorder_original_images.csv'
path_pet_firstorder_32 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/pet32_firstorder_removed_broken_slices.csv'
path_pet_firstorder_64 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/pet64_firstorder_removed_broken_slices.csv'
path_pet_firstorder_128 = './../../data_source/radiomic_features/firstorder_removed_broken_slices/pet128_firstorder_removed_broken_slices.csv'

#path_ct_texture_32 = './../../data_source/radiomic_features/texture_original_images/ct32_texture_original_images.csv'
#path_ct_texture_64 = './../../data_source/radiomic_features/texture_original_images/ct64_texture_original_images.csv'
#path_ct_texture_128 = './../../data_source/radiomic_features/texture_original_images/ct128_texture_original_images.csv'
path_ct_texture_32 = './../../data_source/radiomic_features/texture_removed_broken_slices/ct32_texture_removed_broken_slices.csv'
path_ct_texture_64 = './../../data_source/radiomic_features/texture_removed_broken_slices/ct64_texture_removed_broken_slices.csv'
path_ct_texture_128 = './../../data_source/radiomic_features/texture_removed_broken_slices/ct128_texture_removed_broken_slices.csv'

#path_pet_texture_32 = './../../data_source/radiomic_features/texture_original_images/pet32_texture_original_images.csv'
#path_pet_texture_64 = './../../data_source/radiomic_features/texture_original_images/pet64_texture_original_images.csv'
#path_pet_texture_128 = './../../data_source/radiomic_features/texture_original_images/pet128_texture_original_images.csv'
path_pet_texture_32 = './../../data_source/radiomic_features/texture_removed_broken_slices/pet32_texture_removed_broken_slices.csv'
path_pet_texture_64 = './../../data_source/radiomic_features/texture_removed_broken_slices/pet64_texture_removed_broken_slices.csv'
path_pet_texture_128 = './../../data_source/radiomic_features/texture_removed_broken_slices/pet128_texture_removed_broken_slices.csv'
  
patient_id = []
for fname in sorted(os.listdir(path_to_dir)):
    try:
        patient_id.append(int(re.findall(r'\d+', fname)[0]))
    except:
        pass     
len(patient_id)

187

# Shape Features

In [4]:
# NOTE: Using original tumor volume despite removing slices.
shape = pd.read_csv(path_to_shape_features, index_col=0)
shape.index = patient_id

shape.head()

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_VoxelVolume
2,0.738882,0.723925,27.060529,37.380273,42.953463,45.54119,43.65776,46.411206,17098.541667,27.619612,0.660413,4860.055715,0.284238,17142.0
4,0.7969,0.629917,19.845151,31.504408,39.446166,36.400549,30.364453,39.560081,9626.916667,25.105855,0.700069,3126.087371,0.324724,9661.0
5,0.600926,0.53514,22.515072,42.073251,46.957428,43.965896,32.802439,47.339202,16175.708333,25.282894,0.761164,4063.633046,0.251218,16214.0
8,0.784571,0.414247,30.263897,73.057649,75.133215,81.859636,66.753277,84.291162,69280.083333,57.318945,0.519631,15698.615155,0.226596,69354.0
10,0.69032,0.539743,19.449801,36.035312,34.176015,38.832976,33.837849,43.874822,10705.375,24.875896,0.642079,3658.448414,0.341739,10749.0


In [5]:
shape.shape

(187, 14)

# CT First Order

In [6]:
ct_firstorder_32 = pd.read_csv(path_ct_firstorder_32, index_col=0)
ct_firstorder_64 = pd.read_csv(path_ct_firstorder_64, index_col=0)
ct_firstorder_128 = pd.read_csv(path_ct_firstorder_128, index_col=0)

ct_firstorder_32.columns = ['CT_{}_32bins'.format(col) for col in ct_firstorder_32.columns]
ct_firstorder_64.columns = ['CT_{}_64bins'.format(col) for col in ct_firstorder_64.columns]
ct_firstorder_128.columns = ['CT_{}_128bins'.format(col) for col in ct_firstorder_128.columns]

raw_ct_firstorder = pd.concat(
    (
        ct_firstorder_32,
        ct_firstorder_64,
        ct_firstorder_128
    ),
    axis=1
)
raw_ct_firstorder.index = patient_id

In [7]:
ct_firstorder_32.shape, ct_firstorder_64.shape, ct_firstorder_128.shape

((187, 18), (187, 18), (187, 18))

In [8]:
ct_firstorder, ct_fs_reduced = check_gl_redudancy(raw_ct_firstorder, thresh=1e-15, gl_bins=[32, 64, 128])
ct_firstorder.shape, ct_fs_reduced

((187, 21),
 ['CT_original_firstorder_10Percentile',
  'CT_original_firstorder_90Percentile',
  'CT_original_firstorder_Energy',
  'CT_original_firstorder_InterquartileRange',
  'CT_original_firstorder_Kurtosis',
  'CT_original_firstorder_Maximum',
  'CT_original_firstorder_MeanAbsoluteDeviation',
  'CT_original_firstorder_Median',
  'CT_original_firstorder_Minimum',
  'CT_original_firstorder_Range',
  'CT_original_firstorder_RobustMeanAbsoluteDeviation',
  'CT_original_firstorder_RootMeanSquared',
  'CT_original_firstorder_Skewness',
  'CT_original_firstorder_TotalEnergy',
  'CT_original_firstorder_Variance'])

In [9]:
for r in ct_fs_reduced:
    f = raw_ct_firstorder.filter(regex=r)
    print(f.iloc[:5, :])

    CT_original_firstorder_10Percentile_32bins  \
2                                  1064.381323   
4                                   972.027466   
5                                  1020.141451   
8                                  1028.577020   
10                                 1054.821012   

    CT_original_firstorder_10Percentile_64bins  \
2                                  1064.381323   
4                                   972.027466   
5                                  1020.141451   
8                                  1028.577020   
10                                 1054.821012   

    CT_original_firstorder_10Percentile_128bins  
2                                   1064.381323  
4                                    972.027466  
5                                   1020.141451  
8                                   1028.577020  
10                                  1054.821012  
    CT_original_firstorder_90Percentile_32bins  \
2                                  1115.994507  

# PET First Order

In [10]:
pet_firstorder_32 = pd.read_csv(path_pet_firstorder_32, index_col=0)
pet_firstorder_64 = pd.read_csv(path_pet_firstorder_64, index_col=0)
pet_firstorder_128 = pd.read_csv(path_pet_firstorder_128, index_col=0)

pet_firstorder_32.columns = ['PET_{}_32bins'.format(col) for col in pet_firstorder_32.columns]
pet_firstorder_64.columns = ['PET_{}_64bins'.format(col) for col in pet_firstorder_64.columns]
pet_firstorder_128.columns = ['PET_{}_128bins'.format(col) for col in pet_firstorder_128.columns]

raw_pet_firstorder = pd.concat(
    (
        pet_firstorder_32,
        pet_firstorder_64,
        pet_firstorder_128
    ),
    axis=1
)
raw_pet_firstorder.index = patient_id

In [11]:
pet_firstorder_32.shape, pet_firstorder_64.shape, pet_firstorder_128.shape

((187, 18), (187, 18), (187, 18))

In [12]:
pet_firstorder, pet_fs_reduced = check_gl_redudancy(raw_pet_firstorder, thresh=1e-15, gl_bins=[32, 64, 128])
pet_firstorder.shape, pet_fs_reduced

((187, 21),
 ['PET_original_firstorder_10Percentile',
  'PET_original_firstorder_90Percentile',
  'PET_original_firstorder_Energy',
  'PET_original_firstorder_InterquartileRange',
  'PET_original_firstorder_Kurtosis',
  'PET_original_firstorder_Maximum',
  'PET_original_firstorder_MeanAbsoluteDeviation',
  'PET_original_firstorder_Median',
  'PET_original_firstorder_Minimum',
  'PET_original_firstorder_Range',
  'PET_original_firstorder_RobustMeanAbsoluteDeviation',
  'PET_original_firstorder_RootMeanSquared',
  'PET_original_firstorder_Skewness',
  'PET_original_firstorder_TotalEnergy',
  'PET_original_firstorder_Variance'])

In [13]:
for r in pet_fs_reduced:
    f = raw_pet_firstorder.filter(regex=r)
    print(f.iloc[:5, :])

    PET_original_firstorder_10Percentile_32bins  \
2                                      3.077370   
4                                      2.359130   
5                                      2.561257   
8                                      1.759327   
10                                     2.186233   

    PET_original_firstorder_10Percentile_64bins  \
2                                      3.077370   
4                                      2.359130   
5                                      2.561257   
8                                      1.759327   
10                                     2.186233   

    PET_original_firstorder_10Percentile_128bins  
2                                       3.077370  
4                                       2.359130  
5                                       2.561257  
8                                       1.759327  
10                                      2.186233  
    PET_original_firstorder_90Percentile_32bins  \
2                            

# CT  Texture

In [14]:
ct_texture_32 = pd.read_csv(path_ct_texture_32, index_col=0)
ct_texture_64 = pd.read_csv(path_ct_texture_64, index_col=0)
ct_texture_128 = pd.read_csv(path_ct_texture_128, index_col=0)

ct_texture_32.columns = ['CT_{}_32bins'.format(col) for col in ct_texture_32.columns]
ct_texture_64.columns = ['CT_{}_64bins'.format(col) for col in ct_texture_64.columns]
ct_texture_128.columns = ['CT_{}_128bins'.format(col) for col in ct_texture_128.columns]

raw_ct_texture = pd.concat(
    (
        ct_texture_32,
        ct_texture_64,
        ct_texture_128
    ),
    axis=1
)
raw_ct_texture.index = patient_id

In [15]:
ct_texture_32.shape, ct_texture_64.shape, ct_texture_128.shape

((187, 75), (187, 75), (187, 75))

In [16]:
ct_texture, ct_text_reduced = check_gl_redudancy(raw_ct_texture, thresh=1e-15, gl_bins=[32, 64, 128])
ct_texture.shape, ct_text_reduced

((187, 204), [])

# PET Texture

In [17]:
pet_texture_32 = pd.read_csv(path_pet_texture_32, index_col=0)
pet_texture_64 = pd.read_csv(path_pet_texture_64, index_col=0)
pet_texture_128 = pd.read_csv(path_pet_texture_128, index_col=0)

pet_texture_32.columns = ['PET_{}_32bins'.format(col) for col in pet_texture_32.columns]
pet_texture_64.columns = ['PET_{}_64bins'.format(col) for col in pet_texture_64.columns]
pet_texture_128.columns = ['PET_{}_128bins'.format(col) for col in pet_texture_128.columns]

raw_pet_texture = pd.concat(
    (
        pet_texture_32,
        pet_texture_64,
        pet_texture_128
    ),
    axis=1
)
raw_pet_texture.index = patient_id

In [18]:
pet_texture_32.shape, pet_texture_64.shape, pet_texture_128.shape

((187, 75), (187, 75), (187, 75))

In [19]:
pet_texture, pet_text_reduced = check_gl_redudancy(raw_pet_texture, thresh=1e-15, gl_bins=[32, 64, 128])
pet_texture.shape, pet_text_reduced

((187, 204), [])

# Concatenate Features

In [20]:
ct_features = pd.concat((ct_firstorder, ct_texture), axis=1)
pet_features = pd.concat((pet_firstorder, pet_texture), axis=1)

# Add modality labels to GL bin labels.
radiomics_features = pd.concat((shape, ct_features, pet_features), axis=1)
radiomics_features.index = patient_id

radiomics_features.shape

(187, 464)

In [21]:
# Concat with clinical variables and PET params.
clinical_features = pd.read_csv('./../../data_source/to_analysis/removed_broken_slices/clinical.csv', index_col=0)
pet_params = pd.read_csv('./../../data_source/to_analysis/pet_params.csv', index_col=0)
lrr = pd.read_csv('./../../data_source/to_analysis/target_lrr.csv', index_col=0)
dfs = pd.read_csv('./../../data_source/to_analysis/target_dfs.csv', index_col=0)

# HACK:
tmp_patient_id = np.load('./../../data_source/patient_id.npy')
patient_id = clinical_features.index

pet_params.index = tmp_patient_id
lrr.index = tmp_patient_id
dfs.index = tmp_patient_id

clinical_features = clinical_features.loc[patient_id, :]
pet_params = pet_params.loc[patient_id, :]
lrr = lrr.loc[patient_id, :]
dfs = dfs.loc[patient_id]

lrr.to_csv(path_to_lrr)
dfs.to_csv(path_to_dfs)

lrr.shape, dfs.shape

((187, 1), (187, 1))

In [22]:
X = pd.concat((clinical_features, radiomics_features, pet_params), axis=1)
np.shape(X)

(187, 505)

In [23]:
X.to_csv(path_to_feature_matrix)

In [24]:
# Create texture features dataset for comparison of artifact handling method.
texture_features = pd.concat((ct_texture, pet_texture), axis=1)
texture_features.to_csv(path_to_texture_features)

texture_features.shape

(187, 408)

In [25]:
# Create firstorder features dataset for comparison of artifact handling method.
firstorder_features = pd.concat((ct_firstorder, pet_firstorder), axis=1)
firstorder_features.to_csv(path_to_firstorder_features)

firstorder_features.shape

(187, 42)

In [26]:
# PET only dataset.
PET_only = pd.concat((pet_firstorder, pet_texture), axis=1)
PET_only.to_csv(path_pet_only)

PET_only.shape

(187, 225)

In [27]:
# CT only dataset.
CT_only = pd.concat((ct_firstorder, ct_texture), axis=1)
CT_only.to_csv(path_ct_only)

CT_only.shape

(187, 225)

In [28]:
# No CT features.
X_CT = X.filter(regex='CT')
X_not_CT = X.drop(X_CT, axis=1)
X_not_CT.to_csv(path_no_ct)

X_CT.shape

(187, 225)

# Produce Feature Category Indices

In [29]:
col_idx = {
    'shape': 2, 
    'CT_original_firstorder': 3,
    'CT_original_glcm': 4,
    'CT_original_gldm': 5,
    'CT_original_glrlm': 6,
    'CT_original_glszm': 7,
    'CT_original_ngtdm': 8,
    'PET_original_firstorder': 9,
    'PET_original_glcm': 10,
    'PET_original_gldm': 11,
    'PET_original_glrlm': 12,
    'PET_original_glszm': 13,
    'PET_original_ngtdm': 14,
    'PETparam': 15
}
feature_idx = np.zeros(X.shape[1], dtype=np.int32)
for label in col_idx.keys():
    target_cols = list(X.filter(regex=label).columns)
    i = np.squeeze(np.where(np.isin(X.columns, target_cols)))
    feature_idx[i] = np.tile(col_idx[label], np.size(i))
    
feature_idx[feature_idx == 0] = 1

In [30]:
# Sanity checks.
assert len(feature_idx) == X.shape[1]
assert np.min(feature_idx) == 1
np.save(path_category_idx, feature_idx)

In [31]:
shape = pd.read_csv(path_to_shape_features, index_col=0)
print('Number of shape features:', shape.shape)

Number of shape features: (187, 14)


In [32]:
ct_firstorder_32 = pd.read_csv(path_ct_firstorder_32, index_col=0)
ct_firstorder_64 = pd.read_csv(path_ct_firstorder_64, index_col=0)
ct_firstorder_128 = pd.read_csv(path_ct_firstorder_128, index_col=0)

print('Number of 32 bins CT firstorder features:', ct_firstorder_32.shape[1])
print('Number of 64 bins CT firstorder features:', ct_firstorder_64.shape[1])
print('Number of 128 bins CT firstorder features:', ct_firstorder_128.shape[1])

Number of 32 bins CT firstorder features: 18
Number of 64 bins CT firstorder features: 18
Number of 128 bins CT firstorder features: 18


In [33]:
pet_firstorder_32 = pd.read_csv(path_pet_firstorder_32, index_col=0)
pet_firstorder_64 = pd.read_csv(path_pet_firstorder_64, index_col=0)
pet_firstorder_128 = pd.read_csv(path_pet_firstorder_128, index_col=0)

print('Number of 32 bins PET firstorder features:', pet_firstorder_32.shape[1])
print('Number of 64 bins PET firstorder features:', pet_firstorder_64.shape[1])
print('Number of 128 bins PET firstorder features:', pet_firstorder_128.shape[1])

Number of 32 bins PET firstorder features: 18
Number of 64 bins PET firstorder features: 18
Number of 128 bins PET firstorder features: 18


In [34]:
ct_texture_32 = pd.read_csv(path_ct_texture_32, index_col=0)
ct_texture_64 = pd.read_csv(path_ct_texture_64, index_col=0)
ct_texture_128 = pd.read_csv(path_ct_texture_128, index_col=0)

print('Number of 32 bins CT texture features:', ct_texture_32.shape[1])
print('Number of 64 bins CT texture features:', ct_texture_64.shape[1])
print('Number of 128 bins CT texture features:', ct_texture_128.shape[1])

Number of 32 bins CT texture features: 75
Number of 64 bins CT texture features: 75
Number of 128 bins CT texture features: 75


In [35]:
pet_texture_32 = pd.read_csv(path_pet_texture_32, index_col=0)
pet_texture_64 = pd.read_csv(path_pet_texture_64, index_col=0)
pet_texture_128 = pd.read_csv(path_pet_texture_128, index_col=0)

print('Number of 32 bins PET texture features:', pet_texture_32.shape[1])
print('Number of 64 bins PET texture features:', pet_texture_64.shape[1])
print('Number of 128 bins PET texture features:', pet_texture_128.shape[1])

Number of 32 bins PET texture features: 75
Number of 64 bins PET texture features: 75
Number of 128 bins PET texture features: 75


In [36]:
y = pd.read_csv(
    './../../data_source/to_analysis/original_images/dfs_original_images.csv', 
    index_col=0
)
y.shape

(198, 1)

In [37]:
X = pd.read_csv(
    './../../data_source/to_analysis/original_images/all_features_original_images.csv', 
    index_col=0
)
X.shape

(198, 513)

In [38]:
# NOTE: Clinical features: 0
col_idx = {
    'shape': 1, 
    'CT': 2,
    'PET': 3 
}

In [39]:
feature_idx = np.zeros(X.shape[1], dtype=np.int32)
for label in col_idx.keys():
    target_cols = list(X.filter(regex=label).columns)
    i = np.squeeze(np.where(np.isin(X.columns, target_cols)))
    feature_idx[i] = np.tile(col_idx[label], np.size(i))

feature_idx

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

In [40]:
clinical_only = X.iloc[:, feature_idx == 0]
clinical_only.to_csv('./../../data_source/to_analysis/original_images/by_category/icc_scc_clinical_only.csv')

In [41]:
shape_only = X.iloc[:, feature_idx == 1]
shape_only.to_csv('./../../data_source/to_analysis/original_images/by_category/icc_scc_shape_only.csv')

In [42]:
pet_only = X.iloc[:, feature_idx == 3]
pet_only.to_csv('./../../data_source/to_analysis/original_images/by_category/icc_scc_pet_only.csv')

In [43]:
ct_only = X.iloc[:, feature_idx == 2]
ct_only.to_csv('./../../data_source/to_analysis/original_images/by_category/icc_scc_ct_only.csv')

In [44]:
idx = (feature_idx == 0).astype(int) + (feature_idx == 1).astype(int) + (feature_idx == 3).astype(int)
no_ct = X.iloc[:, idx.astype(bool)]
no_ct.to_csv('./../../data_source/to_analysis/original_images/by_category/icc_scc_no_ct.csv')

In [45]:
X = pd.read_csv(path_to_feature_matrix, index_col=0)
X.head()

Unnamed: 0,Age,Charlson,Years Smoking,Naxogin Days,Sex_M,ICD-10_C02,ICD-10_C03,ICD-10_C04,ICD-10_C05,ICD-10_C06,...,PET_original_ngtdm_Complexity_128bins,PET_original_ngtdm_Contrast_32bins,PET_original_ngtdm_Contrast_64bins,PET_original_ngtdm_Contrast_128bins,PET_original_ngtdm_Strength_32bins,PET_original_ngtdm_Strength_64bins,PET_original_ngtdm_Strength_128bins,PETparam_SUVpeak,PETparam_MTV,PETparam_TLG
2,60.832877,0,0.0,39.0,1,0,0,0,0,0,...,0.540025,0.710921,0.690858,0.688723,0.800831,0.792568,0.805628,21.616549,7.384,124.870726
4,49.906849,0,4.786027,33.0,0,0,0,0,0,0,...,0.221747,0.520454,0.47966,0.454068,0.813648,0.848662,0.908503,15.296275,3.406,41.554406
5,54.238356,0,0.0,42.0,0,0,0,0,0,0,...,0.159063,0.426206,0.379979,0.368077,0.311601,0.336818,0.351863,14.473272,7.934,86.22842
8,54.687671,0,18.343836,0.0,1,0,0,0,0,0,...,0.08688,0.186535,0.152097,0.136492,0.070294,0.082123,0.093375,10.510859,26.926,205.413389
10,61.728767,0,0.0,35.0,1,0,0,0,0,0,...,0.025393,0.230094,0.174644,0.1574,0.073946,0.091367,0.10241,7.21319,6.041,32.10377
