# Extracted Feature Processing 

In [1]:
import os
import re

import numpy as np
import pandas as pd

In [2]:
# Reference array the feature-table indices are checked against further down.
patient_id_path = './../../../data_source/patient_id.npy'
true_index = np.load(patient_id_path)

# Preview the first few entries.
true_index[:5]

array([ 2,  4,  5,  8, 10])

In [3]:
def get_file_name(path_to_file):
    """Return a file name or path with its final extension removed.

    Args:
        path_to_file (str): File name or path, e.g. 'ct_no_filter30.csv'.

    Returns:
        str: The input without its trailing extension. Dots elsewhere in
            the path (for instance a leading './' or '../') are preserved.
    """
    # splitext only strips the last extension, so relative paths that
    # contain several dots no longer break the two-value unpacking.
    name, _ = os.path.splitext(path_to_file)
    return name


def get_discr_level(path_to_file):
    """Return the first run of digits found in a file name.

    Args:
        path_to_file (str): File name such as 'ct_no_filter30.csv'. The whole
            string is searched, so digits in directory names would match
            first if a full path is passed.

    Returns:
        str: The first group of consecutive digits, as a string (not an int).

    Raises:
        IndexError: If the input contains no digits.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.findall(r'\d+', path_to_file)[0]

In [4]:
# Directory with the raw extracted features and directory receiving the
# prepared feature tables.
raw_source = './../../../data_source/radiomic_features/no_filter/'
prep_source = './../../../data_source/to_analysis/no_filter/'

# Pattern used to select the feature columns to keep.
target_class = 'original'

# Logs of dropped features live in a sub-directory of the raw source.
error_dir = raw_source + 'errors/'

# Numeric suffixes of the input files (presumably the discretisation
# levels, going by `get_discr_level` -- confirm).
discr_levels = (30, 50, 70, 90, 110, 130)

# One CSV file per modality and level.
ct_labels = ['ct_no_filter{}.csv'.format(level) for level in discr_levels]
pet_labels = ['pet_no_filter{}.csv'.format(level) for level in discr_levels]

In [5]:
# Read every CT/PET feature file and keep only the columns whose label
# matches `target_class`. The fourth CSV column (index_col=3) becomes the
# row index of each table.
ct_data = []
pet_data = []
for ct_label, pet_label in zip(ct_labels, pet_labels):

    ct_path = os.path.join(raw_source, ct_label)
    ct_frame = pd.read_csv(ct_path, index_col=3)
    ct_data.append(ct_frame.filter(regex=target_class))

    pet_path = os.path.join(raw_source, pet_label)
    pet_frame = pd.read_csv(pet_path, index_col=3)
    pet_data.append(pet_frame.filter(regex=target_class))

In [6]:
# Print the (rows, columns) shape of each CT/PET pair to verify that every
# file yielded the same number of samples and features.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    print(ct_dset.shape, pet_dset.shape)

(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)
(198, 92) (198, 92)


In [7]:
# Compare the value matrices of the first CT and the first PET feature set.
# NOTE(review): this compares every retained column, not only shape
# features -- confirm that this is the intended sanity check.
first_ct_values = ct_data[0].values
first_pet_values = pet_data[0].values
np.array_equal(first_ct_values, first_pet_values)

False

In [8]:
# Verify that every table is indexed exactly by the reference patient IDs.
# For each pair the CT result is printed first, then the PET result.
for ct_dset, pet_dset in zip(ct_data, pet_data):
    for dset in (ct_dset, pet_dset):
        print(np.array_equal(dset.index.values, true_index))

True
True
True
True
True
True
True
True
True
True
True
True


In [9]:
verbose = 1


def _drop_and_log(data, columns, modality, reason, file_name):
    """Drop `columns` from `data` in place and log their labels to disk.

    Does nothing when `columns` is empty. Otherwise the labels are
    optionally printed, written to `<error_dir>/<reason>_<file_name>` and
    removed from `data`.

    Args:
        data (pd.DataFrame): Feature table, modified in place.
        columns (array-like): Labels of the columns to remove.
        modality (str): Prefix used in the printed message ('CT' or 'PET').
        reason (str): Why the columns are dropped ('redundant' or
            'missing'); used in the message and in the log file name.
        file_name (str): Name of the source file the table was read from.
    """
    if len(columns) == 0:
        return

    if verbose > 0:
        print('{} {}: '.format(modality, reason), columns)

    log_path = os.path.join(error_dir, '{}_{}'.format(reason, file_name))
    pd.Series(columns).to_csv(log_path)
    data.drop(columns, axis=1, inplace=True)


# Make sure the output locations exist before anything is written.
os.makedirs(error_dir, exist_ok=True)
os.makedirs(prep_source, exist_ok=True)

# Save data to disk.
# NOTE: the tables in `ct_data`/`pet_data` are renamed and pruned in place,
# so re-running this cell without reloading would prefix the labels again.
for ct_label, pet_label, ct_set, pet_set in zip(
    ct_labels, pet_labels, ct_data, pet_data
):
    # Update column labels with the modality and the number in the file name.
    ct_prefix = 'CT{}'.format(get_discr_level(ct_label))
    pet_prefix = 'PET{}'.format(get_discr_level(pet_label))
    ct_set.columns = [
        '{}_{}'.format(ct_prefix, label) for label in ct_set.columns
    ]
    pet_set.columns = [
        '{}_{}'.format(pet_prefix, label) for label in pet_set.columns
    ]

    # Drop redundant (zero variance) columns.
    ct_redundant = ct_set.columns[ct_set.var() == 0.0].values
    pet_redundant = pet_set.columns[pet_set.var() == 0.0].values
    _drop_and_log(ct_set, ct_redundant, 'CT', 'redundant', ct_label)
    _drop_and_log(pet_set, pet_redundant, 'PET', 'redundant', pet_label)

    # Drop columns with missing values (evaluated after the step above).
    ct_miss = ct_set.columns[ct_set.isnull().any()].values
    pet_miss = pet_set.columns[pet_set.isnull().any()].values
    _drop_and_log(ct_set, ct_miss, 'CT', 'missing', ct_label)
    # The PET log goes to its own file; it was previously written to the CT
    # log path, clobbering that file or failing when no CT column was missing.
    _drop_and_log(pet_set, pet_miss, 'PET', 'missing', pet_label)

    # Save data sets to disk.
    ct_set.to_csv(os.path.join(prep_source, ct_label))
    pet_set.to_csv(os.path.join(prep_source, pet_label))

CT missing:  ['CT30_original_ngtdm_Contrast']
CT missing:  ['CT50_original_ngtdm_Contrast']
CT missing:  ['CT70_original_ngtdm_Contrast']
CT missing:  ['CT90_original_ngtdm_Contrast']
CT missing:  ['CT110_original_ngtdm_Contrast']
CT missing:  ['CT130_original_ngtdm_Contrast']
