# HSCT survival: data cleaning and encoding

## Notebook set-up

In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import TargetEncoder
import configuration as config
import functions.encoding as encode_funcs

pd.set_option('display.max_rows', 500)


# Some cleaning/encoding options
knn_neighbors=5
one_hot_drop, collinearity='first', 'no-multicollinearity'

# Base data input file: ID column set as index, missing string placeholders converted to nan
# ordinal categorical features translated to numerical categorical where possible
translated_features_file=f'{config.DATA_PATH}/processed/01.1-features_translated.pkl'

# Feature data type definition file
feature_types_dict_file=f'{config.DATA_PATH}/processed/01.1-feature_type_dict.pkl'

# Nominal and ordinal features ordinal encoded.
# NAN values encoded as 'missing' for categorical features
# and KNN imputed for numerical features.
ordinal_all_nan_encoded_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_ordinal_all_nan_encoded_data_df.parquet'

# Nominal and ordinal features ordinal encoded encoded.
# NANs filled in by KNN imputation for all features.
ordinal_all_nan_imputed_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_ordinal_all_nan_imputed_data_df.parquet'

# Nominal features one hot encoded, ordinal features ordinal encoded. 
# Data with with NAN values encoded as missing for categorical features 
# and KNN imputed for numerical features.
one_hot_ordinal_nan_encoded_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_one_hot_ordinal_nan_encoded_data_df.parquet'

# Nominal features one hot encoded, ordinal features ordinal encoded.
# NANs filled in by KNN imputation for all features.
one_hot_ordinal_nan_imputed_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_one_hot_ordinal_nan_imputed_data_df.parquet'

# All ordinal and nominal features target encoded on EFS
binary_target_encoded_data_file=f'{config.DATA_PATH}/processed/02.1-binary_target_encoded_data_df'

# All ordinal and nominal features target encoded on EFS time
continuous_target_encoded_data_file=f'{config.DATA_PATH}/processed/02.1-binary_target_encoded_data_df'

# Save dataset definitions
datasets={
    'Nominal one-hot/ordinal encoded, NANs encoded':one_hot_ordinal_nan_encoded_data_df_file,
    'Nominal one-hot/ordinal encoded, NANs imputed':one_hot_ordinal_nan_imputed_data_df_file,
    'All ordinal encoded, NAN encoded':ordinal_all_nan_encoded_data_df_file,
    'All ordinal encoded, NAN imputed':ordinal_all_nan_imputed_data_df_file,
    'Binary target encoded':binary_target_encoded_data_file,
    'Continuous target encoded':continuous_target_encoded_data_file
}

# Dataset definition file
datasets_file=f'{config.DATA_PATH}/processed/02.1-dataset_definitions.pkl'

# Save the dataset metadata
with open(datasets_file, 'wb') as output_file:
    pickle.dump(datasets, output_file)

## 1. Input data

### 1.1. Feature type definitions

In [2]:
# Load the feature data type definitions
with open(feature_types_dict_file, 'rb') as input_file:
    feature_types_dict=pickle.load(input_file)

print('Feature types:\n')

for feature_type, features in feature_types_dict.items():
    print(f'{feature_type}\n{features}\n')

Feature types:

Interval
['donor_age', 'age_at_hct']

Ordinal
['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'hla_match_b_low', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10', 'dri_score', 'cyto_score', 'cmv_status', 'cyto_score_detail']

Nominal
['psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'tce_imm_match', 'rituximab', 'prod_type', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']

Labels
['efs', 'e

### 1.2. Data

In [3]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

## 2. Encoding and NAN handling schemes

### 2.1. One-hot encode nominal features with missing value string

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal features
encoded_nominal_features_df=encode_funcs.one_hot_nan_encoded(
    data_df,
    feature_types_dict['Nominal']
)

# Encode the ordinal features
encoded_ordinal_features_df=encode_funcs.ordinal_encode_nan_imputed(
    data_df,
    feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_nominal_features_df, encoded_ordinal_features_df, cleaned_interval_features_df], axis=1)
print(f'Re-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(one_hot_ordinal_nan_encoded_data_df_file)

# Inspect
result_df.info(verbose=True, show_counts=True)

### 2.2. One-hot encode nominal features with missing value imputation

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal features
encoded_nominal_features_df=encode_funcs.one_hot_encode_nan_imputed(
    data_df,
    feature_types_dict['Nominal']
)

# Encode the ordinal features
encoded_ordinal_features_df=encode_funcs.ordinal_encode_nan_imputed(
    data_df,
    feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_nominal_features_df, encoded_ordinal_features_df, cleaned_interval_features_df], axis=1)
print(f'\nRe-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(one_hot_ordinal_nan_imputed_data_df_file)

# Inspect
result_df.info(verbose=True, show_counts=True)


One-hot encoding input data: (28800, 57)
Feature data: (28800, 31)
On-hot encoded, imputed feature data: (28800, 114)

Ordinal encoding input data: (28800, 57)
Feature data: (28800, 24)
Ordinal encoded feature data: (28800, 24)
Imputed, ordinal encoded feature data: (28800, 24)

Imputation input data: (28800, 22)
Imputed numerical data: (28800, 2)

Re-combined data: (28800, 140)
Labeled re-combined data: (28800, 57)

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 143 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    psych_disturb_1.0              28800 non-null  int32  
 1    psych_disturb_2.0              28800 non-null  int32  
 2    diabetes_1.0                   28800 non-null  int32  
 3    diabetes_2.0                   28800 non-null  int32  
 4    tbi_status_1.0                 28800 non-null  int32  
 5    tbi_status_2.0                 28800 non

### 2.3. Label encode nominal and ordinal features with 'missing' level

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal & ordinal features
encoded_categorical_features_df=encode_funcs.ordinal_encode_nan_encoded(
    data_df,
    feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_categorical_features_df, cleaned_interval_features_df], axis=1)
print(f'Re-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(ordinal_all_nan_encoded_data_df_file)

# Inspect
result_df.info(verbose=True, show_counts=True)



Ordinal encoding input data: (28800, 57)
Feature data: (28800, 55)
Ordinal encoded feature data: (28800, 55)

Imputation input data: (28800, 22)
Imputed numerical data: (28800, 2)
Re-combined data: (28800, 57)
Labeled re-combined data: (28800, 57)

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           28800 non-null  int32  
 1   diabetes                28800 non-null  int32  
 2   tbi_status              28800 non-null  int32  
 3   arrhythmia              28800 non-null  int32  
 4   graft_type              28800 non-null  int32  
 5   vent_hist               28800 non-null  int32  
 6   renal_issue             28800 non-null  int32  
 7   pulm_severe             28800 non-null  int32  
 8   prim_disease_hct        28800 non-null  int32  
 9   tce_imm_match           28800 non-null  int32  
 10  ritux

### 2.4. Label encode nominal and ordinal features with NAN imputation

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal & ordinal features
encoded_categorical_features_df=encode_funcs.ordinal_encode_nan_imputed(
    data_df,
    feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_categorical_features_df, cleaned_interval_features_df], axis=1)
print(f'Re-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(ordinal_all_nan_imputed_data_df_file)

# Inspect
result_df.info(verbose=True, show_counts=True)


Ordinal encoding input data: (28800, 57)
Feature data: (28800, 55)
Ordinal encoded feature data: (28800, 55)
Imputed, ordinal encoded feature data: (28800, 55)

Imputation input data: (28800, 22)
Imputed numerical data: (28800, 2)
Re-combined data: (28800, 57)
Labeled re-combined data: (28800, 57)

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           28800 non-null  int32  
 1   diabetes                28800 non-null  int32  
 2   tbi_status              28800 non-null  int32  
 3   arrhythmia              28800 non-null  int32  
 4   graft_type              28800 non-null  int32  
 5   vent_hist               28800 non-null  int32  
 6   renal_issue             28800 non-null  int32  
 7   pulm_severe             28800 non-null  int32  
 8   prim_disease_hct        28800 non-null  int32  
 9   tce_imm

### 2.5. Binary target encode everything on efs

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal & ordinal features
encoder=TargetEncoder()
encoded_categorical_features=encoder.fit_transform(data_df[feature_types_dict['Nominal'] + feature_types_dict['Ordinal']], labels_df['efs'])

encoded_categorical_features_df=pd.DataFrame(
    encoded_categorical_features,
    columns=feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_categorical_features_df, cleaned_interval_features_df], axis=1)
print(f'Re-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(binary_target_encoded_data_file)

# Inspect
result_df.info(verbose=True, show_counts=True)


Imputation input data: (28800, 22)
Imputed numerical data: (28800, 2)
Re-combined data: (28800, 57)
Labeled re-combined data: (28800, 57)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           28800 non-null  float64
 1   diabetes                28800 non-null  float64
 2   tbi_status              28800 non-null  float64
 3   arrhythmia              28800 non-null  float64
 4   graft_type              28800 non-null  float64
 5   vent_hist               28800 non-null  float64
 6   renal_issue             28800 non-null  float64
 7   pulm_severe             28800 non-null  float64
 8   prim_disease_hct        28800 non-null  float64
 9   tce_imm_match           28800 non-null  float64
 10  rituximab               28800 non-null  float64
 11  prod_type               28800 non-null  float64
 12  cond

### 2.6. Continuous target encode everything on efs_time

In [None]:
# Load the dataset
data_df=pd.read_parquet(translated_features_file)

# Remove and preserve the labels
labels_df=data_df[['efs','efs_time']]
data_df.drop(['efs','efs_time'], axis=1, inplace=True)

# Save the unencoded 'race_group' feature
race_group=data_df['race_group']

# Encode the nominal & ordinal features
encoder=TargetEncoder()
encoded_categorical_features=encoder.fit_transform(data_df[feature_types_dict['Nominal'] + feature_types_dict['Ordinal']], labels_df['efs_time'])

encoded_categorical_features_df=pd.DataFrame(
    encoded_categorical_features,
    columns=feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
)

# Clean NANs in the interval features
cleaned_interval_features_df=encode_funcs.impute_numerical_features(
    data_df,
    feature_types_dict['Interval']
)

# Join the data back together
result_df=pd.concat([encoded_categorical_features_df, cleaned_interval_features_df], axis=1)
print(f'Re-combined data: {result_df.shape}')

# Add back the labels
result_df=pd.concat([result_df, labels_df], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Add back un-encoded race group
result_df['race_group']=race_group

# Save
result_df.to_parquet(continuous_target_encoded_data_file)

# Inspect
result_df.info(verbose=True, show_counts=True)