# HSCT survival: data cleaning and encoding

## Notebook set-up

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

import configuration as config

# Let long tables (e.g. transposed previews) display up to 500 rows
pd.set_option('display.max_rows', 500)

# Cleaning/encoding options: neighbours used by the KNN imputers, the
# one-hot encoder's drop strategy and the tag used in output file names
knn_neighbors=5
one_hot_drop='first'
collinearity='no-multicollinearity'

# Input data: ID column set as index, missing string placeholders converted
# to nan, ordinal categorical features translated to numerical categorical
# where possible.
# NOTE(review): the name ends in .pkl but the file is read with
# pd.read_parquet in the cells below - confirm the format and consider renaming
translated_features_file=f'{config.DATA_PATH}/processed/01.1-features_translated.pkl'

# Output 1: dummy encoded data with nan encoded as a 'missing' level for the
# true categorical features and KNN imputed for the numerical features
encoded_missing_imputed_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_encoded_missing_imputed_data_df.parquet'

# Output 2: dummy encoded data with nans filled in by KNN imputation for all features
encoded_all_imputed_data_df_file=f'{config.DATA_PATH}/processed/02.1-{collinearity}_encoded_all_imputed_data_df.parquet'

# Feature info: dictionary of feature names keyed by feature type
feature_types_dict_file=f'{config.DATA_PATH}/processed/01.1-feature_type_dict.pkl'

# Fitted imputers and encoders, pickled for later reuse
knn_imputer_numerical_features_file=f'{config.MODELS_PATH}/02.1-KNN_imputer_numerical_features.pkl'
knn_imputer_categorical_features_file=f'{config.MODELS_PATH}/02.1-KNN_imputer_categorical_features.pkl'
one_hot_encoder_nan_encoded_file=f'{config.MODELS_PATH}/02.1-{collinearity}_one_hot_encoder_nan_encoded.pkl'
one_hot_encoder_nan_imputed_file=f'{config.MODELS_PATH}/02.1-{collinearity}_one_hot_encoder_nan_imputed.pkl'

# Read the feature type dictionary. pickle.load can execute arbitrary code,
# so this file must come from a trusted source (an earlier pipeline step)
with open(feature_types_dict_file, 'rb') as input_file:
    feature_types_dict=pickle.load(input_file)

# List the features belonging to each type
print('Feature types:\n')

for feature_type, features in feature_types_dict.items():
    print(f'{feature_type}\n{features}\n')

Feature types:

Interval
['donor_age', 'age_at_hct', 'year_hct']

Ordinal
['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'hla_match_a_high', 'hla_match_b_low', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10', 'dri_score', 'cyto_score', 'cmv_status', 'cyto_score_detail']

Nominal
['psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'tce_imm_match', 'rituximab', 'prod_type', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']

Labels
['efs', 'e

## 1. Base training data

In [2]:
# Read the translated base data and preview the first five rows, one feature per line
data_df=pd.read_parquet(translated_features_file)
data_df.head().T

ID,0,1,2,3,4
dri_score,0,2,0,4,4
psych_disturb,No,No,No,No,No
cyto_score,,3,,3,
diabetes,No,No,No,No,No
hla_match_c_high,,2.0,2.0,2.0,2.0
hla_high_res_8,,8.0,8.0,8.0,8.0
tbi_status,,>cGy,,,
arrhythmia,No,No,No,No,No
hla_low_res_6,6.0,6.0,6.0,6.0,6.0
graft_type,Marrow,Blood,Marrow,Marrow,Blood


In [3]:
# Column dtypes and non-null counts of the base data
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dri_score               26362 non-null  object 
 1   psych_disturb           26592 non-null  object 
 2   cyto_score              19336 non-null  object 
 3   diabetes                26540 non-null  object 
 4   hla_match_c_high        24180 non-null  float64
 5   hla_high_res_8          22971 non-null  float64
 6   tbi_status              28724 non-null  object 
 7   arrhythmia              26480 non-null  object 
 8   hla_low_res_6           25530 non-null  float64
 9   graft_type              28800 non-null  object 
 10  vent_hist               28541 non-null  object 
 11  renal_issue             26748 non-null  object 
 12  pulm_severe             26485 non-null  object 
 13  prim_disease_hct        28800 non-null  object 
 14  hla_high_res_6          23516 non-null  flo

In [4]:
# Summary statistics of the numeric columns, one feature per row
data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hla_match_c_high,24180.0,1.764516,0.431941,0.0,2.0,2.0,2.0,2.0
hla_high_res_8,22971.0,6.876801,1.564313,2.0,6.0,8.0,8.0,8.0
hla_low_res_6,25530.0,5.143322,1.207757,2.0,4.0,6.0,6.0,6.0
hla_high_res_6,23516.0,5.109202,1.214162,0.0,4.0,6.0,6.0,6.0
hla_high_res_10,21637.0,8.61723,1.905125,3.0,7.0,10.0,10.0,10.0
hla_match_dqb1_high,23601.0,1.736876,0.447687,0.0,1.0,2.0,2.0,2.0
hla_nmdp_6,24603.0,5.160346,1.20324,2.0,4.0,6.0,6.0,6.0
hla_match_c_low,26000.0,1.757808,0.435453,0.0,2.0,2.0,2.0,2.0
hla_match_drb1_low,26157.0,1.715296,0.451282,1.0,1.0,2.0,2.0,2.0
hla_match_dqb1_low,24606.0,1.773795,0.42713,0.0,2.0,2.0,2.0,2.0


## 2. Encoded features

### 2.1. Encode NaN as 'missing' for nominal features, KNN impute numerical features

In [5]:
# Reload the translated base data so this cell does not depend on (or mutate)
# the frame shown above and can be re-run safely: data_df is overwritten with
# the encoded result at the end of the cell
data_df=pd.read_parquet(translated_features_file)
print(f'Data: {data_df.shape}')

# Set the labels aside; they are added back unchanged after encoding
label_columns=['efs', 'efs_time']
labels=data_df[label_columns]
data_df=data_df.drop(columns=label_columns)

# Split the data into interval/ordinal and nominal
numerical_features_df=data_df[feature_types_dict['Ordinal']+feature_types_dict['Interval']]
categorical_features_df=data_df[feature_types_dict['Nominal']].copy()
print(f'Numerical data: {numerical_features_df.shape}')
print(f'Categorical data: {categorical_features_df.shape}')

# Translate nan in categorical data to 'missing' so that missingness
# becomes a level of its own when one-hot encoded
categorical_features_df=categorical_features_df.fillna('missing')

# Encode the features
encoder=OneHotEncoder(drop=one_hot_drop, min_frequency=5, handle_unknown='infrequent_if_exist', sparse_output=False)
encoded_feature_data=encoder.fit_transform(categorical_features_df)

# Save the one-hot encoder for later
with open(one_hot_encoder_nan_encoded_file, 'wb') as output_file:
    pickle.dump(encoder, output_file)

# Rebuild the dataframe, keeping the original ID index
categorical_features_df=pd.DataFrame(
    encoded_feature_data,
    columns=encoder.get_feature_names_out(),
    index=data_df.index
)

print(f'Encoded categorical data: {categorical_features_df.shape}')

# Impute missing values in the numerical features
imputer=KNNImputer(n_neighbors=knn_neighbors, weights='uniform')
imputed_numerical_features=imputer.fit_transform(numerical_features_df)

# Save the imputer for later
with open(knn_imputer_numerical_features_file, 'wb') as output_file:
    pickle.dump(imputer, output_file)

# Re-build dataframe, keeping the original ID index
numerical_features_df=pd.DataFrame(
    imputed_numerical_features,
    columns=numerical_features_df.columns,
    index=data_df.index
)

print(f'Imputed numerical data: {numerical_features_df.shape}')

# Set the types
categorical_features_df=categorical_features_df.astype('int32')
numerical_features_df=numerical_features_df.astype('float64')

# An imputed year is a mean over neighbours and may be fractional: round to
# the nearest year before the integer cast, which would otherwise truncate
numerical_features_df['year_hct']=numerical_features_df['year_hct'].round().astype('int32')

# Join categorical and numerical data
data_df=pd.concat([numerical_features_df, categorical_features_df], axis=1)
print(f'Re-combined data: {data_df.shape}')

# Add back the labels
data_df=pd.concat([data_df, labels], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Save it
data_df.to_parquet(encoded_missing_imputed_data_df_file)

Data: (28800, 59)
Numerical data: (28800, 26)
Categorical data: (28800, 31)
Encoded categorical data: (28800, 114)
Imputed numerical data: (28800, 26)
Re-combined data: (28800, 140)
Labeled re-combined data: (28800, 142)



In [6]:
# List every column of the encoded data with its non-null count and dtype
data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 142 columns):
 #    Column                          Non-Null Count  Dtype  
---   ------                          --------------  -----  
 0    hla_match_c_high                28800 non-null  float64
 1    hla_high_res_8                  28800 non-null  float64
 2    hla_low_res_6                   28800 non-null  float64
 3    hla_high_res_6                  28800 non-null  float64
 4    hla_high_res_10                 28800 non-null  float64
 5    hla_match_dqb1_high             28800 non-null  float64
 6    hla_nmdp_6                      28800 non-null  float64
 7    hla_match_c_low                 28800 non-null  float64
 8    hla_match_drb1_low              28800 non-null  float64
 9    hla_match_dqb1_low              28800 non-null  float64
 10   hla_match_a_high                28800 non-null  float64
 11   hla_match_b_low                 28800 non-null  float64
 12   hla_match_a_low      

In [7]:
# Preview the first five encoded rows, one feature per line
data_df.head().T

ID,0,1,2,3,4
hla_match_c_high,2.0,2.0,2.0,2.0,2.0
hla_high_res_8,8.0,8.0,8.0,8.0,8.0
hla_low_res_6,6.0,6.0,6.0,6.0,6.0
hla_high_res_6,6.0,6.0,6.0,6.0,6.0
hla_high_res_10,10.0,10.0,10.0,10.0,10.0
hla_match_dqb1_high,2.0,2.0,2.0,2.0,2.0
hla_nmdp_6,6.0,6.0,6.0,6.0,5.0
hla_match_c_low,2.0,2.0,2.0,2.0,2.0
hla_match_drb1_low,2.0,2.0,2.0,2.0,2.0
hla_match_dqb1_low,2.0,2.0,2.0,2.0,2.0


### 2.2. Label encode, KNN impute everything, then one-hot encode

In [8]:
# Load the datasets
data_df=pd.read_parquet(translated_features_file)
print(f'Data: {data_df.shape}')

# Set the labels aside; they are added back unchanged after encoding
label_columns=['efs', 'efs_time']
labels=data_df[label_columns]
data_df=data_df.drop(columns=label_columns)

# Split the data into numerical/ordinal and true categorical
numerical_features_df=data_df[feature_types_dict['Ordinal']+feature_types_dict['Interval']].copy()
categorical_features_df=data_df[feature_types_dict['Nominal']].copy()
print(f'Numerical data: {numerical_features_df.shape}')
print(f'Categorical data: {categorical_features_df.shape}')

# Label encode categorical features, preserving nans. Levels are numbered
# in value_counts() order, so code '0' is the most frequent level.
# NOTE(review): translation_dicts is not saved anywhere; the pickled
# categorical imputer is fitted on these codes, so the mapping would be
# needed again to apply that imputer to new data
translation_dicts={}

for feature in categorical_features_df.columns:

    feature_level_counts=categorical_features_df[feature].value_counts()
    translation_dict={level: str(i) for i, level in enumerate(feature_level_counts.index)}

    # Every non-missing level is a key of the dictionary, missing values stay nan
    categorical_features_df[feature]=categorical_features_df[feature].map(translation_dict)
    translation_dicts[feature]=translation_dict

# Impute missing values categorical features.
# NOTE(review): the imputed value is a mean of neighbours' level codes, which
# treats the frequency rank of a nominal level as a quantity - confirm that
# this is intended
imputer=KNNImputer(n_neighbors=knn_neighbors, weights='uniform')
imputed_categorical_features=imputer.fit_transform(categorical_features_df)

# Save the imputer for later
with open(knn_imputer_categorical_features_file, 'wb') as output_file:
    pickle.dump(imputer, output_file)

# Re-build dataframe, keeping the original ID index
categorical_features_df=pd.DataFrame(
    imputed_categorical_features,
    columns=categorical_features_df.columns,
    index=data_df.index
)

# Round to nearest int so that every imputed value is a valid level code
categorical_features_df=categorical_features_df.round().astype('int64')
print(f'Imputed categorical data: {categorical_features_df.shape}')

# Get categories back
for feature in categorical_features_df.columns:
    inverse_translation_dict={int(code): level for level, code in translation_dicts[feature].items()}
    categorical_features_df[feature]=categorical_features_df[feature].map(inverse_translation_dict)

# Every code must have translated back to one of the original levels
assert not categorical_features_df.isna().any().any(), 'Level code without a matching category'

# Encode the categorical features
encoder=OneHotEncoder(drop=one_hot_drop, min_frequency=5, handle_unknown='infrequent_if_exist', sparse_output=False)
encoded_imputed_categorical_features=encoder.fit_transform(categorical_features_df)

# Save the one-hot encoder for later
with open(one_hot_encoder_nan_imputed_file, 'wb') as output_file:
    pickle.dump(encoder, output_file)

# Re-build dataframe, keeping the original ID index
categorical_features_df=pd.DataFrame(
    encoded_imputed_categorical_features,
    columns=encoder.get_feature_names_out(),
    index=data_df.index
)

print(f'Encoded imputed categorical data: {categorical_features_df.shape}')

# Impute missing values in the numerical features. This imputer is not
# pickled here; it is fitted on the same numerical columns of the same file
# as the one saved to knn_imputer_numerical_features_file in section 2.1
imputer=KNNImputer(n_neighbors=knn_neighbors, weights='uniform')
imputed_numerical_features=imputer.fit_transform(numerical_features_df)

# Re-build dataframe, keeping the original ID index
numerical_features_df=pd.DataFrame(
    imputed_numerical_features,
    columns=numerical_features_df.columns,
    index=data_df.index
)

print(f'Imputed numerical data: {numerical_features_df.shape}')

# Set the types
categorical_features_df=categorical_features_df.astype('int32')
numerical_features_df=numerical_features_df.astype('float64')

# An imputed year is a mean over neighbours and may be fractional: round to
# the nearest year before the integer cast, which would otherwise truncate
numerical_features_df['year_hct']=numerical_features_df['year_hct'].round().astype('int32')

# Join categorical and numerical data
data_df=pd.concat([numerical_features_df, categorical_features_df], axis=1)
print(f'Re-combined data: {data_df.shape}')

# Add back the labels
data_df=pd.concat([data_df, labels], axis=1)
print(f'Labeled re-combined data: {data_df.shape}\n')

# Save it
data_df.to_parquet(encoded_all_imputed_data_df_file)

Data: (28800, 59)
Numerical data: (28800, 26)
Categorical data: (28800, 31)
Imputed categorical data: (28800, 31)
Encoded imputed categorical data: (28800, 87)
Imputed numerical data: (28800, 26)
Re-combined data: (28800, 113)
Labeled re-combined data: (28800, 115)



In [9]:
# List every column of the fully imputed, encoded data with its non-null count and dtype
data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 28800 entries, 0 to 28799
Data columns (total 115 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  
 0    hla_match_c_high               28800 non-null  float64
 1    hla_high_res_8                 28800 non-null  float64
 2    hla_low_res_6                  28800 non-null  float64
 3    hla_high_res_6                 28800 non-null  float64
 4    hla_high_res_10                28800 non-null  float64
 5    hla_match_dqb1_high            28800 non-null  float64
 6    hla_nmdp_6                     28800 non-null  float64
 7    hla_match_c_low                28800 non-null  float64
 8    hla_match_drb1_low             28800 non-null  float64
 9    hla_match_dqb1_low             28800 non-null  float64
 10   hla_match_a_high               28800 non-null  float64
 11   hla_match_b_low                28800 non-null  float64
 12   hla_match_a_low                2880

In [10]:
# Preview the first five fully imputed, encoded rows, one feature per line
data_df.head().T

ID,0,1,2,3,4
hla_match_c_high,2.0,2.0,2.0,2.0,2.0
hla_high_res_8,8.0,8.0,8.0,8.0,8.0
hla_low_res_6,6.0,6.0,6.0,6.0,6.0
hla_high_res_6,6.0,6.0,6.0,6.0,6.0
hla_high_res_10,10.0,10.0,10.0,10.0,10.0
hla_match_dqb1_high,2.0,2.0,2.0,2.0,2.0
hla_nmdp_6,6.0,6.0,6.0,6.0,5.0
hla_match_c_low,2.0,2.0,2.0,2.0,2.0
hla_match_drb1_low,2.0,2.0,2.0,2.0,2.0
hla_match_dqb1_low,2.0,2.0,2.0,2.0,2.0
