# HSCT survival: data cleaning and encoding

## 1. Notebook set-up

### 1.1. Imports & options

In [1]:
# PyPI imports
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split, ShuffleSplit, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier

# Internal imports
import configuration as config
import functions.encoding as encode_funcs
import functions.helper as helper_funcs

pd.set_option('display.max_rows', 500)

### 1.2. Parameters

In [2]:
# Run options: the dataset-building cells below only execute when this flag is set
rebuild_datasets=True

# Cleaning/encoding options.
# NOTE(review): knn_neighbors and one_hot_drop are not referenced by any cell visible
# in this notebook — presumably meant for the encoding helpers; confirm they are used.
knn_neighbors=5
one_hot_drop='first'

# Tag embedded in the ordinal/one-hot output file names
collinearity='no-multicollinearity'

### 1.3. Files

In [3]:
# --- Input files ---

# Base feature table: ID column set as index, missing-value placeholder strings
# converted to nan, ordinal categoricals translated to numerical categoricals
# where possible
translated_features_file=f'{config.PROCESSED_DATA}/01.1-features_translated.pkl'

# Feature data type definitions
feature_types_dict_file=f'{config.PROCESSED_DATA}/01.1-feature_type_dict.pkl'

# --- Output files: one pickle per encoding / NAN handling scheme ---

# Nominal and ordinal features both ordinal encoded; NANs encoded as 'missing'
# for categorical features and KNN imputed for numerical features
ordinal_all_nan_encoded_data_df_file=f'{config.PROCESSED_DATA}/02.1-{collinearity}_ordinal_all_nan_encoded_data_df.pkl'

# Nominal and ordinal features both ordinal encoded; NANs KNN imputed for all features
ordinal_all_nan_imputed_data_df_file=f'{config.PROCESSED_DATA}/02.1-{collinearity}_ordinal_all_nan_imputed_data_df.pkl'

# Nominal features one-hot encoded, ordinal features ordinal encoded; NANs encoded
# as missing for categorical features and KNN imputed for numerical features
one_hot_ordinal_nan_encoded_data_df_file=f'{config.PROCESSED_DATA}/02.1-{collinearity}_one_hot_ordinal_nan_encoded_data_df.pkl'

# Nominal features one-hot encoded, ordinal features ordinal encoded; NANs KNN
# imputed for all features
one_hot_ordinal_nan_imputed_data_df_file=f'{config.PROCESSED_DATA}/02.1-{collinearity}_one_hot_ordinal_nan_imputed_data_df.pkl'

# All ordinal and nominal features target encoded on EFS
binary_target_encoded_data_file=f'{config.PROCESSED_DATA}/02.1-binary_target_encoded_data_df.pkl'

# All ordinal and nominal features target encoded on EFS time
continuous_target_encoded_data_file=f'{config.PROCESSED_DATA}/02.1-continuous_target_encoded_data_df.pkl'

# --- Dataset definitions ---

# Where the description -> file path lookup is stored
datasets_file=f'{config.PROCESSED_DATA}/02.1-dataset_definitions.pkl'

# Human-readable description of each dataset mapped to the pickle that holds it
datasets={
    'Nominal one-hot/ordinal encoded, NANs encoded': one_hot_ordinal_nan_encoded_data_df_file,
    'Nominal one-hot/ordinal encoded, NANs imputed': one_hot_ordinal_nan_imputed_data_df_file,
    'All ordinal encoded, NAN encoded': ordinal_all_nan_encoded_data_df_file,
    'All ordinal encoded, NAN imputed': ordinal_all_nan_imputed_data_df_file,
    'Binary target encoded': binary_target_encoded_data_file,
    'Continuous target encoded': continuous_target_encoded_data_file,
}

# Persist the lookup (written on every run, independent of rebuild_datasets)
with open(datasets_file, 'wb') as definitions_file:
    pickle.dump(datasets, definitions_file)

## 2. Input data

### 2.1. Feature type definitions

In [4]:
# Read the feature data type definitions: a dict of feature type -> list of column names.
# The file is a pickle, so only load it from a trusted location.
with open(feature_types_dict_file, 'rb') as definitions_file:
    feature_types_dict=pickle.load(definitions_file)

print('Feature types:\n')

# Show each feature type followed by the columns assigned to it
for type_name in feature_types_dict:
    print(f'{type_name}\n{feature_types_dict[type_name]}\n')

Feature types:

Interval
['donor_age', 'age_at_hct']

Ordinal
['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'hla_match_b_low', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10', 'dri_score', 'cyto_score', 'cmv_status', 'cyto_score_detail']

Nominal
['psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'tce_imm_match', 'rituximab', 'prod_type', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']

Labels
['efs', 'e

### 2.2. Data

In [5]:
if rebuild_datasets:

    # Load the translated feature data (IDs in the index, label columns still attached).
    # NOTE(review): the path ends in '.pkl' but is read with read_parquet — confirm the
    # upstream notebook really writes parquet to this file name.
    data_df=pd.read_parquet(translated_features_file)

    # Train test split: 30% held out, seeded so every rebuild gives the same partition
    master_training_df, master_testing_df=train_test_split(data_df, test_size=0.3, random_state=315)

    # Save the un-encoded race group so we can use it in C-index scoring later
    training_race_group=master_training_df['race_group']
    testing_race_group=master_testing_df['race_group']

    # Save the IDs, then replace the index with a clean 0..n-1 range so the frames
    # built by the encoding cells line up row-for-row. reset_index returns a new frame
    # here rather than mutating in place, which keeps the cell safe to re-run.
    training_ids=master_training_df.index
    master_training_df=master_training_df.reset_index(drop=True)
    testing_ids=master_testing_df.index
    master_testing_df=master_testing_df.reset_index(drop=True)

    # Separate the labels (event indicator and event time) from the features
    master_training_labels_df=master_training_df[['efs', 'efs_time']].copy()
    master_training_features_df=master_training_df.drop(columns=['efs', 'efs_time'])
    master_testing_labels_df=master_testing_df[['efs', 'efs_time']].copy()
    master_testing_features_df=master_testing_df.drop(columns=['efs', 'efs_time'])

## 3. Encoding and NAN handling schemes

### 3.1. One-hot encode nominal features with missing value string

In [6]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # One-hot encode the nominal features (project helper; models_path is presumably
    # where the fitted encoder is saved — confirm in functions/encoding.py)
    nominal_training_df, nominal_testing_df=encode_funcs.one_hot_nan_encoded(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Nominal'],
        models_path=config.MODELS_PATH
    )

    # Ordinal encode the ordinal features.
    # NOTE(review): this calls the *_nan_imputed helper, so ordinal NANs appear to be
    # imputed rather than given a 'missing' level, although this dataset is described
    # as 'NANs encoded' (section 3.3 uses ordinal_encode_nan_encoded) — confirm intent.
    ordinal_training_df, ordinal_testing_df=encode_funcs.ordinal_encode_nan_imputed(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Ordinal'],
        models_path=config.MODELS_PATH
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([nominal_training_df, ordinal_training_df, interval_training_df], axis=1)
    testing_df=pd.concat([nominal_testing_df, ordinal_testing_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(one_hot_ordinal_nan_encoded_data_df_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


One-hot encoding input data: (20160, 57)
Feature data: (20160, 31)
One-hot encoded feature data: (8640, 114)

Ordinal encoding input data: (20160, 57)
Feature data: (20160, 24)
Ordinal encoded feature data: (20160, 24)
Imputed, ordinal encoded feature data: (20160, 24)

Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 140)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 140 columns):
 #    Column                                                Non-Null Count  Dtype  
---   ------                                   

### 3.2. One-hot encode nominal features with missing value imputation

In [7]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # One-hot encode the nominal features, with NANs imputed (project helper;
    # models_path is presumably where fitted models are saved — confirm)
    nominal_training_df, nominal_testing_df=encode_funcs.one_hot_encode_nan_imputed(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Nominal'],
        models_path=config.MODELS_PATH
    )

    # Ordinal encode the ordinal features, with NANs imputed
    ordinal_training_df, ordinal_testing_df=encode_funcs.ordinal_encode_nan_imputed(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Ordinal'],
        models_path=config.MODELS_PATH
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([nominal_training_df, ordinal_training_df, interval_training_df], axis=1)
    testing_df=pd.concat([nominal_testing_df, ordinal_testing_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(one_hot_ordinal_nan_imputed_data_df_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


One-hot encoding input data: (20160, 57)
Feature data: (20160, 31)
On-hot encoded, imputed feature data: (20160, 114)

Ordinal encoding input data: (20160, 57)
Feature data: (20160, 24)
Ordinal encoded feature data: (20160, 24)
Imputed, ordinal encoded feature data: (20160, 24)

Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 140)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 140 columns):
 #    Column                         Non-Null Count  Dtype  
---   ------                         --------------  -----  


### 3.3. Ordinal encode nominal and ordinal features with 'missing' level

In [8]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # Ordinal encode the nominal & ordinal features together, using the NAN-encoded
    # helper (models_path is presumably where the fitted encoder is saved — confirm)
    categorical_training_df, categorical_testing_df=encode_funcs.ordinal_encode_nan_encoded(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Nominal'] + feature_types_dict['Ordinal'],
        models_path=config.MODELS_PATH
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([categorical_training_df, interval_training_df], axis=1)
    testing_df=pd.concat([categorical_testing_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(ordinal_all_nan_encoded_data_df_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


Ordinal encoding input data: (20160, 57)
Feature data: (20160, 55)
Ordinal encoded feature data: (20160, 55)

Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 57)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           20160 non-null  int32  
 1   diabetes                20160 non-null  int32  
 2   tbi_status              20160 non-null  int32  
 3   arrhythmia              

### 3.4. Ordinal encode nominal and ordinal features with NAN imputation

In [9]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # Ordinal encode the nominal & ordinal features together, using the NAN-imputed
    # helper (models_path is presumably where fitted models are saved — confirm)
    categorical_training_df, categorical_testing_df=encode_funcs.ordinal_encode_nan_imputed(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Nominal'] + feature_types_dict['Ordinal'],
        models_path=config.MODELS_PATH
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([categorical_training_df, interval_training_df], axis=1)
    testing_df=pd.concat([categorical_testing_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(ordinal_all_nan_imputed_data_df_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


Ordinal encoding input data: (20160, 57)
Feature data: (20160, 55)
Ordinal encoded feature data: (20160, 55)
Imputed, ordinal encoded feature data: (20160, 55)

Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 57)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           20160 non-null  int32  
 1   diabetes                20160 non-null  int32  
 2   tbi_status              20

### 3.5. Binary target encode everything on efs

In [10]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # Get categorical features (nominal first, then ordinal; this order also
    # names the encoded columns below)
    categorical_features=feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
    categorical_training_df=training_df[categorical_features]
    categorical_testing_df=testing_df[categorical_features]

    # Target encode the nominal & ordinal features on the binary 'efs' label.
    # Training rows must go through fit_transform: it applies TargetEncoder's internal
    # cross-fitting, so no row is encoded with statistics that include its own label.
    # fit() followed by transform() on the same rows skips that and leaks the target
    # into the training features. The seed makes the cross-fitting folds reproducible.
    encoder=TargetEncoder(random_state=315)
    encoded_categorical_training_features=encoder.fit_transform(
        categorical_training_df,
        master_training_labels_df['efs']
    )

    # Testing rows use the encodings learned from the full training set
    encoded_categorical_testing_features=encoder.transform(categorical_testing_df)

    # Save the encoder
    with open(f'{config.MODELS_PATH}/01.2-binary_target_encoder.pkl', 'wb') as output_file:
        pickle.dump(encoder, output_file)

    # Rebuild the dataframes (the encoder returns plain arrays)
    encoded_categorical_training_features_df=pd.DataFrame(
        encoded_categorical_training_features,
        columns=categorical_features
    )

    encoded_categorical_testing_features_df=pd.DataFrame(
        encoded_categorical_testing_features,
        columns=categorical_features
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([encoded_categorical_training_features_df, interval_training_df], axis=1)
    testing_df=pd.concat([encoded_categorical_testing_features_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(binary_target_encoded_data_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 57)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           20160 non-null  float64
 1   diabetes                20160 non-null  float64
 2   tbi_status              20160 non-null  float64
 3   arrhythmia              20160 non-null  float64
 4   graft_type              20160 non-null  float64
 5   vent_hist               2016

### 3.6. Continuous target encode everything on efs_time

In [11]:
if rebuild_datasets:

    # Make a copy of the master input testing and training features so the
    # master frames stay untouched for the other encoding schemes
    training_df=master_training_features_df.copy()
    testing_df=master_testing_features_df.copy()

    # Get categorical features (nominal first, then ordinal; this order also
    # names the encoded columns below)
    categorical_features=feature_types_dict['Nominal'] + feature_types_dict['Ordinal']
    categorical_training_df=training_df[categorical_features]
    categorical_testing_df=testing_df[categorical_features]

    # Target encode the nominal & ordinal features on the continuous 'efs_time' label.
    # Training rows must go through fit_transform: it applies TargetEncoder's internal
    # cross-fitting, so no row is encoded with statistics that include its own label.
    # fit() followed by transform() on the same rows skips that and leaks the target
    # into the training features. The seed makes the cross-fitting folds reproducible.
    encoder=TargetEncoder(random_state=315)
    encoded_categorical_training_features=encoder.fit_transform(
        categorical_training_df,
        master_training_labels_df['efs_time']
    )

    # Testing rows use the encodings learned from the full training set
    encoded_categorical_testing_features=encoder.transform(categorical_testing_df)

    # Save the encoder under its own name. This cell previously wrote to
    # '01.2-binary_target_encoder.pkl', overwriting the encoder fitted on 'efs'.
    with open(f'{config.MODELS_PATH}/01.2-continuous_target_encoder.pkl', 'wb') as output_file:
        pickle.dump(encoder, output_file)

    # Rebuild the dataframes (the encoder returns plain arrays)
    encoded_categorical_training_features_df=pd.DataFrame(
        encoded_categorical_training_features,
        columns=categorical_features
    )

    encoded_categorical_testing_features_df=pd.DataFrame(
        encoded_categorical_testing_features,
        columns=categorical_features
    )

    # Clean NANs in the interval features
    interval_training_df, interval_testing_df=encode_funcs.impute_numerical_features(
        training_df=training_df,
        testing_df=testing_df,
        features=feature_types_dict['Interval'],
        models_path=config.MODELS_PATH
    )

    # Join the data back together
    training_df=pd.concat([encoded_categorical_training_features_df, interval_training_df], axis=1)
    testing_df=pd.concat([encoded_categorical_testing_features_df, interval_testing_df], axis=1)
    print(f'Re-combined data: {training_df.shape}\n')

    # pd.concat(axis=1) aligns on the index, so mismatched indexes would silently add
    # NAN-padded rows; fail loudly here instead of saving a corrupted dataset
    assert len(training_df) == len(master_training_labels_df), 'Training features/labels row count mismatch'
    assert len(testing_df) == len(master_testing_labels_df), 'Testing features/labels row count mismatch'

    # Assemble dataset dictionary
    dataset={
        'Training features': training_df,
        'Training labels': master_training_labels_df,
        'Training IDs': list(training_ids.values),
        'Training race group': list(training_race_group.values),
        'Testing features': testing_df,
        'Testing labels': master_testing_labels_df,
        'Testing IDs': list(testing_ids.values),
        'Testing race group': list(testing_race_group.values),
    }

    print('Data dictionary contains:')
    for key, value in dataset.items():
        print(f' {key}: {type(value)}')

    # Save
    with open(continuous_target_encoded_data_file, 'wb') as output_file:
        pickle.dump(dataset, output_file)

    # Inspect
    print()
    training_df.info(verbose=True, show_counts=True)


Imputation input data: (20160, 22)
Imputed numerical data: (20160, 2)
Re-combined data: (20160, 57)

Data dictionary contains:
 Training features: <class 'pandas.core.frame.DataFrame'>
 Training labels: <class 'pandas.core.frame.DataFrame'>
 Training IDs: <class 'list'>
 Training race group: <class 'list'>
 Testing features: <class 'pandas.core.frame.DataFrame'>
 Testing labels: <class 'pandas.core.frame.DataFrame'>
 Testing IDs: <class 'list'>
 Testing race group: <class 'list'>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20160 entries, 0 to 20159
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   psych_disturb           20160 non-null  float64
 1   diabetes                20160 non-null  float64
 2   tbi_status              20160 non-null  float64
 3   arrhythmia              20160 non-null  float64
 4   graft_type              20160 non-null  float64
 5   vent_hist               2016