# HSCT survival: data cleaning and encoding

## Notebook set-up

In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer

import configuration as config

# Input data files
dictionary_file=f'{config.DATA_PATH}/raw/data_dictionary.csv'
training_file=f'{config.DATA_PATH}/raw/train.csv'
testing_file=f'{config.DATA_PATH}/raw/test.csv'

# Training data with no manipulations of any kind, except ID column set as index
# and missing string placeholders converted to nan
training_features_df_file=f'{config.DATA_PATH}/processed/02.1-training_features_df.parquet'
training_labels_df_file=f'{config.DATA_PATH}/processed/02.1-training_labels_df.parquet'

# Dummy encoded data with no other manipulations
encoded_training_features_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_training_features_df.parquet'
encoded_training_labels_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_training_labels_df.parquet'

# Dummy encoded data with NAN rows dropped
encoded_cleaned_training_features_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_cleaned_training_features_df.parquet'
encoded_cleaned_training_labels_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_cleaned_training_labels_df.parquet'

# Dummy encoded data with with NAN values encoded as missing for 
# true categorical features and KNN imputed for numerical features
encoded_missing_imputed_training_features_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_missing_imputed_training_features_df.parquet'
encoded_missing_imputed_training_labels_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_missing_imputed_training_labels_df.parquet'

# Dummy encoded data with NANs filled in by KNN imputation for all features
encoded_all_imputed_training_features_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_all_imputed_training_features_df.parquet'
encoded_all_imputed_training_labels_df_file=f'{config.DATA_PATH}/processed/02.1-encoded_all_imputed_training_labels_df.parquet'

# Feature info files
feature_types_dict_file=f'{config.DATA_PATH}/processed/01.1-feature_type_dict.pkl'
feature_value_translation_dicts_file=f'{config.DATA_PATH}/processed/01.1-feature_value_translation_dicts.pkl'
nan_placeholders_dict_file=f'{config.DATA_PATH}/processed/01.1-nan_placeholders_list.pkl'

# Model files
knn_imputer_file=f'{config.MODELS_PATH}/02.1-KNN_imputer.pkl'
one_hot_encoder_all_features_raw_file=f'{config.MODELS_PATH}/02.1-one_hot_encoder_all_features_raw.pkl'
standard_scaler_file=f'{config.MODELS_PATH}/02.1-standard_scaler.pkl'

## 1. Load data

In [2]:
# Load the datasets
dictionary_df=pd.read_csv(dictionary_file)
training_df=pd.read_csv(training_file)

# Set the ID column as the index
training_df.set_index('ID', drop=True, inplace=True)

# Remove the labels from the training dataframe
training_labels_df=training_df[['efs', 'efs_time']].copy()
training_features_df=training_df.drop(['efs', 'efs_time'], axis=1)

# Save
training_features_df.to_parquet(training_features_df_file)
training_labels_df.to_parquet(training_labels_df_file)

print(f'Training data: {training_df.shape}')
print(f'Training features: {training_features_df.shape}')
print(f'Training labels: {training_labels_df.shape}')

Training data: (28800, 59)
Training features: (28800, 57)
Training labels: (28800, 2)


## 2. Encode features

In [3]:
# Encode the features
encoder=OneHotEncoder(drop='first', sparse_output=False)
encoded_training_data=encoder.fit_transform(training_features_df)

# Save the one-hot encoder for later
with open(one_hot_encoder_all_features_raw_file, 'wb') as output_file:
    pickle.dump(encoder, output_file)

# Rebuild the dataframe
feature_names=encoder.get_feature_names_out()

encoded_training_features_df=pd.DataFrame(
    encoded_training_data,
    columns=feature_names
)

print(f'Encoded training features: {encoded_training_features_df.shape}')
encoded_training_features_df.info()

Encoded training features: (28800, 43336)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Columns: 43336 entries, dri_score_High - TED AML case <missing cytogenetics to hla_low_res_10_nan
dtypes: float64(43336)
memory usage: 9.3 GB


In [4]:
# Get lists of categorical and numerical column names
categorical_feature_names=dictionary_df['variable'][dictionary_df['type'] == 'Categorical']
numerical_feature_names=dictionary_df['variable'][dictionary_df['type'] == 'Numerical']

# Remove the feature column from the column names lists
categorical_feature_names=categorical_feature_names[categorical_feature_names != 'efs']
numerical_feature_names=numerical_feature_names[numerical_feature_names != 'efs_time']

# Split the training and testing dataframes
training_categorical_df=training_df[categorical_feature_names].copy()
training_numerical_df=training_df[numerical_feature_names].copy()
testing_categorical_df=testing_df[categorical_feature_names].copy()
testing_numerical_df=testing_df[numerical_feature_names].copy()

print(f'Training numerical features: {training_numerical_df.shape}')
print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing numerical features: {testing_numerical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

NameError: name 'testing_df' is not defined

## 3. Handle missing data

### 3.1. Categorical features

In [None]:
training_categorical_df.fillna('Missing', inplace=True)
testing_categorical_df.fillna('Missing', inplace=True)

print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

### 3.2. Numerical features

In [None]:
# Use K-nearest neighbor imputation to fill in missing data
imputer=KNNImputer(n_neighbors=3, weights='uniform')
imputer.fit(training_numerical_df)

training_numerical_data=imputer.transform(training_numerical_df)
testing_numerical_data=imputer.transform(testing_numerical_df)

# Save the imputer for later
with open(knn_imputer_file, 'wb') as output_file:
    pickle.dump(imputer, output_file)

# Re-build dataframes
training_numerical_df=pd.DataFrame(
    training_numerical_data, 
    columns=training_numerical_df.columns
)

testing_numerical_df=pd.DataFrame(
    testing_numerical_data,
    columns=testing_numerical_df.columns
)

print(f'Training numerical features: {training_numerical_df.shape}')
print(f'Testing numerical features: {testing_numerical_df.shape}')

## 4. One-hot encode categorical features

In [None]:
# Encode the features
encoder=OneHotEncoder(drop='first', sparse_output=False)
encoder.fit(training_categorical_df)

training_categorical_data=encoder.transform(training_categorical_df)
testing_categorical_data=encoder.transform(testing_categorical_df)

# Save the one-hot encoder for later
with open(one_hot_encoder_file, 'wb') as output_file:
    pickle.dump(encoder, output_file)

# Rebuild the dataframes
feature_names=encoder.get_feature_names_out()

training_categorical_df=pd.DataFrame(
    training_categorical_data,
    columns=feature_names
)

testing_categorical_df=pd.DataFrame(
    testing_categorical_data,
    columns=feature_names
)

print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

## 5. Re-combine numerical and categorical features

In [None]:
training_features_df=pd.concat(
    [
        training_numerical_df.reset_index(drop=True), 
        training_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

testing_features_df=pd.concat(
    [
        testing_numerical_df.reset_index(drop=True), 
        testing_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

print(f'Training features: {training_features_df.shape}')
print(f'Testing features: {testing_features_df.shape}')

## 6. Scale features

In [None]:
scaler=StandardScaler()
scaler.fit(training_features_df)

training_data=scaler.transform(training_features_df)
testing_data=scaler.transform(testing_features_df)

training_features_df=pd.DataFrame(
    training_data,
    columns=training_features_df.columns
)

testing_features_df=pd.DataFrame(
    testing_data,
    columns=testing_features_df.columns
)

with open(standard_scaler_file, 'wb') as output_file:
    pickle.dump(scaler, output_file)

## 7. Transform and scale labels

In [None]:
training_labels_df['efs_time']=np.log(training_labels_df['efs_time'])

scaler=StandardScaler()

training_labels_df['efs_time']=scaler.fit_transform(
    np.array(training_labels_df['efs_time']).reshape(-1, 1)
)

## 8. Save results

In [None]:
# Create data directory if needed
Path(f'{config.DATA_PATH}/processed').mkdir(parents=True, exist_ok=True)

# Add back ID column
training_features_df['ID']=training_ids
testing_features_df['ID']=testing_ids

training_features_df.to_parquet(training_features_df_file, index=False)
training_labels_df.to_parquet(training_labels_df_file, index=False)
testing_features_df.to_parquet(testing_features_df_file, index=False)