# HSCT survival data feature engineering

## Notebook set-up

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import power_transform, StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

## 1. Load data

In [2]:
# Read the raw files: the data dictionary plus the training and testing tables
dictionary_df=pd.read_csv('./data/raw/data_dictionary.csv')
training_df=pd.read_csv('./data/raw/train.csv')
testing_df=pd.read_csv('./data/raw/test.csv')

# Set the row IDs aside, then take the ID column out of both tables
training_ids=training_df['ID']
testing_ids=testing_df['ID']
training_df=training_df.drop(columns='ID')
testing_df=testing_df.drop(columns='ID')

# Move the two label columns out of the training table into their own dataframe
label_columns=['efs', 'efs_time']
training_labels_df=training_df[label_columns].copy()
training_df=training_df.drop(columns=label_columns)

print(f'Training features: {training_df.shape}')
print(f'Testing features: {testing_df.shape}')

Training features: (28800, 57)
Testing features: (3, 57)


## 2. Split categorical and numerical features

In [3]:
# Use the data dictionary to find which variables are categorical and which are numerical
is_categorical=dictionary_df['type'] == 'Categorical'
is_numerical=dictionary_df['type'] == 'Numerical'
categorical_feature_names=dictionary_df.loc[is_categorical, 'variable']
numerical_feature_names=dictionary_df.loc[is_numerical, 'variable']

# Take the label columns out of the name lists: they were already
# removed from the feature dataframes when the data was loaded
categorical_feature_names=categorical_feature_names[categorical_feature_names.ne('efs')]
numerical_feature_names=numerical_feature_names[numerical_feature_names.ne('efs_time')]

# Carve each dataframe into a numerical part and a categorical part
training_numerical_df=training_df[numerical_feature_names].copy()
training_categorical_df=training_df[categorical_feature_names].copy()
testing_numerical_df=testing_df[numerical_feature_names].copy()
testing_categorical_df=testing_df[categorical_feature_names].copy()

print(f'Training numerical features: {training_numerical_df.shape}')
print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing numerical features: {testing_numerical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

Training numerical features: (28800, 22)
Training categorical features: (28800, 35)
Testing numerical features: (3, 22)
Testing categorical features: (3, 35)


## 3. Handle missing data

### 3.1. Categorical features

In [4]:
# Replace missing categorical values with the placeholder string 'Missing'
training_categorical_df=training_categorical_df.fillna('Missing')
testing_categorical_df=testing_categorical_df.fillna('Missing')

print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

Training categorical features: (28800, 35)
Testing categorical features: (3, 35)


### 3.2. Numerical features

In [5]:
# Fill in missing numerical values by K-nearest neighbor imputation (3 neighbors,
# uniform weights). The imputer is fitted on the training data only and then
# applied to both the training and testing data.
numerical_columns=training_numerical_df.columns
imputer=KNNImputer(n_neighbors=3, weights='uniform').fit(training_numerical_df)

training_numerical_data=imputer.transform(training_numerical_df)
testing_numerical_data=imputer.transform(testing_numerical_df)

# Wrap the imputed values back into dataframes with the original column names
training_numerical_df=pd.DataFrame(training_numerical_data, columns=numerical_columns)
testing_numerical_df=pd.DataFrame(testing_numerical_data, columns=numerical_columns)

print(f'Training numerical features: {training_numerical_df.shape}')
print(f'Testing numerical features: {testing_numerical_df.shape}')

Training numerical features: (28800, 22)
Testing numerical features: (3, 22)


## 4. Transform numerical features

### 4.1. EFS time label

In [6]:
# Standard scale EFS time (reshaped to a single column for the scaler)
efs_times=np.array(training_labels_df['efs_time']).reshape(-1, 1)
label_scaler=StandardScaler()
label_data=label_scaler.fit_transform(efs_times)

# Save the scaler for later. Create the output directory first: opening the
# pickle file for writing raises FileNotFoundError if './models' does not exist.
Path('./models').mkdir(parents=True, exist_ok=True)

with open('./models/standard_scaler_efs_label.pkl', 'wb') as output_file:
    pickle.dump(label_scaler, output_file)

# Add scaled efs time back to dataframe.
# NOTE: this overwrites the raw values, so re-running this cell on its own
# would refit and re-save the scaler on already-scaled data. Re-run from
# the data loading cell instead.
training_labels_df['efs_time']=label_data

### 4.2. Numerical features

In [7]:
# Standard scale the numerical features. The scaler is fitted on the training
# data only and then applied to both the training and testing data.
feature_scaler=StandardScaler()
feature_scaler.fit(training_numerical_df)
training_numerical_data=feature_scaler.transform(training_numerical_df)
testing_numerical_data=feature_scaler.transform(testing_numerical_df)

# Save the scaler for later. Create the output directory first: opening the
# pickle file for writing raises FileNotFoundError if './models' does not exist.
Path('./models').mkdir(parents=True, exist_ok=True)

with open('./models/standard_scaler_numerical_features.pkl', 'wb') as output_file:
    pickle.dump(feature_scaler, output_file)

# Rebuild the dataframes with the original column names
training_numerical_df=pd.DataFrame(training_numerical_data, columns=training_numerical_df.columns)
testing_numerical_df=pd.DataFrame(testing_numerical_data, columns=testing_numerical_df.columns)

## 5. One-hot encode categorical features

In [8]:
# One-hot encode the categorical features, dropping the first level of each
# feature. The encoder is fitted on the training data only.
#
# handle_unknown='ignore': a category that appears in the testing data but was
# never seen in training is encoded as all zeros for that feature (the same
# encoding as the dropped first level, with a warning) instead of raising a
# ValueError and aborting the transform. Training output is unaffected.
encoder=OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(training_categorical_df)

training_categorical_data=encoder.transform(training_categorical_df)
testing_categorical_data=encoder.transform(testing_categorical_df)

# Rebuild the dataframes using the encoder's generated column names
feature_names=encoder.get_feature_names_out()
training_categorical_df=pd.DataFrame(training_categorical_data, columns=feature_names)
testing_categorical_df=pd.DataFrame(testing_categorical_data, columns=feature_names)

print(f'Training categorical features: {training_categorical_df.shape}')
print(f'Testing categorical features: {testing_categorical_df.shape}')

Training categorical features: (28800, 156)
Testing categorical features: (3, 156)


## 6. Re-combine numerical and categorical features

In [9]:
# Put the numerical and one-hot encoded categorical columns side by side.
# Indexes are reset first so that the rows line up by position.
training_parts=[training_numerical_df, training_categorical_df]
testing_parts=[testing_numerical_df, testing_categorical_df]

training_features_df=pd.concat([part.reset_index(drop=True) for part in training_parts], axis=1)
testing_features_df=pd.concat([part.reset_index(drop=True) for part in testing_parts], axis=1)

print(f'Training features: {training_features_df.shape}')
print(f'Testing features: {testing_features_df.shape}')

Training features: (28800, 178)
Testing features: (3, 178)


## 7. Save results

In [10]:
# Re-attach the ID column that was set aside when the data was loaded
training_features_df['ID']=training_ids
testing_features_df['ID']=testing_ids

# Make sure the output directory exists before writing anything to it
Path('./data/encoded').mkdir(parents=True, exist_ok=True)

# Write each dataframe to its parquet file, leaving the index out
outputs={
    './data/encoded/training_features_df.parquet': training_features_df,
    './data/encoded/training_labels_df.parquet': training_labels_df,
    './data/encoded/testing_features_df.parquet': testing_features_df,
}

for output_path, output_df in outputs.items():
    output_df.to_parquet(output_path, index=False)