# HSCT survival data feature engineering

## Notebook set-up

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

## 1. Load data

In [2]:
# Read the data dictionary and the raw training / testing tables
dictionary_df = pd.read_csv('./data/raw/data_dictionary.csv')
training_df = pd.read_csv('./data/raw/train.csv')
testing_df = pd.read_csv('./data/raw/test.csv')

# Pull the ID column out of each table; pop() returns the column and removes
# it from the dataframe in one step
training_ids = training_df.pop('ID')
testing_id = testing_df.pop('ID')

# Keep the labels (efs, efs_time) in their own dataframe and leave only the
# feature columns in the training dataframe
training_labels_df = training_df[['efs', 'efs_time']]
training_df = training_df.drop(columns=['efs', 'efs_time'])

## 2. Split categorical and numerical features

In [3]:
# The data dictionary lists every variable together with its declared type
variable_names = dictionary_df['variable']
variable_types = dictionary_df['type']

# Column names for each type, leaving out the label columns (efs, efs_time)
categorical_feature_names = variable_names[
    (variable_types == 'Categorical') & (variable_names != 'efs')
]
numerical_feature_names = variable_names[
    (variable_types == 'Numerical') & (variable_names != 'efs_time')
]

# Take independent copies of each column group from the training and testing sets
training_categorical_df = training_df[categorical_feature_names].copy()
testing_categorical_df = testing_df[categorical_feature_names].copy()
training_numerical_df = training_df[numerical_feature_names].copy()
testing_numerical_df = testing_df[numerical_feature_names].copy()

## 3. Handle missing data

### 3.1. Categorical features

In [4]:
# Treat an absent categorical value as its own 'Missing' category
training_categorical_df = training_categorical_df.fillna('Missing')
testing_categorical_df = testing_categorical_df.fillna('Missing')

### 3.2. Numerical features

In [5]:
# Impute missing numerical values with a 3-nearest-neighbours imputer
# (uniform weights). The imputer is fitted on the training rows only and the
# same fitted imputer is then applied to both the training and testing sets.
numerical_columns = training_numerical_df.columns
testing_row_count = len(testing_numerical_df)

imputer = KNNImputer(n_neighbors=3, weights="uniform")
imputer.fit(training_numerical_df)

training_numerical_data = imputer.transform(training_numerical_df)
testing_numerical_data = imputer.transform(testing_numerical_df)

# Wrap the imputed values back into dataframes carrying the original column
# names. The testing dataframe must be built from the *testing* values; it was
# previously built from the training values, which silently replaced the
# testing features with training data.
training_numerical_df = pd.DataFrame(training_numerical_data, columns=numerical_columns)
testing_numerical_df = pd.DataFrame(testing_numerical_data, columns=numerical_columns)

# Guard against the train/test mix-up: imputation must not change the number
# of testing rows
assert len(testing_numerical_df) == testing_row_count, 'testing row count changed during imputation'

training_numerical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   hla_match_c_high     28800 non-null  float64
 1   hla_high_res_8       28800 non-null  float64
 2   hla_low_res_6        28800 non-null  float64
 3   hla_high_res_6       28800 non-null  float64
 4   hla_high_res_10      28800 non-null  float64
 5   hla_match_dqb1_high  28800 non-null  float64
 6   hla_nmdp_6           28800 non-null  float64
 7   hla_match_c_low      28800 non-null  float64
 8   hla_match_drb1_low   28800 non-null  float64
 9   hla_match_dqb1_low   28800 non-null  float64
 10  year_hct             28800 non-null  float64
 11  hla_match_a_high     28800 non-null  float64
 12  donor_age            28800 non-null  float64
 13  hla_match_b_low      28800 non-null  float64
 14  age_at_hct           28800 non-null  float64
 15  hla_match_a_low      28800 non-null 

## 4. One-hot encode categorical features

In [6]:
# One-hot encode the categorical features. The encoder is fitted on the
# training set only and then applied to both sets, so both end up with the
# same encoded columns. drop='first' omits one level per feature.
# NOTE(review): with the default unknown-category handling, a category that
# appears only in the testing set will make transform() raise - confirm this
# is the intended behaviour.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit(training_categorical_df)
feature_names = encoder.get_feature_names_out()

training_categorical_data = encoder.transform(training_categorical_df)
testing_categorical_data = encoder.transform(testing_categorical_df)

training_categorical_df = pd.DataFrame(training_categorical_data, columns=feature_names)
testing_categorical_df = pd.DataFrame(testing_categorical_data, columns=feature_names)

# Recombine numerical and encoded categorical features side by side; indexes
# are reset so rows are aligned by position
training_features_df = pd.concat(
    [
        training_numerical_df.reset_index(drop=True),
        training_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

# The testing features must use the *testing* categorical frame; this
# previously concatenated the training categorical frame by mistake
testing_features_df = pd.concat(
    [
        testing_numerical_df.reset_index(drop=True),
        testing_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

# Both feature sets must expose identical columns in identical order
assert list(training_features_df.columns) == list(testing_features_df.columns)

training_features_df.head()

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,...,donor_related_Related,donor_related_Unrelated,melphalan_dose_Missing,"melphalan_dose_N/A, Mel not given",cardiac_No,cardiac_Not done,cardiac_Yes,pulm_moderate_No,pulm_moderate_Not done,pulm_moderate_Yes
0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2.0,8.0,6.0,6.0,10.0,2.0,5.0,2.0,2.0,2.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## 5. Save results

In [7]:
# Make sure the output directory exists before writing anything
Path('./data/encoded').mkdir(parents=True, exist_ok=True)

# Output file for each dataframe, written in this order without the index
parquet_outputs = {
    './data/encoded/training_features_df.parquet': training_features_df,
    './data/encoded/training_labels.parquet': training_labels_df,
    './data/encoded/testing_features_df.parquet': testing_features_df,
}

for parquet_path, output_df in parquet_outputs.items():
    output_df.to_parquet(parquet_path, index=False)