# HSCT survival data feature engineering

## Notebook set-up

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## 1. Load data

In [2]:
# Read the data dictionary, then the raw training and testing sets; the 'ID'
# column is discarded from both datasets as soon as they are read
dictionary_df=pd.read_csv('./data/raw/data_dictionary.csv')
training_df=pd.read_csv('./data/raw/train.csv').drop(columns='ID')
testing_df=pd.read_csv('./data/raw/test.csv').drop(columns='ID')

# Pull the 'efs' label column out of the training dataframe: pop() returns the
# column as a series and removes it from training_df in one step
training_labels=training_df.pop('efs')

## 2. Split categorical and numerical features

In [3]:
# Columns that hold labels rather than features; they must never be selected
label_column_names=['efs','efs_time']

# Flag the data dictionary rows that describe a feature (i.e. not a label)
variable_names=dictionary_df['variable']
is_feature=~variable_names.isin(label_column_names)

# Feature names by type, as declared in the data dictionary
categorical_feature_names=variable_names[(dictionary_df['type'] == 'Categorical') & is_feature]
numerical_feature_names=variable_names[(dictionary_df['type'] == 'Numerical') & is_feature]

# Categorical subsets of the training and testing dataframes
training_categorical_df=training_df[categorical_feature_names]
testing_categorical_df=testing_df[categorical_feature_names]

# Numerical subsets of the training and testing dataframes
training_numerical_df=training_df[numerical_feature_names]
testing_numerical_df=testing_df[numerical_feature_names]

## 3. One-hot encode categorical features

In [4]:
# Fit the one-hot encoder on the training categorical features only, so that
# the testing data is encoded with the same categories and column layout.
# NOTE: this cell rebinds training_categorical_df/testing_categorical_df to the
# encoded dataframes, so re-run the split cell above before re-running this one.
encoder=OneHotEncoder(drop='first',sparse_output=False)
encoder.fit(training_categorical_df)
feature_names=encoder.get_feature_names_out()

# Encode both datasets with the fitted encoder
training_categorical_data=encoder.transform(training_categorical_df)
testing_categorical_data=encoder.transform(testing_categorical_df)

# Wrap the encoded arrays in dataframes labelled with the encoder's column names
training_categorical_df=pd.DataFrame(training_categorical_data, columns=feature_names)
testing_categorical_df=pd.DataFrame(testing_categorical_data, columns=feature_names)

# Recombine numerical and encoded categorical features column-wise; indexes are
# reset on both sides so that rows are aligned by position
training_features_df=pd.concat(
    [
        training_numerical_df.reset_index(drop=True),
        training_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

# The testing features must be built from the *testing* encoded categoricals
# (previously the training categoricals were concatenated here by mistake)
testing_features_df=pd.concat(
    [
        testing_numerical_df.reset_index(drop=True),
        testing_categorical_df.reset_index(drop=True)
    ],
    axis=1
)

# Sanity check: recombining features must not add or drop rows
assert len(training_features_df) == len(training_numerical_df), 'training row count changed'
assert len(testing_features_df) == len(testing_numerical_df), 'testing row count changed'

training_features_df.head()

Unnamed: 0,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,hla_match_dqb1_low,...,donor_related_Unrelated,donor_related_nan,"melphalan_dose_N/A, Mel not given",melphalan_dose_nan,cardiac_Not done,cardiac_Yes,cardiac_nan,pulm_moderate_Not done,pulm_moderate_Yes,pulm_moderate_nan
0,,,6.0,6.0,,2.0,6.0,2.0,2.0,2.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,2.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,8.0,6.0,6.0,10.0,2.0,5.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Save results

In [5]:
# Make sure the output directory exists before anything is written to it
encoded_data_directory=Path('./data/encoded')
encoded_data_directory.mkdir(parents=True, exist_ok=True)

# Feature dataframes are written as parquet files
parquet_outputs={
    './data/encoded/training_features_df.parquet': training_features_df,
    './data/encoded/testing_features_df.parquet': testing_features_df,
}

for output_file, features_df in parquet_outputs.items():
    features_df.to_parquet(output_file)

# Training labels are written as a csv file
training_labels.to_csv('./data/encoded/training_labels.csv')