In [56]:
# Import basic libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # Ignore warning messages

# Import preprocessing and modeling tools
from sklearn.preprocessing import LabelEncoder  # For encoding categorical labels

# Import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # XGBoost classifier

# Import splitting and validation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold  # Group-wise cross-validation

# Import evaluation metric
from sklearn.metrics import roc_auc_score  # AUC score for model evaluation


In [58]:
path = "/content/drive/MyDrive/Datasets/training_v2.csv"
df = pd.read_csv(path)

# **Analyzing the Data**

In [23]:
df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,gender,height,icu_id,...,apache_3j_bodysystem_Trauma,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,66154,25312,118,0,68.0,22.73,0,1,180.3,92,...,False,False,False,False,False,False,False,False,False,False
1,114252,59342,81,0,77.0,27.42,0,0,160.0,90,...,False,False,False,False,False,False,True,False,False,False
2,119783,50777,118,0,25.0,31.95,0,0,172.7,93,...,False,False,False,True,False,False,False,False,False,False
3,79267,46918,118,0,81.0,22.64,1,0,165.1,92,...,False,False,False,False,False,False,False,False,False,False
4,92056,34377,33,0,19.0,29.185818,0,1,188.0,91,...,True,False,False,False,False,False,False,True,False,False


In [24]:
df.columns

Index(['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age',
       'bmi', 'elective_surgery', 'gender', 'height', 'icu_id',
       ...
       'apache_3j_bodysystem_Trauma', 'apache_2_bodysystem_Gastrointestinal',
       'apache_2_bodysystem_Haematologic', 'apache_2_bodysystem_Metabolic',
       'apache_2_bodysystem_Neurologic',
       'apache_2_bodysystem_Renal/Genitourinary',
       'apache_2_bodysystem_Respiratory', 'apache_2_bodysystem_Trauma',
       'apache_2_bodysystem_Undefined Diagnoses',
       'apache_2_bodysystem_Undefined diagnoses'],
      dtype='object', length=230)

In [25]:
# Display top 178 columns with highest missing values
df.isnull().sum().sort_values(ascending=False).head(178)

Unnamed: 0,0
encounter_id,0
patient_id,0
hospital_id,0
hospital_death,0
age,0
...,...
diabetes_mellitus,0
hepatic_failure,0
immunosuppression,0
leukemia,0


In [26]:
df_p = df[df["patient_id"] == 34377	]
df_p

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,gender,height,icu_id,...,apache_3j_bodysystem_Trauma,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
4,92056,34377,33,0,19.0,29.185818,0,1,188.0,91,...,True,False,False,False,False,False,False,True,False,False


In [27]:
df_unique = df['encounter_id'].unique()
df_unique.shape

(91713,)

In [28]:
df.shape

(91713, 230)

In [29]:
# Function to return table of total and percent missing values in each column
def draw_missing_data_table(df):
    total = df.isnull().sum().sort_values(ascending=False)  # total missing
    percent = (df.isnull().sum() * 100 / df.isnull().count()).sort_values(ascending=False)  # percent missing
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])  # combine into one table
    return missing_data


In [30]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
encounter_id,0,0.0
patient_id,0,0.0
hospital_id,0,0.0
hospital_death,0,0.0
age,0,0.0
...,...,...
apache_2_bodysystem_Renal/Genitourinary,0,0.0
apache_2_bodysystem_Respiratory,0,0.0
apache_2_bodysystem_Trauma,0,0.0
apache_2_bodysystem_Undefined Diagnoses,0,0.0


From analysis above the Data seems to have a lot of null values that have to be dealt with.

# ***Trying out a simple and basic approach***

Replace all the missing values with mean and mode depening on the variables being numerical or categorical respectively


In [19]:
# Fill missing values in numeric columns with their respective column means
numeric_cols = df.select_dtypes(include=np.number).columns
numeric_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(numeric_means)

In [20]:
# Fill missing values in categorical columns with their mode (most frequent value)
category_cols = df.select_dtypes(include=['object']).columns
for col in category_cols:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)

In [71]:
# Encode categorical variables: label encoding for binary columns and one-hot encoding for multi-category columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
exclude_cols = ['encounter_id', 'patient_id', 'hospital_id', 'hospital_death']
categorical_cols = [col for col in categorical_cols if col not in exclude_cols]

binary_cols = [col for col in categorical_cols if df[col].nunique() == 2]
multi_cols = [col for col in categorical_cols if df[col].nunique() > 2]

label_encoders = {}
for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

if multi_cols:
    df = pd.get_dummies(df, columns=multi_cols, drop_first=True)

X = df.drop(columns=['encounter_id', 'patient_id', 'hospital_id', 'hospital_death'])
y = df['hospital_death']


In [72]:
X = df.drop(columns=['patient_id', 'hospital_death'])
y = df['hospital_death']

In [73]:
# For implementing DecisionTreeClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42, max_depth = None)
dt_classifier.fit(X_train, y_train)


In [74]:
# For ROC AUC Score
y_pred_proba = dt_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")


ROC AUC Score: 0.6666801268264237


Conclusion: The revised dataframe doesn't perform great. Partial reason could be that a lot of rows were empty as collected data from hospitals depend only on what disease is being diagnosed where only relevant info to that disease is collected. Therefore most rows being empty

# ***Improved method to tackle missing values***

In [75]:
# Remove columns with more than 10% missing values
missing_percent = df.isnull().sum() / df.isnull().count()
df_new = df[missing_percent[missing_percent < 0.10].index]
df_new

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,gender,height,icu_id,...,apache_3j_bodysystem_Trauma,apache_2_bodysystem_Gastrointestinal,apache_2_bodysystem_Haematologic,apache_2_bodysystem_Metabolic,apache_2_bodysystem_Neurologic,apache_2_bodysystem_Renal/Genitourinary,apache_2_bodysystem_Respiratory,apache_2_bodysystem_Trauma,apache_2_bodysystem_Undefined Diagnoses,apache_2_bodysystem_Undefined diagnoses
0,66154,25312,118,0,68.0,22.730000,0,1,180.3,92,...,False,False,False,False,False,False,False,False,False,False
1,114252,59342,81,0,77.0,27.420000,0,0,160.0,90,...,False,False,False,False,False,False,True,False,False,False
2,119783,50777,118,0,25.0,31.950000,0,0,172.7,93,...,False,False,False,True,False,False,False,False,False,False
3,79267,46918,118,0,81.0,22.640000,1,0,165.1,92,...,False,False,False,False,False,False,False,False,False,False
4,92056,34377,33,0,19.0,,0,1,188.0,91,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,91592,78108,30,0,75.0,23.060250,0,1,177.8,927,...,False,False,False,False,False,False,False,False,False,False
91709,66119,13486,121,0,56.0,47.179671,0,0,183.0,925,...,False,False,False,False,False,False,False,False,False,False
91710,8981,58179,195,0,48.0,27.236914,0,1,170.2,908,...,False,False,False,True,False,False,False,False,False,False
91711,33776,120598,66,0,,23.297481,0,0,154.9,922,...,False,False,False,False,False,False,True,False,False,False


In [76]:
draw_missing_data_table(df_new).head(75)

Unnamed: 0,Total,Percent
h1_mbp_noninvasive_max,9084,9.904812
h1_mbp_noninvasive_min,9084,9.904812
apache_4a_icu_death_prob,7947,8.665075
apache_4a_hospital_death_prob,7947,8.665075
h1_diasbp_noninvasive_min,7350,8.014131
...,...,...
apache_post_operative,0,0.000000
icu_id,0,0.000000
elective_surgery,0,0.000000
gender,0,0.000000


In [77]:
df_new.dtypes

Unnamed: 0,0
encounter_id,int64
patient_id,int64
hospital_id,int64
hospital_death,int64
age,float64
...,...
apache_2_bodysystem_Renal/Genitourinary,bool
apache_2_bodysystem_Respiratory,bool
apache_2_bodysystem_Trauma,bool
apache_2_bodysystem_Undefined Diagnoses,bool


In [78]:
# Fill missing values in numeric columns of df_new with their respective column means
numeric_cols = df_new.select_dtypes(include=np.number).columns
numeric_means = df_new[numeric_cols].mean()
df_new[numeric_cols] = df_new[numeric_cols].fillna(numeric_means)

In [79]:
draw_missing_data_table(df_new).head(10)

Unnamed: 0,Total,Percent
encounter_id,0,0.0
patient_id,0,0.0
hospital_id,0,0.0
hospital_death,0,0.0
age,0,0.0
bmi,0,0.0
elective_surgery,0,0.0
gender,0,0.0
height,0,0.0
icu_id,0,0.0


In [80]:
# Fill missing values in categorical columns of df_new with their mode
category_cols = df_new.select_dtypes(include=['object']).columns
for col in category_cols:
    if df_new[col].isnull().any():
        mode_value = df_new[col].mode()[0]
        df_new[col] = df_new[col].fillna(mode_value)

In [81]:
draw_missing_data_table(df_new).head(75)

Unnamed: 0,Total,Percent
encounter_id,0,0.0
patient_id,0,0.0
hospital_id,0,0.0
hospital_death,0,0.0
age,0,0.0
...,...,...
cirrhosis,0,0.0
diabetes_mellitus,0,0.0
hepatic_failure,0,0.0
immunosuppression,0,0.0


In [82]:
# Encode categorical variables in df_new: label encode binary columns and one-hot encode multi-category columns
categorical_cols = df_new.select_dtypes(include=['object']).columns.tolist()
exclude_cols = ['encounter_id', 'patient_id', 'hospital_id', 'hospital_death']
categorical_cols = [col for col in categorical_cols if col not in exclude_cols]

binary_cols = [col for col in categorical_cols if df_new[col].nunique() == 2]
multi_cols = [col for col in categorical_cols if df_new[col].nunique() > 2]

label_encoders = {}
for col in binary_cols:
    le = LabelEncoder()
    df_new[col] = le.fit_transform(df_new[col])
    label_encoders[col] = le

if multi_cols:
    df_new = pd.get_dummies(df_new, columns=multi_cols, drop_first=True)

X = df_new.drop(columns=['encounter_id', 'patient_id', 'hospital_id', 'hospital_death'])
y = df_new['hospital_death']


In [83]:
X = df_new.drop(columns=['patient_id', 'hospital_death'])
y = df_new['hospital_death']

***Implementing using decision trees***

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42, max_depth = 18)
dt_classifier.fit(X_train, y_train)

In [94]:
tree_depth = dt_classifier.get_depth()
print(f"Tree Depth: {tree_depth}")

Tree Depth: 18


In [95]:
# For ROC AUC Score
from sklearn.metrics import roc_auc_score
y_pred_proba = dt_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")


ROC AUC Score: 0.6599377091458736


**Conclusion**: It did pretty similar. So there must be some minor detail that must be missed out

# ***Class Imbalance***

Recognizing the fact that dataset is imbalanced

In [59]:
print('Alive', round(df['hospital_death'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Death Confirmed', round(df['hospital_death'].value_counts()[1]/len(df) * 100,2), '% of the dataset')

Alive 91.37 % of the dataset
Death Confirmed 8.63 % of the dataset


In [96]:
X = df_new.drop(columns=['encounter_id', 'patient_id', 'hospital_id', 'hospital_death'], errors='ignore')
y = df_new['hospital_death']

# Keep encounter_id for GroupKFold grouping
X_with_groups = df_new.drop(columns=['patient_id', 'hospital_id', 'hospital_death'], errors='ignore')
y_with_groups = df_new['hospital_death']


In [97]:
# Initial train-test split (stratified)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_with_groups,
    y_with_groups,
    test_size=0.25,
    stratify=y_with_groups,
    random_state=42
)

train_groups = X_train_orig['encounter_id']

X_train_orig = X_train_orig.drop(columns=['encounter_id'])
X_test_orig = X_test_orig.drop(columns=['encounter_id'])


In [99]:
# Function to drop highly correlated features above the given threshold
def correlation_filter(X, threshold=0.9):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return X.drop(columns=to_drop), to_drop

In [100]:
# Ratio of negative to positive class
neg, pos = np.bincount(y_train_orig)
scale_pos_weight = neg / pos
print(f"Scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

Scale_pos_weight for XGBoost: 10.59


In [101]:
#Cross-Validation with XGBoost
gkf = GroupKFold(n_splits=5)
auc_scores = []
for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train_orig, y_train_orig, train_groups)):
    print(f"\n Fold {fold + 1}/5")

    # Split data for this fold
    X_train_fold, X_val_fold = X_train_orig.iloc[train_idx], X_train_orig.iloc[val_idx]
    y_train_fold, y_val_fold = y_train_orig.iloc[train_idx], y_train_orig.iloc[val_idx]

    # Correlation filter (fit only on train fold)
    X_train_fold_filtered, dropped_cols = correlation_filter(X_train_fold, threshold=0.9)
    X_val_fold_filtered = X_val_fold.drop(columns=dropped_cols, errors='ignore')

    # Define XGBoost model
    xgb_model = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        n_jobs=-1
    )

    # Train
    xgb_model.fit(X_train_fold_filtered, y_train_fold)

    # Predict
    y_pred_proba = xgb_model.predict_proba(X_val_fold_filtered)[:, 1]

    # AUC
    auc = roc_auc_score(y_val_fold, y_pred_proba)
    auc_scores.append(auc)

    print(f" Fold {fold + 1} AUC: {auc:.4f}")

# Cross-Validation Results

mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
print(f"\n Mean AUC across folds: {mean_auc:.4f} ± {std_auc:.4f}")

# Train Final Model
# Correlation filter on full training set
X_train_filtered, dropped_cols_final = correlation_filter(X_train_orig, threshold=0.9)
X_test_filtered = X_test_orig.drop(columns=dropped_cols_final, errors='ignore')

# Final XGBoost model
final_xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=-1
)

final_xgb_model.fit(X_train_filtered, y_train_orig)

# Evaluation
# Predict
y_train_pred = final_xgb_model.predict(X_train_filtered)
y_test_pred = final_xgb_model.predict(X_test_filtered)

# Probabilities (for AUC)
y_train_proba = final_xgb_model.predict_proba(X_train_filtered)[:, 1]
y_test_proba = final_xgb_model.predict_proba(X_test_filtered)[:, 1]


print('AUC Score: {:.2f}'.format(roc_auc_score(y_test_orig, y_test_proba)))


 Fold 1/5
 Fold 1 AUC: 0.8969

 Fold 2/5
 Fold 2 AUC: 0.8830

 Fold 3/5
 Fold 3 AUC: 0.8836

 Fold 4/5
 Fold 4 AUC: 0.8879

 Fold 5/5
 Fold 5 AUC: 0.8887

 Mean AUC across folds: 0.8880 ± 0.0050
AUC Score: 0.89
