In [1]:
from preprocessing import apply_feature_importance

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../dataset/patient_stay_data.csv')
df.head()

Unnamed: 0,subject_id,hadm_id,admission_type,admission_location,ethnicity,hospital_expire_flag,total_los,gender,expire_flag,age_at_admission,...,Diseases Of The Genitourinary System,Complications Of Pregnancy Childbirth And The Puerperium,Diseases Of The Skin And Subcutaneous Tissue,Diseases Of The Musculoskeletal System And Connective Tissue,Congenital Anomalies,Certain Conditions Originating In The Perinatal Period,Symptoms Signs And Ill-Defined Conditions,Injury And Poisoning,Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services,Supplementary Classification Of External Causes Of Injury And Poisoning
0,22,165315,EMERGENCY,EMERGENCY ROOM ADMIT,WHITE,0,1.14,F,0,65,...,0,0,0,0,0,0,0,1,0,1
1,23,152223,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,0,5.5,M,0,71,...,1,0,0,0,0,0,0,0,1,0
2,23,124321,EMERGENCY,TRANSFER,WHITE,0,6.77,M,0,75,...,0,0,0,0,0,0,1,0,1,0
3,24,161859,EMERGENCY,TRANSFER,WHITE,0,2.86,M,0,39,...,0,0,0,0,0,0,0,0,0,0
4,25,129635,EMERGENCY,EMERGENCY ROOM ADMIT,WHITE,0,3.53,M,0,59,...,0,0,0,0,0,0,0,0,0,0


### Encode Categorical Features & Clean Numerical Features

In [4]:
# Encoding Categorical Features
binary_features = ['gender']
nominal_features = ['admission_type', 'admission_location', 'ethnicity', 'first_careunit']
numerical_features = ['total_los', 'icu_los']

df = apply_feature_importance.encode_features(df, binary_features=binary_features, nominal_features=nominal_features)

# Fill missing values in numerical features
for feature in numerical_features:
    df[feature].fillna(0, inplace=True)

# Drop irrelevant features
df.drop(columns=['hadm_id', 'subject_id', 'icustay_id', 'expire_flag'], inplace=True)

df.head()

Unnamed: 0,hospital_expire_flag,total_los,gender,age_at_admission,icu_los,Infectious And Parasitic Diseases,Neoplasms,Endocrine Nutritional And Metabolic Diseases And Immunity Disorders,Diseases Of The Blood And Blood-Forming Organs,Mental Disorders,...,ethnicity_NATIVE AMERICAN,ethnicity_OTHER/UNKNOWN,ethnicity_WHITE,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_NICU,first_careunit_SICU,first_careunit_TSICU,first_careunit_nan
0,0,1.14,0,65,1.1438,0,0,0,0,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,5.5,1,71,1.2641,0,0,1,0,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,6.77,1,75,1.1862,0,1,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,2.86,1,39,0.5124,0,0,1,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3.53,1,59,3.5466,0,0,1,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Importance - RandomForestClassifier

In [None]:
np.set_printoptions(precision=3)

In [5]:
# Applying Feature Importance using Random Forest
rf_importance, rf_accuracies = apply_feature_importance.get_feature_importance(df=df, classifier='rf')

for feature, importance in rf_importance:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: total_los, Importance: 0.21740557354742834
Feature: icu_los, Importance: 0.15527937574947354
Feature: age_at_admission, Importance: 0.11722867865153333
Feature: Diseases Of The Respiratory System, Importance: 0.02543474702415862
Feature: gender, Importance: 0.02324890669832837
Feature: Infectious And Parasitic Diseases, Importance: 0.023241191720940628
Feature: Supplementary Classification Of Factors Influencing Health Status And Contact With Health Services, Importance: 0.021832181087808773
Feature: Diseases Of The Genitourinary System, Importance: 0.021618050240802097
Feature: Diseases Of The Nervous System And Sense Organs, Importance: 0.021132206583591336
Feature: Diseases Of The Digestive System, Importance: 0.02108268413499686
Feature: Diseases Of The Blood And Blood-Forming Organs, Importance: 0.02107288105088117
Feature: Neoplasms, Importance: 0.020286632207728077
Feature: Symptoms Signs And Ill-Defined Conditions, Importance: 0.020199037853928955
Feature: Endocrine Nu

In [6]:
for acc_type, value in rf_accuracies.items():
    print(f"{acc_type}: {value}")

Accuracy: 0.9129510548971674
Precision: 0.8581818181818182
Recall: 0.23228346456692914
F1 Score: 0.36560805577072036
AUC-ROC: 0.613818229155672


### Feature Importance - XBoost Classifier

In [7]:
xgb_importance, xgb_accuracies = apply_feature_importance.get_feature_importance(df=df, classifier='xgb')

for feature, importance in xgb_importance:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: admission_type_EMERGENCY, Importance: 0.20254257321357727
Feature: Diseases Of The Respiratory System, Importance: 0.11470794677734375
Feature: Infectious And Parasitic Diseases, Importance: 0.0537729486823082
Feature: Neoplasms, Importance: 0.03196145221590996
Feature: Diseases Of The Genitourinary System, Importance: 0.031357865780591965
Feature: total_los, Importance: 0.028654171153903008
Feature: first_careunit_CSRU, Importance: 0.02651909552514553
Feature: Diseases Of The Circulatory System, Importance: 0.02471020258963108
Feature: icu_los, Importance: 0.022448962554335594
Feature: ethnicity_OTHER/UNKNOWN, Importance: 0.022287027910351753
Feature: admission_location_PHYS REFERRAL/NORMAL DELI, Importance: 0.022205017507076263
Feature: Injury And Poisoning, Importance: 0.020774589851498604
Feature: Symptoms Signs And Ill-Defined Conditions, Importance: 0.020505912601947784
Feature: Mental Disorders, Importance: 0.019846592098474503
Feature: Supplementary Classification Of E

In [8]:
for acc_type, value in xgb_accuracies.items():
    print(f"{acc_type}: {value}")

Accuracy: 0.9208694265823457
Precision: 0.7448151487826871
Recall: 0.406496062992126
F1 Score: 0.5259471505889844
AUC-ROC: 0.694817885532405


### Feature Importance - SelectKBest

In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['hospital_expire_flag'])
y = df['hospital_expire_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
#
# kbest = SelectKBest(score_func=chi2, k=10)
#
# fit = kbest.fit(X, y)
#
# # Get the scores associated with each feature
# np.set_printoptions(precision=3)
# print(fit.scores_)
#
# X_new = fit.transform(X)
#
# mask = kbest.get_support()
# new_features = [] # The list of the K best features
#
# for bool, feature in zip(mask, X.columns):
#     if bool:
#         new_features.append(feature)
# print(new_features)

ValueError: Input X must be non-negative.