In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from time import time
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pickle

from sklearn.model_selection import train_test_split


warnings.filterwarnings('ignore')
np.random.seed(42)

In [None]:
data = pd.read_csv('hcare.csv')

In [None]:
print(f'Dataset shape: {data.shape}')

Dataset shape: (318438, 18)


In [None]:
data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [None]:
data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [None]:
data.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

Removing case_id, patient and City_Code_Patient ID as they are irrelevant to the patient's length of stay. Two pairs, "Hospital_region_code" and "City_Code_Hospital" as well as "Hospital_code" and "Hospital_type_code," we found high mutual information, so we will discard Hospital_code and City_Code_Hospital as well.



In [None]:
data.drop(['case_id', 'patientid', 'City_Code_Patient', 'Hospital_code', 'City_Code_Hospital'  ], axis=1, inplace=True)

Removing the missing instance of the Bed Grade feature.


In [None]:
data.dropna(subset = ['Bed Grade'], inplace = True)
data.isnull().sum()

Hospital_type_code                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [None]:
print(f'Duplicates in the dataset: {data.duplicated().sum()}')
print(f'Percentage of duplicates: {data.duplicated().sum()/len(data)*100}%')

Duplicates in the dataset: 428
Percentage of duplicates: 0.13445378151260504%


In [None]:
data = data.drop_duplicates()
print(f'Duplicates in the dataset: {data.duplicated().sum()}')
print(f'Percentage of duplicates: {data.duplicated().sum()/len(data)*100}%')

Duplicates in the dataset: 0
Percentage of duplicates: 0.0%


In [None]:
data['Stay'].replace ('More than 100 Days', '>100', inplace=True)

In [None]:
data['Stay']= data['Stay'].replace({'0-10':0, '11-20':0, '21-30':1, '31-40':1, '41-50':1, '51-60':2,'61-70':2,'71-80':2,'81-90':2,'91-100':2,'>100':2})

In [None]:
for feature in data.columns:
    if data[feature].dtype == object:
        print('\nFeature:',feature)
        print(pd.Categorical(data[feature].unique()))


Feature: Hospital_type_code
['c', 'e', 'b', 'a', 'f', 'd', 'g']
Categories (7, object): ['a', 'b', 'c', 'd', 'e', 'f', 'g']

Feature: Hospital_region_code
['Z', 'X', 'Y']
Categories (3, object): ['X', 'Y', 'Z']

Feature: Department
['radiotherapy', 'anesthesia', 'gynecology', 'TB & Chest disease', 'surgery']
Categories (5, object): ['TB & Chest disease', 'anesthesia', 'gynecology', 'radiotherapy', 'surgery']

Feature: Ward_Type
['R', 'S', 'Q', 'P', 'T', 'U']
Categories (6, object): ['P', 'Q', 'R', 'S', 'T', 'U']

Feature: Ward_Facility_Code
['F', 'E', 'D', 'B', 'A', 'C']
Categories (6, object): ['A', 'B', 'C', 'D', 'E', 'F']

Feature: Type of Admission
['Emergency', 'Trauma', 'Urgent']
Categories (3, object): ['Emergency', 'Trauma', 'Urgent']

Feature: Severity of Illness
['Extreme', 'Moderate', 'Minor']
Categories (3, object): ['Extreme', 'Minor', 'Moderate']

Feature: Age
['51-60', '71-80', '31-40', '41-50', '81-90', '61-70', '21-30', '11-20', '0-10', '91-100']
Categories (10, objec

In [None]:
data.dtypes

Hospital_type_code                    object
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                   int64
dtype: object

In [None]:
data['Bed Grade'] = data['Bed Grade'].astype(object)

In [None]:
cat_col = list(data.select_dtypes(include=['object']).columns)
label_enc_col = ['Bed Grade','Type of Admission','Severity of Illness', 'Age']
one_hot_enc_col = list(set(cat_col) - set(label_enc_col))

In [None]:
label_enc_col = ['Bed Grade','Type of Admission','Severity of Illness', 'Age','Stay']

In [None]:
one_hot_enc_col = list(set(cat_col) - set(label_enc_col))

In [None]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
for col in label_enc_col:
    data[col] = label.fit_transform(data[col])


data = pd.get_dummies(data=data, columns=one_hot_enc_col, dtype=int)


In [None]:
X = data.drop('Stay', axis=1)
Y = data[['Stay']]

In [None]:
(Y['Stay'].value_counts())

Stay
1    154092
0    101588
2     62217
Name: count, dtype: int64

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42,stratify=Y)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']] = sc.fit_transform(X_train[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']])
X_test[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']] = sc.transform(X_test[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']])


In [None]:
values= [RandomForestClassifier(), KNeighborsClassifier(), LogisticRegression(), DecisionTreeClassifier(), GaussianNB(), XGBClassifier()]

keys= ['RandomForestClassifier',
      'KNeighborsClassifier',
       'LogisticRegression',
       'DecisionTreeClassifier',
       'GaussianNB',
      'XGBoost']

models= dict(zip(keys,values))
accuracy_scores=[]
train_times=[]

for key,value in models.items():
    t = time()
    value.fit(X_train,Y_train)
    duration = (time() - t) / 60
    Y_pred= value.predict(X_test)
    accuracy= accuracy_score(Y_test, Y_pred)
    accuracy_scores.append(accuracy)
    train_times.append(duration)
    print(key)
    print(round(accuracy * 100, 2))

RandomForestClassifier
56.09
KNeighborsClassifier
54.84
LogisticRegression
56.85
DecisionTreeClassifier
51.3
GaussianNB
29.97
XGBoost
60.73


In [None]:
from imblearn.over_sampling import SMOTE
smote= SMOTE()
X_sm, Y_sm = smote.fit_resample(X,Y)

In [None]:
Y_sm.value_counts()

Stay
0       154092
1       154092
2       154092
Name: count, dtype: int64

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_sm, Y_sm, test_size = 0.20, random_state = 42,stratify=Y_sm)

In [None]:
values= [RandomForestClassifier(), KNeighborsClassifier(), LogisticRegression(), DecisionTreeClassifier(), GaussianNB(), XGBClassifier()]

keys= ['RandomForestClassifier',
      'KNeighborsClassifier',
       'LogisticRegression',
       'DecisionTreeClassifier',
       'GaussianNB',
      'XGBoost']

models= dict(zip(keys,values))
accuracy_scores=[]
train_times=[]

for key,value in models.items():
    t = time()
    value.fit(X_train,Y_train)
    duration = (time() - t) / 60
    Y_pred= value.predict(X_test)
    accuracy= accuracy_score(Y_test, Y_pred)
    accuracy_scores.append(accuracy)
    train_times.append(duration)
    print(key)
    print(round(accuracy * 100, 2))

RandomForestClassifier
66.25
KNeighborsClassifier
66.29
LogisticRegression
58.87
DecisionTreeClassifier
61.46
GaussianNB
53.36
XGBoost
68.44
