In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from time import time
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pickle

from sklearn.model_selection import train_test_split


warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
data = pd.read_csv('hcare.csv')

In [3]:
print(f'Dataset shape: {data.shape}')

Dataset shape: (318438, 18)


In [4]:
data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [5]:
data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [6]:
data.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

Removing case_id, patient and City_Code_Patient ID as they are irrelevant to the patient's length of stay. Two pairs, "Hospital_region_code" and "City_Code_Hospital" as well as "Hospital_code" and "Hospital_type_code," we found high mutual information, so we will discard Hospital_code and City_Code_Hospital as well.



In [7]:
data.drop(['case_id', 'patientid', 'City_Code_Patient', 'Hospital_code', 'City_Code_Hospital'  ], axis=1, inplace=True)

Removing the missing instance of the Bed Grade feature. 


In [8]:
data.dropna(subset = ['Bed Grade'], inplace = True)
data.isnull().sum()

Hospital_type_code                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [9]:
print(f'Duplicates in the dataset: {data.duplicated().sum()}')
print(f'Percentage of duplicates: {data.duplicated().sum()/len(data)*100}%')

Duplicates in the dataset: 428
Percentage of duplicates: 0.13445378151260504%


In [10]:
data = data.drop_duplicates()
print(f'Duplicates in the dataset: {data.duplicated().sum()}')
print(f'Percentage of duplicates: {data.duplicated().sum()/len(data)*100}%')

Duplicates in the dataset: 0
Percentage of duplicates: 0.0%


In [11]:
data['Stay'].replace ('More than 100 Days', '>100', inplace=True)

In [12]:
for feature in data.columns: 
    if data[feature].dtype == object:
        print('\nFeature:',feature)
        print(pd.Categorical(data[feature].unique()))


Feature: Hospital_type_code
['c', 'e', 'b', 'a', 'f', 'd', 'g']
Categories (7, object): ['a', 'b', 'c', 'd', 'e', 'f', 'g']

Feature: Hospital_region_code
['Z', 'X', 'Y']
Categories (3, object): ['X', 'Y', 'Z']

Feature: Department
['radiotherapy', 'anesthesia', 'gynecology', 'TB & Chest disease', 'surgery']
Categories (5, object): ['TB & Chest disease', 'anesthesia', 'gynecology', 'radiotherapy', 'surgery']

Feature: Ward_Type
['R', 'S', 'Q', 'P', 'T', 'U']
Categories (6, object): ['P', 'Q', 'R', 'S', 'T', 'U']

Feature: Ward_Facility_Code
['F', 'E', 'D', 'B', 'A', 'C']
Categories (6, object): ['A', 'B', 'C', 'D', 'E', 'F']

Feature: Type of Admission
['Emergency', 'Trauma', 'Urgent']
Categories (3, object): ['Emergency', 'Trauma', 'Urgent']

Feature: Severity of Illness
['Extreme', 'Moderate', 'Minor']
Categories (3, object): ['Extreme', 'Minor', 'Moderate']

Feature: Age
['51-60', '71-80', '31-40', '41-50', '81-90', '61-70', '21-30', '11-20', '0-10', '91-100']
Categories (10, objec

In [13]:
data.dtypes

Hospital_type_code                    object
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [14]:
data['Bed Grade'] = data['Bed Grade'].astype(object)

In [15]:
cat_col = list(data.select_dtypes(include=['object']).columns)
label_enc_col = ['Bed Grade','Type of Admission','Severity of Illness', 'Age','Stay']
one_hot_enc_col = list(set(cat_col) - set(label_enc_col))

In [16]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
for col in label_enc_col:
    data[col] = label.fit_transform(data[col])


data = pd.get_dummies(data=data, columns=one_hot_enc_col, dtype=int)


In [17]:
data.reset_index(drop=True, inplace=True)  # reset index


In [18]:
X = data.drop('Stay', axis=1)
Y = data[['Stay']]

In [19]:
(Y['Stay'].value_counts())

2     87268
1     78008
3     55092
5     34974
0     23580
4     11732
7     10242
10     6664
8      4835
9      2763
6      2739
Name: Stay, dtype: int64

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
smote= SMOTE()
X_sm, Y_sm = smote.fit_resample(X,Y)

In [None]:
Y_sm.value_counts()

Stay
0       20493
1       20493
2       20493
3       20493
4       20493
5       20493
6       20493
7       20493
8       20493
9       20493
10      20493
dtype: int64

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_sm, Y_sm, test_size = 0.20, random_state = 42,stratify=Y_sm)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']] = sc.fit_transform(X_train[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']])
X_test[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']] = sc.transform(X_test[['Visitors with Patient','Admission_Deposit', 'Available Extra Rooms in Hospital']])


In [None]:
values= [RandomForestClassifier(), KNeighborsClassifier(), LogisticRegression(), DecisionTreeClassifier(), GaussianNB(), XGBClassifier()]

keys= ['RandomForestClassifier',
      'KNeighborsClassifier', 
       'LogisticRegression', 
       'DecisionTreeClassifier', 
       'GaussianNB',
      'XGBoost']

models= dict(zip(keys,values))
accuracy_scores=[]
train_times=[]

for key,value in models.items():
    t = time()
    value.fit(X_train,Y_train)
    duration = (time() - t) / 60
    Y_pred= value.predict(X_test)
    accuracy= accuracy_score(Y_test, Y_pred)
    accuracy_scores.append(accuracy)
    train_times.append(duration)   
    print(key)
    print(round(accuracy * 100, 2))

RandomForestClassifier
69.28
KNeighborsClassifier
28.4
LogisticRegression
9.09
DecisionTreeClassifier
59.37
GaussianNB
26.46
XGBoost
57.88


In [None]:
#from bayes_opt import BayesianOptimization
!pip install -U bayesian-optimization
from bayes_opt import BayesianOptimization, UtilityFunction
from sklearn.model_selection import cross_val_score
#from bayes_opt.util import UtilityFunction


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
%%time
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization


# Define the search space
def rf_bo(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Convert to integers
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    # Define the model
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=42)
    # Evaluate the model with cross-validation
    cv_scores = cross_val_score(rf, X_train, Y_train, cv=5, scoring='accuracy')
    return cv_scores.mean()

# Define the search space with parameter bounds
pbounds = {'n_estimators': (10, 200),
           'max_depth': (2, 10),
           'min_samples_split': (2, 20),
           'min_samples_leaf': (1, 10)}

# Instantiate the optimizer
rf_bopt = BayesianOptimization(f=rf_bo, pbounds=pbounds, random_state=42)

# Perform the optimization
rf_bopt.maximize(n_iter=50, init_points=10)

# Get the best hyperparameters
best_params = rf_bopt.max['params']

# Train the model with the best hyperparameters
best_rf = RandomForestClassifier(n_estimators=int(best_params['n_estimators']),
                                  max_depth=int(best_params['max_depth']),
                                  min_samples_split=int(best_params['min_samples_split']),
                                  min_samples_leaf=int(best_params['min_samples_leaf']),
                                  random_state=42)
best_rf.fit(X_train, Y_train)

# Predict with the best model
y_pred = best_rf.predict(X_train)


|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.3173   [0m | [0m4.996    [0m | [0m9.556    [0m | [0m15.18    [0m | [0m123.7    [0m |
| [0m2        [0m | [0m0.3038   [0m | [0m3.248    [0m | [0m2.404    [0m | [0m3.046    [0m | [0m174.6    [0m |
| [95m3        [0m | [95m0.359    [0m | [95m6.809    [0m | [95m7.373    [0m | [95m2.371    [0m | [95m194.3    [0m |
| [95m4        [0m | [95m0.3883   [0m | [95m8.66     [0m | [95m2.911    [0m | [95m5.273    [0m | [95m44.85    [0m |
| [0m5        [0m | [0m0.3178   [0m | [0m4.434    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m65.33    [0m |
| [0m6        [0m | [0m0.3566   [0m | [0m6.895    [0m | [0m2.255    [0m | [0m7.259    [0m | [0m79.61    [0m |
| [0m7        [0m | [0m0.3359   [0m | [0m5.649    [0m | [0m8.067    [0m | [0m5.594    [0m | [0m10

In [None]:
best_params['max_depth'] = int(round(best_params['max_depth']))


In [None]:
best_params

{'max_depth': 10,
 'min_samples_leaf': 1.0,
 'min_samples_split': 2.0,
 'n_estimators': 150.3619503529204}

In [None]:
# best_params = optimizer.max['params']
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest classifier with the best hyperparameters
rf_clf = RandomForestClassifier(**best_params)

# Fit the classifier to the entire training set
rf_clf.fit(X_train, Y_train)

# Make predictions on the test set
y_pred = rf_clf.predict(X_test)
print(accuracy_score(Y_test, y_pred))


InvalidParameterError: ignored