## Import packages to be used

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import warnings
warnings.filterwarnings("ignore")

## Import Data

In [2]:
df = pd.read_csv("train_data.csv")

## Preview

In [3]:
df.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


## Data Preprocessing

### Checking for missing values

In [4]:
df.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

### Remove data points with missing values

In [5]:
df1 = df.dropna()

### Check if missing values are still present

In [6]:
df1.isnull().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [7]:
df1.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

### Column selection

In [8]:
cols = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Available Extra Rooms in Hospital', 'Department', 'Ward_Type',
       'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness',
       'Visitors with Patient', 'Age', 'Admission_Deposit']

### Normalization

In [9]:
# I picked out some columns
col_cat = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital','Type of Admission',
          'Department', 'Ward_Type', 'Ward_Facility_Code', 'Age', 'Severity of Illness']

In [10]:
# I putting all the numerical columns inside a list called col_num
col_num = []
for i in cols:
    if i not in col_cat:
        col_num.append(i)

In [44]:
col_num

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [11]:
# I created a blank. data frame to put in the newly normalized values 
df2 = pd.DataFrame()


In [45]:
# Normalization of the categorical columns 
# I created dynamic variables to save each initialization of label encoder so i could get the inverse transform easily
for i in col_cat:
    globals()['label_%s' %i.lower()] = LabelEncoder()
    df2[i] = globals()['label_%s' %i.lower()].fit_transform(df1[i].values)
    

In [43]:
# A preview of the dynamic variables that i created
label_department.classes_

array(['TB & Chest disease', 'anesthesia', 'gynecology', 'radiotherapy',
       'surgery'], dtype=object)

In [14]:
df2

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Type of Admission,Department,Ward_Type,Ward_Facility_Code,Age,Severity of Illness
0,7,2,2,0,3,2,5,5,0
1,1,2,4,1,3,3,5,5,0
2,9,4,0,1,1,3,4,5,0
3,25,1,1,1,3,2,3,5,0
4,25,1,1,1,3,3,3,5,0
...,...,...,...,...,...,...,...,...,...
313788,5,0,5,0,3,1,5,4,2
313789,23,0,0,2,1,1,4,8,2
313790,6,0,3,0,2,2,5,7,1
313791,10,1,1,1,1,1,3,1,1


In [46]:
# I created a blank data frame to save the normalization of numerical columns 
df3 = pd.DataFrame()

In [16]:
# I was getting errors trying to save the output of the normalization of numerical columns 
#so i created a new column inside the blank data frame which i am going to delete after
# normalization
df3['Age'] = df2['Age']

In [17]:
# Normalization of the numerical columns
# I created dynamic variables to save each initialization of label encoder so i could get the inverse transform easily
for i in col_num:
    globals()['std_%s' %i.lower()] = StandardScaler()
    df3[i] = globals()['std_%s' %i.lower()].fit_transform(np.array(df1[i].values).reshape(-1,1))

In [18]:
del df3['Age']

In [19]:
le_target = LabelEncoder()
df3['Stay'] = le_target.fit_transform(df1['Stay'])

### Joining normalized data

In [47]:
# I joined the normalized values of the categorical and the numerical values
df4 = pd.concat([df2,df3], axis=1)

In [21]:
df4

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Type of Admission,Department,Ward_Type,Ward_Facility_Code,Age,Severity of Illness,Available Extra Rooms in Hospital,Visitors with Patient,Admission_Deposit,Stay
0,7,2,2,0,3,2,5,5,0,-0.168178,-0.727035,0.026796,0
1,1,2,4,1,3,3,5,5,0,-1.024400,-0.727035,0.986987,4
2,9,4,0,1,1,3,4,5,0,-1.024400,-0.727035,-0.126025,3
3,25,1,1,1,3,2,3,5,0,-1.024400,-0.727035,2.200344,4
4,25,1,1,1,3,3,3,5,0,-1.024400,-0.727035,0.622427,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
313788,5,0,5,0,3,1,5,4,2,-0.168178,-0.159419,-0.679308,1
313789,23,0,0,2,1,1,4,8,2,-1.024400,0.408197,1.672838,3
313790,6,0,3,0,2,2,5,7,1,-0.168178,-0.159419,-0.595533,1
313791,10,1,1,1,1,1,3,1,1,-0.168178,0.975813,-1.031900,1


## Picking out columns for modelling

In [48]:

X = df4.iloc[:,:-1]
y = df4.iloc[:,-1]

## Decision Tree

### 10 fold cross validation analysis

In [49]:
# Here i did the 10 fold cross validation. The matrix i used were accuracy, F1, precision and recall
cv = KFold(n_splits=10, random_state=1, shuffle=True)
model = DecisionTreeClassifier(criterion = 'entropy', min_samples_leaf=20)
scoring = ['accuracy','f1_macro','precision_macro', 'recall_macro']
scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

In [24]:
scores['test_accuracy']

array([0.37205226, 0.36883365, 0.36510516, 0.36192995, 0.36635967,
       0.36288601, 0.36741133, 0.36476624, 0.3692597 , 0.36620033])

- Conclusion

The accuracy is pretty the same for each trial, so this shows to prove that no matter the section of the data used to train the model, the accuracy will be close. The data is pretty much balanced

### Training and testing with the same set of data points

In [25]:
# Decision Tree
# We are testing and training with the same data set.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X, y)
y_pred = decision_tree.predict(X)
acc_decision_tree = round(decision_tree.score(X, y) * 100, 2)
acc_decision_tree

99.71

In [26]:
print("The accuracy score is: " ,accuracy_score(y, y_pred), "\n")
print("The precision score is: ", precision_score(y, y_pred, pos_label='positive', average='micro'), "\n")
print("The recall score is: ", recall_score(y, y_pred, pos_label='positive', average='micro'), "\n")
print("The f1 score is: ", f1_score(y, y_pred, pos_label='positive', average='micro'))

The accuracy score is:  0.9971159331151428 

The precision score is:  0.9971159331151428 

The recall score is:  0.9971159331151428 

The f1 score is:  0.9971159331151428


- Conclusion

When the set of data used to train the model is been used to test the model, it shows high accuracy. With the results from the 10 fold cross validation analysis, we can draw conclusion that this particular model is overfitting

### Training and testing with different datasets 

In [50]:
# I split the data set into training and testing set
X_train, X_test, y_train, y_test = train_test_split( 
    X.values, y.values, test_size=0.2, random_state=0)

In [51]:
# Decision Tree
# I fitted and tested the model with two different data sets
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree

54.29

In [29]:
print("The accuracy score is: " ,accuracy_score(y_test, y_pred), "\n")
print("The precision score is: ", precision_score(y_test, y_pred, pos_label='positive', average='micro'), "\n")
print("The recall score is: ", recall_score(y_test, y_pred, pos_label='positive', average='micro'), "\n")
print("The f1 score is: ", f1_score(y_test, y_pred, pos_label='positive', average='micro'))

The accuracy score is:  0.28445322583215155 

The precision score is:  0.28445322583215155 

The recall score is:  0.28445322583215155 

The f1 score is:  0.28445322583215155


- Conclusion

From the results above, the accuracy is very low and other metrics so the model is under-performing, parameters need to improve.

### Decision tree Model Parameter analysis

#### Minimal number of instances permissible per leaf.

In [30]:
# We are altering the minimum sampling of leaf using 15,20 and 25 
n_leaf = [15, 20, 25]
for i in n_leaf:
    decision_tree = DecisionTreeClassifier(min_samples_leaf = i)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    print("The accuracy on {} mininmum sample leaf is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 15 mininmum sample leaf is :  35.35 % 

The accuracy on 20 mininmum sample leaf is :  36.35 % 

The accuracy on 25 mininmum sample leaf is :  36.9 % 



- Conclusion

As the minimum sample leaf is increased, the model accuracy increases, the accuracy increase also reduces, so there will be a saturation point where the accuracy will remain the same going forward

#### Splitting criteria

In [31]:
# Here we used guinea and entropy criteria to split the trees 
criterions = ["gini", "entropy"]
for i in criterions:
    decision_tree = DecisionTreeClassifier(criterion = i)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    print("The accuracy on {} criterion is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on gini criterion is :  28.63 % 

The accuracy on entropy criterion is :  28.349999999999998 % 



- Conclusion

The model does better on entropy criterion

#### Max depth of the tree

In [32]:
#
max_depths = [20, 60, 110]
for i in max_depths:
    decision_tree = DecisionTreeClassifier(max_depth = i)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    print("The accuracy on {} maximum depth is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 20 maximum depth is :  34.42 % 

The accuracy on 60 maximum depth is :  28.48 % 

The accuracy on 110 maximum depth is :  28.389999999999997 % 



-  Conclusion

The accuracy reduces as the Tree depth is increased

#### Training and testing set variation

In [33]:
test_sizes = [0.1, 0.4, 0.7]
for i in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=i, random_state=0)
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
    print("The accuracy on {} % training set is : ".format((1-i) * 100),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 90.0 % training set is :  28.49 % 

The accuracy on 60.0 % training set is :  28.34 % 

The accuracy on 30.000000000000004 % training set is :  28.07 % 



- Conclusion

The model accuracy reduces as the training set reduces which is common among discriminative models unlike generative models that do well with less data

## Random Forest

### Training and testing with the same set of data points

In [34]:
# Decision Tree
random_forest = RandomForestClassifier()
random_forest.fit(X, y)
y_pred = random_forest.predict(X)
acc_decision_tree = round(random_forest.score(X, y) * 100, 2)
acc_decision_tree

99.71

In [35]:
print("The accuracy score is: " ,accuracy_score(y, y_pred), "\n")
print("The precision score is: ", precision_score(y, y_pred, pos_label='positive', average='micro'), "\n")
print("The recall score is: ", recall_score(y, y_pred, pos_label='positive', average='micro'), "\n")
print("The f1 score is: ", f1_score(y, y_pred, pos_label='positive', average='micro'))

The accuracy score is:  0.9971031858581931 

The precision score is:  0.9971031858581931 

The recall score is:  0.9971031858581931 

The f1 score is:  0.9971031858581931


In [36]:
X_train, X_test, y_train, y_test = train_test_split( 
    X.values, y.values, test_size=0.2, random_state=0)

In [37]:
# Decision Tree
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

33.48

In [38]:
print("The accuracy score is: " ,accuracy_score(y_test, y_pred), "\n")
print("The precision score is: ", precision_score(y_test, y_pred, pos_label='positive', average='micro'), "\n")
print("The recall score is: ", recall_score(y_test, y_pred, pos_label='positive', average='micro'), "\n")
print("The f1 score is: ", f1_score(y_test, y_pred, pos_label='positive', average='micro'))

The accuracy score is:  0.3347727019232301 

The precision score is:  0.3347727019232301 

The recall score is:  0.3347727019232301 

The f1 score is:  0.3347727019232301


In [39]:
n_leaf = [15, 20, 25]
for i in n_leaf:
    random_forest = RandomForestClassifier(min_samples_leaf = i)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    print("The accuracy on {} mininmum smaple leaf is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 15 mininmum smaple leaf is :  40.300000000000004 % 

The accuracy on 20 mininmum smaple leaf is :  40.339999999999996 % 

The accuracy on 25 mininmum smaple leaf is :  40.400000000000006 % 



In [40]:
criterions = ["gini", "entropy"]
for i in criterions:
    random_forest = RandomForestClassifier(criterion = i)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    print("The accuracy on {} criterion is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on gini criterion is :  33.45 % 

The accuracy on entropy criterion is :  33.339999999999996 % 



In [41]:
max_depths = [20, 60, 110]
for i in max_depths:
    random_forest = RandomForestClassifier(max_depth = i)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    print("The accuracy on {} maximum depth is : ".format(i),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 20 maximum depth is :  38.6 % 

The accuracy on 60 maximum depth is :  33.37 % 

The accuracy on 110 maximum depth is :  33.19 % 



In [42]:
test_sizes = [0.1, 0.4, 0.7]
for i in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=i, random_state=0)
    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    print("The accuracy on {} training set is : ".format((1-i) * 100),round(accuracy_score(y_test, y_pred), 4) * 100, "% \n")

The accuracy on 90.0 training set is :  33.31 % 

The accuracy on 60.0 training set is :  33.36 % 

The accuracy on 30.000000000000004 training set is :  33.72 % 

