In [1]:
#  Index  #

#  Unit 1: Import data
#  Unit 2: Clean the data for algorithm test
#  Unit 3: Correlation of objective category
#          -- Correlation of "mobility after 30 days" with other categories
#          -- Correlation of "Alive after 30 days with other categories"
#  Unit 4: Algorithm test
#          -- Predict "mobility after 30 days" by using 6 different classification algorithms
#          -- Predict "mobility after 30 days" by using 4 different regression algorithms
#          -- Predict "Alive after 30 days"    by using 4 different classification algorithms
#  Unit 5: Tuning to better results
#          -- To find the effect of the input categories for GradientBoostingClassifier algorithm 
#          -- To find the effect of the input categories for HistGradientBoostingClassifier algorithm
#          -- To see how dynamic is Randomforestclassifier algorithm in many iterations 

In [2]:
                                       ###        Unit 1: Import data       ####

In [3]:
# Import data from github #
import os
import tarfile
import pandas as pd
from six.moves import urllib
# Base URL of the GitHub repository hosting the CSV files (note the trailing "/").
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/fozhong/bigdata/master/"
# URLs always use "/" as separator, so build them by plain concatenation instead of
# os.path.join, which follows the path rules of the local operating system.
HOSPITAL_PATH = DOWNLOAD_ROOT + "hospital7.csv"       # Number of patients: 1159
#HOSPITAL_PATH = DOWNLOAD_ROOT + "hospital6.csv"      # Number of patients: 639


def load_hospital_data(hospital_path=HOSPITAL_PATH):
    """Read the hospital CSV file into a pandas DataFrame.

    Parameters
    ----------
    hospital_path : str or file-like, optional
        Anything ``pandas.read_csv`` accepts (URL, local path, open buffer).
        Defaults to ``HOSPITAL_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(hospital_path)
# Fetch the default dataset (HOSPITAL_PATH) and keep it as the raw, unscaled frame.
hospital_data=load_hospital_data()

In [4]:
                                       ###        Unit 2: Clean data  for algorithm test     ####

In [5]:
#Show the input data information

#hospital_data.head()
#hospital_data.describe()
#hospital_data.info()

In [6]:
###  Use MinMaxScaler to rescale the first 28 columns to the range [0, 1]  ###
from sklearn.preprocessing import MinMaxScaler        
#from sklearn.preprocessing import StandardScaler   # alternative scaler, kept for comparison
hospital_mean = MinMaxScaler()
#hospital_mean = StandardScaler()        # alternative scaler, kept for comparison
# NOTE(review): the scaler is fitted on every row before the train/test split that
# follows, so test rows influence the scaling of the training features -- consider
# fitting on the training rows only.
# Columns 0-27 are scaled; the remaining columns are carried over unchanged.
# The original index and column labels are passed explicitly so that the concat
# below lines rows up correctly even if hospital_data's index is not 0..n-1.
hospital2 = pd.DataFrame(
    hospital_mean.fit_transform(hospital_data.iloc[:, 0:28]),
    columns=hospital_data.columns[0:28],
    index=hospital_data.index,
)
hospital = pd.concat([hospital2, hospital_data.iloc[:, 28:]], axis=1)

In [7]:
# Show every column of the combined frame together with its non-null count and dtype.
hospital.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159 entries, 0 to 1158
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender                      1159 non-null   float64
 1   Age                         1159 non-null   float64
 2   Hospital                    1159 non-null   float64
 3   Residence pre-fracture      1159 non-null   float64
 4   Mobility pre-fracture       1159 non-null   float64
 5   Pfeiffer                    1159 non-null   float64
 6   ASA                         1159 non-null   float64
 7   Fracture side               1159 non-null   float64
 8   Fracture type               1159 non-null   float64
 9   Treatment1 pre-fracture     1159 non-null   float64
 10  Treatment4 pre-fracture     1159 non-null   float64
 11  Treatment5 pre-fracture     1159 non-null   float64
 12  Mortality after surgery     1159 non-null   float64
 13  Surgery type                1159 

In [8]:
### Split the data into a stratified train set (85% of patients) and test set (15%)
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
# Stratify on the mobility label. n_splits=1, so the generator yields exactly one
# (train, test) pair and next() is enough -- no loop needed.
train_index, test_index = next(split.split(hospital, hospital["Mobility after 30 days"]))
# split() returns *positional* row numbers, so select with .iloc. (.loc only gave the
# same rows because the index happened to be 0..n-1.)
strat_train_set = hospital.iloc[train_index]
strat_test_set = hospital.iloc[test_index]

In [9]:
### Objective 1: predict "Mobility after 30 days".
### Separate that column (the label) from the other columns in both the train and test set.
hospital_labels = strat_train_set["Mobility after 30 days"].copy()
hospital1 = strat_train_set.drop(columns=["Mobility after 30 days"])

hospital_labels_test = strat_test_set["Mobility after 30 days"].copy()
hospital_test = strat_test_set.drop(columns=["Mobility after 30 days"])

In [10]:
### Objective 2: predict "Alive after 30 days".
### Separate that column (the label) from the other columns in both the train and test set.
# NOTE(review): this rebinds `hospital2`, which the scaling cell also used as a
# temporary name for the scaled frame.
hospital_labels2 = strat_train_set["Alive after 30 days"].copy()
hospital2 = strat_train_set.drop(columns=["Alive after 30 days"])

hospital_labels2_test = strat_test_set["Alive after 30 days"].copy()
hospital2_test = strat_test_set.drop(columns=["Alive after 30 days"])

In [11]:
###  Objective 1: remove the remaining columns that cannot be used as input data,
###  from both the train set and the test set.
hospital_predict1 = hospital1.drop(
    columns=[
        "Readmission after 30 days",
        "Resurgery within 30 days",
        "Alive after 30 days",
        "Residence after 30 days",
    ]
)
hospital_predict1_test = hospital_test.drop(
    columns=[
        "Readmission after 30 days",
        "Resurgery within 30 days",
        "Alive after 30 days",
        "Residence after 30 days",
    ]
)

In [12]:
###  Objective 2: remove the remaining columns that cannot be used as input data,
###  from both the train set and the test set.
hospital_predict2 = hospital2.drop(
    columns=[
        "Readmission after 30 days",
        "Resurgery within 30 days",
        "Mobility after 30 days",
        "Residence after 30 days",
    ]
)
hospital_predict2_test = hospital2_test.drop(
    columns=[
        "Readmission after 30 days",
        "Resurgery within 30 days",
        "Mobility after 30 days",
        "Residence after 30 days",
    ]
)

In [13]:
# Inspect the input frame used for objective 1.
hospital_predict1.info()   # the train set holds 985 patients and 28 input categories

<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 1122 to 102
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender                      985 non-null    float64
 1   Age                         985 non-null    float64
 2   Hospital                    985 non-null    float64
 3   Residence pre-fracture      985 non-null    float64
 4   Mobility pre-fracture       985 non-null    float64
 5   Pfeiffer                    985 non-null    float64
 6   ASA                         985 non-null    float64
 7   Fracture side               985 non-null    float64
 8   Fracture type               985 non-null    float64
 9   Treatment1 pre-fracture     985 non-null    float64
 10  Treatment4 pre-fracture     985 non-null    float64
 11  Treatment5 pre-fracture     985 non-null    float64
 12  Mortality after surgery     985 non-null    float64
 13  Surgery type                985 

In [14]:
                                          ###    Unit 3: Correlation of objective category        ####

In [15]:
###  Correlation of "Mobility after 30 days" with the other categories
# Pairwise correlation between all columns of the combined frame.
corr_matrix = hospital.corr()
# Display the target's column, largest correlation first.
corr_matrix["Mobility after 30 days"].sort_values(ascending=False)

Mobility after 30 days        1.000000
Mobility pre-fracture         0.531246
Residence after 30 days       0.462404
Pfeiffer                      0.432887
ASA                           0.315186
Mortality after surgery       0.293715
Age                           0.288792
Residence pre-fracture        0.231309
Destination after hospital    0.116588
Ulcers                        0.075139
Readmission after 30 days     0.074849
Anesthesia                    0.051781
Resurgery within 30 days      0.050999
Specialist                    0.043841
Gender                        0.030985
Hospital                      0.003310
Hospital stay                -0.010479
Surgery delay                -0.015270
Anesthesia blocking          -0.020723
Treatment5 pre-fracture      -0.040517
Fracture side                -0.045640
Treatment4 pre-fracture      -0.059320
Fracture type                -0.083025
Surgery type                 -0.099595
Treatment1 pre-fracture      -0.123163
Treatment4 after hospital

In [16]:
###  Correlation of "Alive after 30 days" with the other categories
# Reuses corr_matrix computed in the previous cell; largest correlation first.
corr_matrix["Alive after 30 days"].sort_values(ascending=False)

Alive after 30 days           1.000000
Treatment5 after 30 days      0.511086
Treatment4 after 30 days      0.276158
Treatment1 after 30 days      0.214957
Treatment5 after hospital     0.151312
Treatment4 after hospital     0.133537
Treatment1 after hospital     0.099290
Sat first day                 0.080616
Treatment4 pre-fracture       0.079194
Treatment1 pre-fracture       0.054167
Anesthesia blocking           0.035327
Treatment5 pre-fracture       0.030112
Hospital stay                 0.023168
Fracture side                 0.020737
Anesthesia                    0.012146
Residence pre-fracture       -0.006088
Fracture type                -0.014238
Resurgery within 30 days     -0.031333
Surgery type                 -0.041655
Surgery delay                -0.045507
Hospital                     -0.048411
Gender                       -0.053548
Ulcers                       -0.063477
ASA                          -0.112719
Age                          -0.122260
Readmission after 30 days

In [17]:
                                          ###    Unit 4: Algorithm test        ####

In [18]:
                                       ###    predict the "mobility after 30 days"  ####

In [19]:
### Accuracy ###

In [20]:
###  Multi-layer perceptron (neural network) classifier  ###
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

# Train on the objective-1 inputs, allowing up to 3000 iterations.
NNclass = MLPClassifier(max_iter=3000)
NNclass.fit(X=hospital_predict1, y=hospital_labels)

# Score on the test set.
predictNNclass = NNclass.predict(X=hospital_predict1_test)
result_NNclass = accuracy_score(y_true=hospital_labels_test, y_pred=predictNNclass)
print('Accuracy: {:.4%}'.format(result_NNclass))
# The accuracy changes from run to run, roughly within 55% - 61%.

Accuracy: 57.4713%


In [21]:
###  Linear support-vector classifier (LinearSVC)  ###
from sklearn.svm import LinearSVC

# Train on the objective-1 inputs.
linear_svc = LinearSVC()
linear_svc.fit(X=hospital_predict1, y=hospital_labels)

# Score on the test set.
predict_linear_svc = linear_svc.predict(X=hospital_predict1_test)
result_linear_svc = accuracy_score(y_true=hospital_labels_test, y_pred=predict_linear_svc)
print('Accuracy: {:.4%}'.format(result_linear_svc))
# The accuracy is very stable at 50.5747%.

Accuracy: 50.5747%


In [22]:
###  Support-vector classifier (SVC)  ###
from sklearn.svm import SVC

# Train on the objective-1 inputs.
svc = SVC()
svc.fit(X=hospital_predict1, y=hospital_labels)

# Score on the test set.
predict_svc = svc.predict(X=hospital_predict1_test)
result_svc = accuracy_score(y_true=hospital_labels_test, y_pred=predict_svc)
print('Accuracy:{:.4%}'.format(result_svc))
# The accuracy is very stable at 51.1494%.

Accuracy:51.1494%


In [23]:
###  Gradient boosting classifier  ###
from sklearn.ensemble import GradientBoostingClassifier

# Unit 5 found that leaving out "Hospital stay" improves the accuracy (code is in Unit 5).
# NOTE(review): Unit 5 compares accuracies on the test set, so this column choice was
# tuned on the same data that is scored here.
hospital_predict1_optim1 = hospital_predict1.drop(columns=["Hospital stay"])
hospital_predict1_test_optim1 = hospital_predict1_test.drop(columns=["Hospital stay"])

# Train on the reduced inputs.
gbc = GradientBoostingClassifier()
gbc.fit(X=hospital_predict1_optim1, y=hospital_labels)

# Score on the test set.
predict_gbc = gbc.predict(X=hospital_predict1_test_optim1)
result_gbc = accuracy_score(y_true=hospital_labels_test, y_pred=predict_gbc)
print('Accuracy:{:.4%}'.format(result_gbc))
# Stable accuracy; observed results are 68.3908%, 68.9655% or 69.5402%.

Accuracy:68.9655%


In [24]:
###  Histogram-based gradient boosting classifier  ###
# Keep the enabling import ahead of the estimator import.
# NOTE(review): believed to be needed only on scikit-learn releases where this
# estimator is still experimental -- confirm against the installed version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Unit 5 found that leaving out "Gender" improves the accuracy.
hospital_predict1_optim2 = hospital_predict1.drop(columns=["Gender"])
hospital_predict1_test_optim2 = hospital_predict1_test.drop(columns=["Gender"])

# Train on the reduced inputs.
hgbc = HistGradientBoostingClassifier()
hgbc.fit(X=hospital_predict1_optim2, y=hospital_labels)

# Score on the test set.
predict_hgbc = hgbc.predict(X=hospital_predict1_test_optim2)
result_hgbc = accuracy_score(y_true=hospital_labels_test, y_pred=predict_hgbc)
print('Accuracy:{:.4%}'.format(result_hgbc))
# The accuracy is very stable at 67.8161%.

Accuracy:67.8161%


In [25]:
###  Random forest classifier  ###
from sklearn.ensemble import RandomForestClassifier

# Train on the objective-1 inputs.
ran = RandomForestClassifier()
ran.fit(X=hospital_predict1, y=hospital_labels)

# Score on the test set.
predict_ran = ran.predict(X=hospital_predict1_test)
result_ran = accuracy_score(y_true=hospital_labels_test, y_pred=predict_ran)
print('Accuracy:{:.4%}'.format(result_ran))
# The accuracy varies between runs, about 63% - 72% with a best of 72.4138%;
# this range comes from the repeated runs in Unit 5.

Accuracy:64.9425%


In [26]:
# Conclusion for predicting "Mobility after 30 days" with the classification algorithms.
# Typo fix: "68.9665%" -> "68.9655%". The test set has 174 patients (1159 - 985), so
# every accuracy is a multiple of 1/174 and 120/174 = 68.9655%.
# NOTE(review): 67.2414% equals the Unit 5 "No Dropping" HistGradientBoosting result,
# while the Unit 4 cell (Gender dropped) reports 67.8161% -- confirm which is intended.
print(
    "Conclusion: We have tried 6 different classification algorithms above to predict "
    "the 'mobility after 30 days', and we can see that the HistGradientBoostingClassifier "
    "algorithm has the best stable accuracy which is 67.2414%, GradientBoostingClassifier "
    "algorithm has three results which are:68.3908%, 68.9655%, 69.5402% and the "
    "Randomforestclassifier algorithm sometimes has the dynamic result in the range of "
    "63-72%, but it may have the highest accuracy which is 72.4138% ."
)

Conclusion: We have tried 6 different classification algorithms above to predict the 'mobility after 30 days', and we can see that the HistGradientBoostingClassifier algorithm has the best stable accuracy which is 67.2414%, GradientBoostingClassifier algorithm has three results which are:68.3908%, 68.9665%, 69.5402% and the Randomforestclassifier algorithm sometimes has the dynamic result in the range of 66-70%, but it may have the highest accuracy which is 72.4138% .


In [27]:
###    Mean absolute error ###

In [28]:
###  Linear regression  ###
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Fit on the objective-1 inputs, treating the mobility label as a number.
lin_reg = LinearRegression()
lin_reg.fit(X=hospital_predict1, y=hospital_labels)

# Mean absolute error on the test set.
hospital_predict_lin_test = lin_reg.predict(X=hospital_predict1_test)
result_lin = mean_absolute_error(y_true=hospital_labels_test, y_pred=hospital_predict_lin_test)
print("Mean absolute error:", result_lin)

Mean absolute error: 1.0287779653217388


In [29]:
###  Gradient boosting regressor  ###
from sklearn.ensemble import GradientBoostingRegressor

# Fit on the objective-1 inputs.
gbr_reg = GradientBoostingRegressor()
gbr_reg.fit(X=hospital_predict1, y=hospital_labels)

# Mean absolute error on the test set.
hospital_predict_gbr = gbr_reg.predict(X=hospital_predict1_test)
result_gbr = mean_absolute_error(y_true=hospital_labels_test, y_pred=hospital_predict_gbr)
print("Mean absolute error:", result_gbr)

Mean absolute error: 0.8947962382572143


In [30]:
###  Random forest regressor  ###
from sklearn.ensemble import RandomForestRegressor

# Fit on the objective-1 inputs.
forest_reg = RandomForestRegressor()
forest_reg.fit(X=hospital_predict1, y=hospital_labels)

# Mean absolute error on the test set.
hospital_predictions_random_test = forest_reg.predict(X=hospital_predict1_test)
result_random = mean_absolute_error(
    y_true=hospital_labels_test, y_pred=hospital_predictions_random_test
)
print("Mean absolute error:", result_random)

Mean absolute error: 0.8282758620689655


In [31]:
###  Support-vector regressor (SVR)  ###
from sklearn.svm import SVR

# Fit on the objective-1 inputs.
svr_test = SVR()
svr_test.fit(X=hospital_predict1, y=hospital_labels)

# Mean absolute error on the test set.
hospital_predict_svr_test = svr_test.predict(X=hospital_predict1_test)
result_svr = mean_absolute_error(y_true=hospital_labels_test, y_pred=hospital_predict_svr_test)
print("Mean absolute error:", result_svr)

Mean absolute error: 1.0016733033708978


In [32]:
                                       ###    predict the "Alive after 30 days"  ####

In [33]:
###  Multi-layer perceptron (neural network) classifier  ###
# Train on the objective-2 inputs, allowing up to 1500 iterations.
NNclass2 = MLPClassifier(max_iter=1500)
NNclass2.fit(X=hospital_predict2, y=hospital_labels2)

# Score on the test set.
predictNNclass2 = NNclass2.predict(X=hospital_predict2_test)
result_NNclass2 = accuracy_score(y_true=hospital_labels2_test, y_pred=predictNNclass2)
print('Accuracy: {:.4%}'.format(result_NNclass2))
# The accuracy changes from run to run, within about 93.6% - 95.40%.

Accuracy: 93.6782%


In [34]:
###  Support-vector classifier (SVC)  ###
# Train on the objective-2 inputs.
svc2 = SVC()
svc2.fit(X=hospital_predict2, y=hospital_labels2)

# Score on the test set.
predict_svc2 = svc2.predict(X=hospital_predict2_test)
result_svc2 = accuracy_score(y_true=hospital_labels2_test, y_pred=predict_svc2)
print('Accuracy:{:.4%}'.format(result_svc2))
# The accuracy is stable at 94.8276%.

Accuracy:94.8276%


In [35]:
###  Gradient boosting classifier  ###
# Train on the objective-2 inputs.
gbc2 = GradientBoostingClassifier()
gbc2.fit(X=hospital_predict2, y=hospital_labels2)

# Score on the test set.
predict_gbc2 = gbc2.predict(X=hospital_predict2_test)
result_gbc2 = accuracy_score(y_true=hospital_labels2_test, y_pred=predict_gbc2)
print('Accuracy:{:.4%}'.format(result_gbc2))
# The accuracy is stable at 97.1264%.

Accuracy:97.1264%


In [36]:
###  Histogram-based gradient boosting classifier  ###
# Train on the objective-2 inputs.
hgbc2 = HistGradientBoostingClassifier()
hgbc2.fit(X=hospital_predict2, y=hospital_labels2)

# Score on the test set.
predict_hgbc2 = hgbc2.predict(X=hospital_predict2_test)
result_hgbc2 = accuracy_score(y_true=hospital_labels2_test, y_pred=predict_hgbc2)
print('Accuracy:{:.4%}'.format(result_hgbc2))
# The accuracy is stable at 96.5517%.

Accuracy:96.5517%


In [37]:
# Conclusion for predicting "Alive after 30 days" with the classification algorithms.
# The figures in the text are typed in by hand (not computed from the result_* values),
# so they must be updated manually whenever the cells above are re-run.
print("Conclusion: We have tried 4 different classification algorithms above to predict the 'Alive after 30 days', and we can see that the GradientBoostingClassifier algorithm has the highest and stable accuracy which is 97.1264%.") 

Conclusion: We have tried 4 different classification algorithms above to predict the 'Alive after 30 days', and we can see that the GradientBoostingClassifier algorithm has the highest and stable accuracy which is 97.1264%.


In [38]:
                                          ###    Unit 5: Tuning to better result        ####

In [42]:
# Effect of each input category on GradientBoostingClassifier: refit after dropping one
# category at a time and compare the test accuracy with the all-categories baseline.
# Like the original cell, this rebinds gbc / predict_gbc / result_gbc from Unit 4.
n_iterations = 1     # how many times to repeat the whole study
# Best / worst baseline accuracy over all iterations. They are deliberately NOT named
# max / min: that would shadow the built-ins for the rest of the notebook session.
max_accuracy = 0
min_accuracy = 100   # accuracies are fractions in [0, 1], so any result is smaller
for s in range(n_iterations):
    increase = []    # categories whose removal raised the accuracy
    equal = []       # categories whose removal left it unchanged
    decrease = []    # categories whose removal lowered it
    print("\nIteration {}".format(s+1))

    # Baseline: every input category kept.
    gbc = GradientBoostingClassifier()
    gbc.fit(hospital_predict1, hospital_labels)
    predict_gbc = gbc.predict(hospital_predict1_test)
    ndresult_gbc = accuracy_score(hospital_labels_test, predict_gbc)
    print('\nNo Dropping, Accuracy:{:.4%}'.format(ndresult_gbc))

    # NOTE(review): only the first 25 columns are tried although the input frame has
    # 28 categories -- confirm that skipping the last three is intentional.
    for i in hospital_data.columns[:25]:
        hospital_predict1_aux = hospital_predict1.drop(i, axis=1)
        hospital_predict1_test_aux = hospital_predict1_test.drop(i, axis=1)
        gbc = GradientBoostingClassifier()
        gbc.fit(hospital_predict1_aux, hospital_labels)
        predict_gbc = gbc.predict(hospital_predict1_test_aux)
        result_gbc = accuracy_score(hospital_labels_test, predict_gbc)
        print('Dropping {}, Accuracy:{:.4%}'.format(i,result_gbc))
        if result_gbc < ndresult_gbc:
            decrease.append(i)
        elif result_gbc == ndresult_gbc:
            equal.append(i)
        else:
            increase.append(i)

    print("\nDropping the following parameters the Accuaracy decreases: ")  
    for param in decrease:
        print(param)

    print("\nDropping the following parameters doesn't impact the Accuaracy: ")  
    for param in equal:
        print(param)

    print("\nDropping the following parameters the Accuaracy increases: ")  
    for param in increase:
        print(param)

    if ndresult_gbc > max_accuracy:
        max_accuracy = ndresult_gbc
    # Bug fix: the original tested result_gbc (the accuracy of the LAST single-column
    # drop) but stored ndresult_gbc, so the reported minimum could be wrong.
    if ndresult_gbc < min_accuracy:
        min_accuracy = ndresult_gbc
print('\nThe MAX Accuracy:{:.4%}'.format(max_accuracy))
print('\nThe min Accuracy:{:.4%}'.format(min_accuracy))
# By running many iterations, we found it is better to drop the category "Hospital stay" to increase the accuracy


Iteration 1

No Dropping, Accuracy:68.3908%
Dropping Gender, Accuracy:66.0920%
Dropping Age, Accuracy:67.2414%
Dropping Hospital, Accuracy:66.0920%
Dropping Residence pre-fracture, Accuracy:65.5172%
Dropping Mobility pre-fracture, Accuracy:62.0690%
Dropping Pfeiffer, Accuracy:67.2414%
Dropping ASA, Accuracy:67.8161%
Dropping Fracture side, Accuracy:66.6667%
Dropping Fracture type, Accuracy:66.0920%
Dropping Treatment1 pre-fracture, Accuracy:66.6667%
Dropping Treatment4 pre-fracture, Accuracy:64.9425%
Dropping Treatment5 pre-fracture, Accuracy:67.8161%
Dropping Mortality after surgery, Accuracy:68.3908%
Dropping Surgery type, Accuracy:64.9425%
Dropping Surgery delay, Accuracy:63.7931%
Dropping Anesthesia, Accuracy:66.0920%
Dropping Anesthesia blocking, Accuracy:68.3908%
Dropping Ulcers, Accuracy:67.8161%
Dropping Specialist, Accuracy:67.2414%
Dropping Sat first day, Accuracy:66.0920%
Dropping Destination after hospital, Accuracy:67.8161%
Dropping Hospital stay, Accuracy:68.9655%
Droppi

In [43]:
# Effect of each input category on HistGradientBoostingClassifier: refit after dropping
# one category at a time and compare the test accuracy with the all-categories baseline.
# Like the original cell, this rebinds hgbc / predict_hgbc / result_hgbc from Unit 4.
n_iterations = 1     # how many times to repeat the whole study
# Best / worst baseline accuracy over all iterations. They are deliberately NOT named
# max / min: that would shadow the built-ins for the rest of the notebook session.
max_accuracy = 0
min_accuracy = 100   # accuracies are fractions in [0, 1], so any result is smaller
for s in range(n_iterations):
    increase = []    # categories whose removal raised the accuracy
    equal = []       # categories whose removal left it unchanged
    decrease = []    # categories whose removal lowered it
    print("\nIteration {}".format(s+1))

    # Baseline: every input category kept.
    hgbc = HistGradientBoostingClassifier()
    hgbc.fit(hospital_predict1, hospital_labels)
    predict_hgbc = hgbc.predict(hospital_predict1_test)
    ndresult_hgbc = accuracy_score(hospital_labels_test, predict_hgbc)
    print('\nNo Dropping, Accuracy:{:.4%}'.format(ndresult_hgbc))

    # NOTE(review): only the first 25 columns are tried although the input frame has
    # 28 categories -- confirm that skipping the last three is intentional.
    for i in hospital_data.columns[:25]:
        hospital_predict1_aux = hospital_predict1.drop(i, axis=1)
        hospital_predict1_test_aux = hospital_predict1_test.drop(i, axis=1)

        hgbc = HistGradientBoostingClassifier()
        hgbc.fit(hospital_predict1_aux, hospital_labels)
        predict_hgbc = hgbc.predict(hospital_predict1_test_aux)
        result_hgbc = accuracy_score(hospital_labels_test, predict_hgbc)
        print('Dropping {}, Accuracy:{:.4%}'.format(i,result_hgbc))
        if result_hgbc < ndresult_hgbc:
            decrease.append(i)
        elif result_hgbc == ndresult_hgbc:
            equal.append(i)
        else:
            increase.append(i)

    print("\nDropping the following parameters the Accuaracy decreases: ")  
    for param in decrease:
        print(param)

    print("\nDropping the following parameters doesn't impact the Accuaracy: ")  
    for param in equal:
        print(param)

    print("\nDropping the following parameters the Accuaracy increases: ")  
    for param in increase:
        print(param)

    if ndresult_hgbc > max_accuracy:
        max_accuracy = ndresult_hgbc
    if ndresult_hgbc < min_accuracy:
        min_accuracy = ndresult_hgbc
print('\nThe MAX Accuracy:{:.4%}'.format(max_accuracy))
print('\nThe min Accuracy:{:.4%}'.format(min_accuracy))
# By running many iterations, we found that the accuracy will be improved by dropping the category of Gender.


Iteration 1

No Dropping, Accuracy:67.2414%
Dropping Gender, Accuracy:67.8161%
Dropping Age, Accuracy:63.2184%
Dropping Hospital, Accuracy:63.7931%
Dropping Residence pre-fracture, Accuracy:64.9425%
Dropping Mobility pre-fracture, Accuracy:59.7701%
Dropping Pfeiffer, Accuracy:66.6667%
Dropping ASA, Accuracy:66.6667%
Dropping Fracture side, Accuracy:66.0920%
Dropping Fracture type, Accuracy:66.0920%
Dropping Treatment1 pre-fracture, Accuracy:64.3678%
Dropping Treatment4 pre-fracture, Accuracy:64.3678%
Dropping Treatment5 pre-fracture, Accuracy:65.5172%
Dropping Mortality after surgery, Accuracy:66.6667%
Dropping Surgery type, Accuracy:66.6667%
Dropping Surgery delay, Accuracy:64.9425%
Dropping Anesthesia, Accuracy:64.9425%
Dropping Anesthesia blocking, Accuracy:67.2414%
Dropping Ulcers, Accuracy:67.2414%
Dropping Specialist, Accuracy:67.2414%
Dropping Sat first day, Accuracy:67.2414%
Dropping Destination after hospital, Accuracy:66.0920%
Dropping Hospital stay, Accuracy:66.0920%
Droppi

In [41]:
###   See how dynamic the RandomForestClassifier accuracy is over many iterations   ###
# Like the original cell, this rebinds ran / predict_ran / result_ran from Unit 4.
num = 50                 # number of independent fits
# Running statistics. They are deliberately NOT named sum / max / min: that would
# shadow the built-ins for the rest of the notebook session.
accuracy_total = 0
max_accuracy = 0
min_accuracy = 100       # accuracies are fractions in [0, 1], so any result is smaller
for s in range(num):
    print("\nIteration {}".format(s+1))
    # A fresh, unseeded forest each time, so the spread between runs becomes visible.
    ran = RandomForestClassifier()
    ran.fit(hospital_predict1, hospital_labels)
    predict_ran = ran.predict(hospital_predict1_test)
    result_ran = accuracy_score(hospital_labels_test, predict_ran)
    print('Accuracy:{:.4%}'.format(result_ran))
    accuracy_total += result_ran
    if result_ran > max_accuracy:
        max_accuracy = result_ran
    if result_ran < min_accuracy:
        min_accuracy = result_ran
print('\nThe MAX Accuracy:{:.4%}'.format(max_accuracy))
print('The min Accuracy:{:.4%}'.format(min_accuracy))
print('The mean Accuracy in {} iterations:{:.4%}'.format(num,accuracy_total/num))


Iteration 1
Accuracy:68.3908%

Iteration 2
Accuracy:67.2414%

Iteration 3
Accuracy:66.0920%

Iteration 4
Accuracy:67.2414%

Iteration 5
Accuracy:66.6667%

Iteration 6
Accuracy:67.2414%

Iteration 7
Accuracy:68.3908%

Iteration 8
Accuracy:67.8161%

Iteration 9
Accuracy:67.8161%

Iteration 10
Accuracy:69.5402%

Iteration 11
Accuracy:64.9425%

Iteration 12
Accuracy:68.3908%

Iteration 13
Accuracy:68.3908%

Iteration 14
Accuracy:70.1149%

Iteration 15
Accuracy:68.3908%

Iteration 16
Accuracy:69.5402%

Iteration 17
Accuracy:67.2414%

Iteration 18
Accuracy:65.5172%

Iteration 19
Accuracy:67.8161%

Iteration 20
Accuracy:68.9655%

Iteration 21
Accuracy:67.2414%

Iteration 22
Accuracy:67.8161%

Iteration 23
Accuracy:65.5172%

Iteration 24
Accuracy:67.2414%

Iteration 25
Accuracy:66.6667%

Iteration 26
Accuracy:67.8161%

Iteration 27
Accuracy:68.3908%

Iteration 28
Accuracy:67.8161%

Iteration 29
Accuracy:67.8161%

Iteration 30
Accuracy:68.3908%

Iteration 31
Accuracy:68.9655%

Iteration 32
Acc