# Predicting Horse Survival

## 1. Let’s attempt to predict the survival of a horse based on various observed medical conditions. Load the data from ‘horses.csv’ and observe whether it contains missing values.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 30

In [2]:
df = pd.read_csv('horse.csv')
df

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,,decreased,distend_large,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,,normal,normal,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,,,,,,,,74.0,7.4,,,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,depressed,absent,,,,,,distend_large,55.0,65.0,,,euthanized,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,severe_pain,hypomotile,moderate,significant,none,,absent,distend_small,44.0,,serosanguious,3.3,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,severe_pain,absent,moderate,slight,none,,decreased,distend_large,60.0,6.8,,,died,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,mild_pain,hypomotile,moderate,significant,none,,absent,distend_small,50.0,6.0,serosanguious,3.4,lived,yes,2208,0,0,yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [4]:
print(f"Null values by column:\n{df.isnull().sum()}")

Null values by column:
surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64


## 2. This dataset contains many categorical features, replace them with label encoding.

In [5]:
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,,decreased,distend_large,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,,normal,normal,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,,,,,,,,74.0,7.4,,,died,no,4300,0,0,no


In [6]:
X = df.drop('outcome', axis=1)
y = df.outcome
X

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,,decreased,distend_large,45.0,8.4,,,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,,absent,other,50.0,85.0,cloudy,2.0,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,,normal,normal,33.0,6.7,,,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,5.0,decreased,,48.0,7.2,serosanguious,5.3,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,,,,,,,,74.0,7.4,,,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,depressed,absent,,,,,,distend_large,55.0,65.0,,,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,severe_pain,hypomotile,moderate,significant,none,,absent,distend_small,44.0,,serosanguious,3.3,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,severe_pain,absent,moderate,slight,none,,decreased,distend_large,60.0,6.8,,,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,mild_pain,hypomotile,moderate,significant,none,,absent,distend_small,50.0,6.0,serosanguious,3.4,yes,2208,0,0,yes


In [7]:
X = pd.get_dummies(X)
X

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3,surgery_no,surgery_yes,age_adult,age_young,...,rectal_exam_feces_decreased,rectal_exam_feces_increased,rectal_exam_feces_normal,abdomen_distend_large,abdomen_distend_small,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101,38.5,66.0,28.0,,45.0,8.4,,11300,0,0,1,0,1,0,...,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,534817,39.2,88.0,20.0,,50.0,85.0,2.0,2208,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0
2,530334,38.3,40.0,24.0,,33.0,6.7,,0,0,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1
3,5290409,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1
4,530255,37.3,104.0,35.0,,74.0,7.4,,4300,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,533886,,120.0,70.0,,55.0,65.0,,3205,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
295,527702,37.2,72.0,24.0,,44.0,,3.3,2208,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1
296,529386,37.5,72.0,30.0,,60.0,6.8,,3205,0,0,0,1,1,0,...,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0
297,530612,36.5,100.0,24.0,,50.0,6.0,3.4,2208,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1


In [8]:
y = y.map({'lived': 0, 'died': 1, 'euthanized': 2})

## 3. Replace the missing values by the most frequent value in each column.[Hint: Refer to Imputer class in Scikit learn preprocessing module]

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
imp = SimpleImputer(strategy='most_frequent')
imp.fit(X)

SimpleImputer(strategy='most_frequent')

In [11]:
columns = X.columns.values
columns

array(['hospital_number', 'rectal_temp', 'pulse', 'respiratory_rate',
       'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein',
       'abdomo_protein', 'lesion_1', 'lesion_2', 'lesion_3', 'surgery_no',
       'surgery_yes', 'age_adult', 'age_young',
       'temp_of_extremities_cold', 'temp_of_extremities_cool',
       'temp_of_extremities_normal', 'temp_of_extremities_warm',
       'peripheral_pulse_absent', 'peripheral_pulse_increased',
       'peripheral_pulse_normal', 'peripheral_pulse_reduced',
       'mucous_membrane_bright_pink', 'mucous_membrane_bright_red',
       'mucous_membrane_dark_cyanotic', 'mucous_membrane_normal_pink',
       'mucous_membrane_pale_cyanotic', 'mucous_membrane_pale_pink',
       'capillary_refill_time_3', 'capillary_refill_time_less_3_sec',
       'capillary_refill_time_more_3_sec', 'pain_alert', 'pain_depressed',
       'pain_extreme_pain', 'pain_mild_pain', 'pain_severe_pain',
       'peristalsis_absent', 'peristalsis_hypermotile',
       

In [12]:
X = pd.DataFrame(imp.transform(X), columns=columns)
#df2.columns = df.columns.copy()
X

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3,surgery_no,surgery_yes,age_adult,age_young,...,rectal_exam_feces_decreased,rectal_exam_feces_increased,rectal_exam_feces_normal,abdomen_distend_large,abdomen_distend_small,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101.0,38.5,66.0,28.0,2.0,45.0,8.4,2.0,11300.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,534817.0,39.2,88.0,20.0,2.0,50.0,85.0,2.0,2208.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,530334.0,38.3,40.0,24.0,2.0,33.0,6.7,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,5290409.0,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,530255.0,37.3,104.0,35.0,2.0,74.0,7.4,2.0,4300.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,533886.0,38.0,120.0,70.0,2.0,55.0,65.0,2.0,3205.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
295,527702.0,37.2,72.0,24.0,2.0,44.0,6.5,3.3,2208.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
296,529386.0,37.5,72.0,30.0,2.0,60.0,6.8,2.0,3205.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
297,530612.0,36.5,100.0,24.0,2.0,50.0,6.0,3.4,2208.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [13]:
y

0      1
1      2
2      0
3      1
4      1
      ..
294    2
295    2
296    1
297    0
298    2
Name: outcome, Length: 299, dtype: int64

## 4. Fit a decision tree classifier and observe the accuracy.

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [16]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [17]:
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score

In [18]:
print(classification_report(y_test, y_pred))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.84      0.68      0.75        53
           1       0.58      0.64      0.61        28
           2       0.44      0.78      0.56         9

    accuracy                           0.68        90
   macro avg       0.62      0.70      0.64        90
weighted avg       0.72      0.68      0.69        90

Accuracy Score: 0.6777777777777778


## 5.Fit a random forest classifier and observe the accuracy.

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.71      0.98      0.83        53
           1       0.87      0.46      0.60        28
           2       0.50      0.11      0.18         9

    accuracy                           0.73        90
   macro avg       0.69      0.52      0.54        90
weighted avg       0.74      0.73      0.69        90

Accuracy Score: 0.7333333333333333


### The Random Forest Classifier seems to be doing a better job classifying our data correctly.