# Problem Statement

Predict the survival of a horse based on various observed medical conditions.
Load the data "horse.csv" and check whether it contains any missing values.
The dataset contains many categorical features; replace them with label encoding. 
Replace the missing values with the most frequent value in each column.
Fit a Decision Tree Classifier and Random Florest Classifier and observe the accuracy.
You are supposed to fit a Decision Tremm and compare its accuracy with Random Forest Classifier.

In [137]:
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [138]:
df = pd.read_csv('Datasets/horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [140]:
# Select and transform in an list only columns identify by object 
list(df.select_dtypes(include=[object]).columns)

['surgery',
 'age',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain',
 'peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'outcome',
 'surgical_lesion',
 'cp_data']

In [141]:
target = df['outcome']

In [142]:
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [143]:
df = df.drop('outcome', axis = 1)

In [144]:
category_variables = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane',
 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']

df = pd.get_dummies(df, columns=category_variables)

In [145]:
df.head()

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101,38.5,66.0,28.0,,45.0,8.4,,11300,0,...,False,False,False,False,False,False,True,False,True,False
1,534817,39.2,88.0,20.0,,50.0,85.0,2.0,2208,0,...,False,False,True,False,True,False,True,False,True,False
2,530334,38.3,40.0,24.0,,33.0,6.7,,0,0,...,False,True,False,False,False,False,True,False,False,True
3,5290409,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,...,False,False,False,False,False,True,False,True,False,True
4,530255,37.3,104.0,35.0,,74.0,7.4,,4300,0,...,False,False,False,False,False,False,True,False,True,False


In [146]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X, y = df.values, target.values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 1 )

In [147]:
from sklearn.tree import DecisionTreeClassifier
print(X_train.shape)

(239, 67)


In [148]:
from sklearn.impute import SimpleImputer
import numpy as np

imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [149]:
classifier = DecisionTreeClassifier()

In [150]:
classifier.fit(X_train, y_train)

In [151]:
y_predict = classifier.predict(X_test)

In [152]:
from sklearn.metrics import accuracy_score

In [153]:
accuracy = accuracy_score(y_predict, y_test)

In [154]:
print(accuracy)

0.6166666666666667


In [155]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

In [156]:
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
accuracy = accuracy_score(y_predict, y_test)
print(accuracy)

0.75


# Quick Recap

1 - Import Libraries and dataset

2 - Preprocess the data using get dummies and label encolders

3 - Import the missing values using the most frequent values

4 - Fit Decision Tree Classifier on the transformed data

5 - Print the accuracy

6 - Fit Random Forest Classifier on the transformed data

7 - Print the accuracy of the Random Forest Classifier 
