In [None]:
"""
Objective:
    • Learn to handle missing values
    • Learn to fit a decision tree and compare its accuracy with a random forest 
classifier. 
"""

In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
"""
Let's attempt to predict the survival of a horse based on various observed 
medical conditions. Load the data from 'horses.csv' and observe whether it 
contains missing values.
[Hint: Pandas dataframe has a method isnull]
"""
CSV_PATH = r'D:\CourseWork\data-science-python-certification-course\Assignments\07 Supervised Learning - 1\Case Study II\resources\horse.csv'
df = pd.read_csv(CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [5]:
# Fit-Transform
imp = SimpleImputer(strategy="most_frequent")
ft = lambda label: imp.fit_transform(df[label].values.reshape(-1, 1))

# For Categorical Fit-Transform 
le = LabelEncoder()
cle = lambda label: le.fit_transform(df[label])

"""
This dataset contains many categorical features, replace them with label 
encoding.
[Hint: Refer to get_dummies methods in pandas dataframe or Label encoder in 
scikit-learn]
"""

# Categorical nulls filling
df['temp_of_extremities'] = ft('temp_of_extremities')
df['peripheral_pulse'] = ft('peripheral_pulse')
df['mucous_membrane'] = ft('mucous_membrane')
df['capillary_refill_time'] = ft('capillary_refill_time')
df['pain'] = ft('pain')
df['peristalsis'] = ft('peristalsis')
df['abdominal_distention'] = ft('abdominal_distention')
df['nasogastric_tube'] = ft('nasogastric_tube')
df['nasogastric_reflux'] = ft('nasogastric_reflux')
df['rectal_exam_feces'] = ft('rectal_exam_feces')
df['abdomen'] = ft('abdomen')
df['abdomo_appearance'] = ft('abdomo_appearance')

# Encoding
df['temp_of_extremities'] = cle('temp_of_extremities')
df['peripheral_pulse'] = cle('peripheral_pulse')
df['mucous_membrane'] = cle('mucous_membrane')
df['capillary_refill_time'] = cle('capillary_refill_time')
df['pain'] = cle('pain')
df['peristalsis'] = cle('peristalsis')
df['abdominal_distention'] = cle('abdominal_distention')
df['nasogastric_tube'] = cle('nasogastric_tube')
df['nasogastric_reflux'] = cle('nasogastric_reflux')
df['rectal_exam_feces'] = cle('rectal_exam_feces')
df['abdomen'] = cle('abdomen')
df['abdomo_appearance'] = cle('abdomo_appearance')
df['outcome'] = cle('outcome')
df['surgery'] = cle('surgery')
df['age'] = cle('age')
df['cp_data'] = cle('cp_data')
df['surgical_lesion'] = cle('surgical_lesion')


"""
Replace the missing values with the most frequent value in each column.
[Hint: Refer to Imputer class in Scikit learn preprocessing module]
"""

# Numeric
df['rectal_temp'] = ft('rectal_temp')
df['pulse'] = ft('pulse')
df['respiratory_rate'] = ft('respiratory_rate')
df['nasogastric_reflux_ph'] = ft('nasogastric_reflux_ph')
df['packed_cell_volume'] = ft('packed_cell_volume')
df['total_protein'] = ft('total_protein')
df['abdomo_protein'] = ft('abdomo_protein')

df.head(5)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,0,530101,38.5,66.0,28.0,1,3,3,2,...,45.0,8.4,1,2.0,0,0,11300,0,0,0
1,1,0,534817,39.2,88.0,20.0,1,2,4,1,...,50.0,85.0,1,2.0,1,0,2208,0,0,0
2,0,0,530334,38.3,40.0,24.0,2,2,5,1,...,33.0,6.7,1,2.0,2,0,0,0,0,1
3,1,1,5290409,39.1,164.0,84.0,0,2,2,2,...,48.0,7.2,2,5.3,0,1,2208,0,0,1
4,0,0,530255,37.3,104.0,35.0,1,2,2,2,...,74.0,7.4,1,2.0,0,0,4300,0,0,0


In [6]:
# Preparing x and y before applying classifiers
x = df.drop('outcome', axis=1)
y = df['outcome']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=157)

In [8]:
# Fit a decision tree classifier and observe the accuracy.
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dtc_predicted = dtc.predict(x_test)
metrics.accuracy_score(y_test, dtc_predicted)

0.5714285714285714

In [9]:
# Fit a random forest classifier and observe the accuracy.
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_predicted = rfc.predict(x_test)
metrics.accuracy_score(y_test, rfc_predicted)

0.6476190476190476