In [None]:
# The dataset provided is the Horse Survival Dataset, consisting of both training and test data.
# It comprises various features that may influence the survival outcome of horses. Here is a breakdown of the columns, based on my understanding:

id: Unique identifier for each horse.
surgery: Indicates whether the horse underwent surgery (yes/no).
age: Age of the horse (adult/young).
hospital_number: Hospital identification number.
rectal_temp: Rectal temperature of the horse.
pulse: Pulse rate of the horse.
respiratory_rate: Respiratory rate of the horse.
temp_of_extremities: Temperature of the extremities.
peripheral_pulse: Peripheral pulse status.
mucous_membrane: Condition of the mucous membranes.
capillary_refill_time: Capillary refill time.
pain: Level of pain experienced by the horse.
peristalsis: Peristalsis status.
abdominal_distention: Degree of abdominal distention.
nasogastric_tube: Presence of a nasogastric tube.
nasogastric_reflux: Nasogastric reflux status.
nasogastric_reflux_ph: pH level of nasogastric reflux.
rectal_exam_feces: Results of rectal examination for feces.
abdomen: Abdominal condition.
packed_cell_volume: Packed cell volume in blood.
total_protein: Total protein level in blood.
abdomo_appearance: Appearance of abdominal fluid.
abdomo_protein: Protein level in abdominal fluid.
surgical_lesion: Presence of a surgical lesion (yes/no).
lesion_1, lesion_2, lesion_3: Lesion codes.
cp_data: Is pathology data present (yes/no).
outcome: Final outcome for the horse (lived, died, or euthanized).

In [3]:
#importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, roc_curve, auc,recall_score, f1_score
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder

In [4]:
# Read the training split from disk into the working DataFrame `df`
df = pd.read_csv("dtrain.csv")

In [5]:
# DATA UNDERSTANDING: preview the first five rows of the frame
df.head(5)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [6]:
# Summary of the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# the quartiles are spelled out explicitly and match the pandas default
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0
mean,617.0,954500.4,38.202186,79.574089,30.054251,4.382591,49.602429,21.388016,3.290931,3832.496356,14.612146,3.577328
std,356.6581,1356403.0,0.788668,29.108638,16.452066,1.937357,10.5358,26.676453,1.589195,5436.733774,193.705735,88.858953
min,0.0,521399.0,35.4,30.0,8.0,1.0,23.0,3.5,0.1,0.0,0.0,0.0
25%,308.5,528800.0,37.8,53.0,18.0,2.0,43.0,6.6,2.0,2205.0,0.0,0.0
50%,617.0,529777.0,38.2,76.0,28.0,4.5,48.0,7.5,3.0,2209.0,0.0,0.0
75%,925.5,534145.0,38.6,100.0,36.0,6.0,57.0,9.1,4.3,3205.0,0.0,0.0
max,1234.0,5305129.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,3112.0,2209.0


In [9]:
# Number of missing (NaN) entries in each column
print(df.isna().sum())

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64


In [13]:
# Flag fully duplicated rows, then inspect the first 30 rows.
# Any existing 'duplicate' column is excluded from the comparison so the
# cell stays idempotent: if the flag took part in the comparison, a repeated
# row (flag True) would no longer match its first occurrence (flag False)
# and every flag would silently reset to False on a re-run.
# NOTE(review): `id` still takes part in the comparison; if it is unique per
# row, no row can ever be flagged. Consider duplicated(subset=...) without
# `id` if content-level duplicates are what matters - confirm intent.
df['duplicate'] = df.drop(columns='duplicate', errors='ignore').duplicated()
df.head(30)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,duplicate
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,8.5,serosanguious,3.4,yes,2209,0,0,no,died,False
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized,False
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,6.4,serosanguious,3.4,yes,5124,0,0,no,lived,False
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,7.0,cloudy,3.9,yes,2208,0,0,yes,lived,False
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,7.3,cloudy,2.6,no,0,0,0,yes,lived,False
5,5,no,adult,529642,38.1,56.0,32.0,normal,normal,bright_pink,...,8.0,cloudy,2.8,no,0,0,0,yes,lived,False
6,6,yes,adult,534787,38.3,36.0,16.0,cool,reduced,normal_pink,...,75.0,cloudy,1.0,no,3111,0,0,yes,euthanized,False
7,7,no,adult,529461,39.2,114.0,24.0,cool,reduced,pale_cyanotic,...,7.6,serosanguious,4.5,yes,2207,0,0,yes,died,False
8,8,no,adult,528742,37.4,48.0,12.0,cool,reduced,normal_pink,...,7.8,cloudy,2.6,no,0,0,0,yes,lived,False
9,9,yes,adult,529640,38.3,129.0,48.0,cool,reduced,pale_pink,...,4.9,cloudy,2.9,yes,3209,0,0,yes,died,False


In [14]:
# Check the last 30 rows for any duplicates.
# The 'duplicate' flag column already exists at this point, so it is dropped
# before the comparison; otherwise a repeated row (flag True) would differ
# from its first occurrence (flag False) and the recomputed flag would be
# False for every row.
df['duplicate'] = df.drop(columns='duplicate', errors='ignore').duplicated()
df.tail(30)


Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,duplicate
1205,1205,no,adult,529461,38.2,88.0,36.0,cool,normal,normal_pink,...,8.1,clear,4.5,yes,3205,0,0,no,died,False
1206,1206,yes,adult,533836,39.1,120.0,40.0,cold,reduced,bright_pink,...,64.0,serosanguious,2.0,yes,2208,0,0,no,lived,False
1207,1207,no,adult,529461,40.3,114.0,36.0,cool,reduced,normal_pink,...,8.1,serosanguious,4.5,yes,2205,0,0,no,euthanized,False
1208,1208,yes,adult,534787,38.0,36.0,16.0,cool,,pale_pink,...,75.0,clear,2.0,yes,3111,0,0,no,euthanized,False
1209,1209,yes,young,529126,38.6,72.0,12.0,cool,normal,bright_pink,...,7.2,cloudy,2.8,yes,7111,0,0,yes,lived,False
1210,1210,no,adult,528996,38.3,60.0,20.0,cool,reduced,pale_pink,...,7.5,clear,5.0,no,0,0,0,yes,lived,False
1211,1211,yes,adult,527883,37.8,108.0,24.0,cool,reduced,pale_pink,...,6.5,serosanguious,4.1,yes,6111,0,0,yes,died,False
1212,1212,no,adult,529796,37.5,64.0,32.0,cool,reduced,normal_pink,...,7.7,clear,3.3,yes,4205,0,0,yes,lived,False
1213,1213,no,adult,530670,40.3,96.0,36.0,cool,reduced,normal_pink,...,7.5,serosanguious,4.5,yes,1400,0,0,yes,died,False
1214,1214,yes,adult,527933,38.1,60.0,18.0,,,,...,58.0,cloudy,1.0,yes,2209,0,0,yes,died,False


In [15]:
# The prediction target is the 'outcome' column; look at its first 15 values
target_col = 'outcome'
df[target_col].iloc[:15]

0           died
1     euthanized
2          lived
3          lived
4          lived
5          lived
6     euthanized
7           died
8          lived
9           died
10          died
11    euthanized
12         lived
13         lived
14         lived
Name: outcome, dtype: object

In [16]:
# Split the column names by dtype: object columns are treated as categorical,
# every other dtype as numerical
cat = list(df.select_dtypes(include="object").columns)
num = list(df.select_dtypes(exclude="object").columns)

print(f"Categorical Features: {cat}")

print(f"Numerical Features: {num}")

Categorical Features: ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data', 'outcome']
Numerical Features: ['id', 'hospital_number', 'rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1', 'lesion_2', 'lesion_3', 'duplicate']


In [17]:
# For every categorical column, report how many distinct values it holds
# (NaN counts as a value) followed by the values themselves
print("Train Data Categorical Feature Uniqueness:")
print("*" * 60)
for feature in cat:
    values = df[feature].unique()
    print(f'{feature}: Unique Count: {len(values)}\n {values}\n')

Train Data Categorical Feature Uniqueness:
************************************************************
surgery: Unique Count: 2
 ['yes' 'no']

age: Unique Count: 2
 ['adult' 'young']

temp_of_extremities: Unique Count: 5
 ['cool' 'cold' 'normal' 'warm' nan]

peripheral_pulse: Unique Count: 5
 ['reduced' 'normal' nan 'absent' 'increased']

mucous_membrane: Unique Count: 7
 ['dark_cyanotic' 'pale_cyanotic' 'pale_pink' 'normal_pink' 'bright_pink'
 'bright_red' nan]

capillary_refill_time: Unique Count: 4
 ['more_3_sec' 'less_3_sec' nan '3']

pain: Unique Count: 7
 ['depressed' 'mild_pain' 'extreme_pain' 'alert' 'severe_pain' nan 'slight']

peristalsis: Unique Count: 6
 ['absent' 'hypomotile' 'normal' 'hypermotile' nan 'distend_small']

abdominal_distention: Unique Count: 5
 ['slight' 'moderate' 'none' 'severe' nan]

nasogastric_tube: Unique Count: 4
 ['slight' 'none' 'significant' nan]

nasogastric_reflux: Unique Count: 5
 ['less_1_liter' 'more_1_liter' 'none' nan 'slight']

rectal_exam_

In [18]:
# For every numerical column, report the number of distinct values; the
# values themselves are listed only when there are at most five of them
print("Train Dataset Numerical Feature Uniqueness:")
print("*" * 60)
for feature in num:
    values = df[feature].unique()
    summary = f'{feature}: Unique Count: {len(values)}\n'
    if len(values) <= 5:
        summary += f' {values}\n'
    print(summary)

Train Dataset Numerical Feature Uniqueness:
************************************************************
id: Unique Count: 1235

hospital_number: Unique Count: 255

rectal_temp: Unique Count: 43

pulse: Unique Count: 50

respiratory_rate: Unique Count: 37

nasogastric_reflux_ph: Unique Count: 26

packed_cell_volume: Unique Count: 49

total_protein: Unique Count: 83

abdomo_protein: Unique Count: 54

lesion_1: Unique Count: 57

lesion_2: Unique Count: 4
 [   0 1400 3111 3112]

lesion_3: Unique Count: 2
 [   0 2209]

duplicate: Unique Count: 1
 [False]



In [20]:
pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.
