In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the dataset from 'horse.csv' (path is relative to the notebook's working directory).
df=pd.read_csv('horse.csv')

In [15]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [16]:
# Show each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [17]:
# List every column name in the frame.
df.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object')

#### The `outcome` column is the target column here.

## Preprocessing

### Split data into X and y

In [18]:
# Separate the predictors (X) from the target column (y); both are copies,
# so later changes to df do not propagate to them and vice versa.
X = df.drop(columns=['outcome']).copy()
y = df['outcome'].copy()

### Categorizing columns as Binary, Ordinal and Nominal

In [19]:
# Distinct values (NaN included) of every object-dtype column in X, shown so
# each column can be classified as binary, ordinal or nominal.
dict(
    (column, X[column].unique().tolist())
    for column in X.select_dtypes('object')
)

{'surgery': ['no', 'yes'],
 'age': ['adult', 'young'],
 'temp_of_extremities': ['cool', nan, 'normal', 'cold', 'warm'],
 'peripheral_pulse': ['reduced', nan, 'normal', 'absent', 'increased'],
 'mucous_membrane': [nan,
  'pale_cyanotic',
  'pale_pink',
  'dark_cyanotic',
  'normal_pink',
  'bright_red',
  'bright_pink'],
 'capillary_refill_time': ['more_3_sec', 'less_3_sec', nan, '3'],
 'pain': ['extreme_pain',
  'mild_pain',
  'depressed',
  nan,
  'severe_pain',
  'alert'],
 'peristalsis': ['absent', 'hypomotile', nan, 'hypermotile', 'normal'],
 'abdominal_distention': ['severe', 'slight', 'none', nan, 'moderate'],
 'nasogastric_tube': [nan, 'none', 'slight', 'significant'],
 'nasogastric_reflux': [nan, 'less_1_liter', 'none', 'more_1_liter'],
 'rectal_exam_feces': ['decreased', 'absent', 'normal', nan, 'increased'],
 'abdomen': ['distend_large', 'other', 'normal', nan, 'firm', 'distend_small'],
 'abdomo_appearance': [nan, 'cloudy', 'serosanguious', 'clear'],
 'surgical_lesion': ['no'

### Functions to convert categorical data to numerical

In [20]:
def binary_encode(df, columns, positive_values):
    """Return a copy of ``df`` with each listed column encoded as 0/1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : sequence of str
        Names of the columns to encode.
    positive_values : sequence
        For each entry of ``columns``, the value that is encoded as 1.
        Every other value, including missing values, is encoded as 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose listed columns hold int64 0/1 flags.

    Raises
    ------
    ValueError
        If ``columns`` and ``positive_values`` have different lengths
        (``zip`` would otherwise silently ignore the extra entries).
    """
    columns = list(columns)
    positive_values = list(positive_values)
    if len(columns) != len(positive_values):
        raise ValueError(
            f"binary_encode: got {len(columns)} columns but "
            f"{len(positive_values)} positive values"
        )

    df = df.copy()
    for column, positive_value in zip(columns, positive_values):
        # Vectorised comparison: NaN == value is False, so NULLs become 0.
        df[column] = (df[column] == positive_value).astype('int64')
    return df
    
def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with each listed column replaced by its rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : sequence of str
        Names of the columns to encode.
    orderings : sequence of list
        For each entry of ``columns``, the allowed values in rank order.
        A value is encoded as its (first) position in that list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose listed columns hold int64 positions.

    Raises
    ------
    ValueError
        If ``columns`` and ``orderings`` have different lengths, or if a
        column contains a value (including a missing value) that does not
        appear in its ordering. The message names the column and the values.
    """
    columns = list(columns)
    orderings = list(orderings)
    if len(columns) != len(orderings):
        raise ValueError(
            f"ordinal_encode: got {len(columns)} columns but "
            f"{len(orderings)} orderings"
        )

    df = df.copy()
    for column, ordering in zip(columns, orderings):
        # setdefault keeps the first position of a repeated value,
        # matching what list.index() would return.
        rank = {}
        for position, value in enumerate(ordering):
            rank.setdefault(value, position)

        encoded = df[column].map(rank)
        unmapped = df.loc[encoded.isna(), column]
        if len(unmapped) > 0:
            raise ValueError(
                f"ordinal_encode: column {column!r} contains values that are "
                f"not in its ordering {list(ordering)!r}: "
                f"{unmapped.unique().tolist()!r}"
            )
        df[column] = encoded.astype('int64')
    return df
    
def onehot_encode(df, columns, prefixes):
    """Return a copy of ``df`` with each listed column one-hot encoded.

    Each listed column is removed and replaced by indicator columns named
    ``<prefix>_<value>``, appended at the end of the frame in the order the
    columns are listed. With pandas' default ``dummy_na=False`` a missing
    value gets no indicator column of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : sequence of str
        Names of the columns to encode.
    prefixes : sequence of str
        For each entry of ``columns``, the prefix of its indicator columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by indicators.

    Raises
    ------
    ValueError
        If ``columns`` and ``prefixes`` have different lengths
        (``zip`` would otherwise silently ignore the extra entries).
    """
    columns = list(columns)
    prefixes = list(prefixes)
    if len(columns) != len(prefixes):
        raise ValueError(
            f"onehot_encode: got {len(columns)} columns but "
            f"{len(prefixes)} prefixes"
        )

    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        # Append the indicators first, then remove the source column.
        df = pd.concat([df, dummies], axis=1).drop(columns=column)
    return df

### Categorizing all the categorical features as Binary, Ordinal or Nominal, to make encoding and NULL handling easier.

In [21]:
# Binary columns, each paired with the value that is encoded as 1.
binary_features, positive_values = map(list, zip(
    ('surgery', 'yes'),
    ('age', 'adult'),
    ('surgical_lesion', 'yes'),
    ('cp_data', 'yes'),
))

# Ordinal columns, each paired with its values in rank order.
# Keeping feature and ordering side by side guarantees the two lists stay
# aligned: 'nasogastric_reflux' was previously missing from the feature list,
# which shifted its ordering onto 'rectal_exam_feces'.
# The reflux labels use the spelling found in the data ('liter', not 'litre').
ordinal_features, orderings = map(list, zip(
    ('temp_of_extremities', ['cold', 'cool', 'normal', 'warm']),
    ('peripheral_pulse', ['absent', 'reduced', 'normal', 'increased']),
    ('capillary_refill_time', ['less_3_sec', '3', 'more_3_sec']),
    ('pain', ['alert', 'depressed', 'mild_pain', 'severe_pain', 'extreme_pain']),
    ('peristalsis', ['absent', 'hypomotile', 'normal', 'hypermotile']),
    ('abdominal_distention', ['none', 'slight', 'moderate', 'severe']),
    ('nasogastric_tube', ['none', 'slight', 'significant']),
    ('nasogastric_reflux', ['none', 'less_1_liter', 'more_1_liter']),
    ('rectal_exam_feces', ['absent', 'decreased', 'normal', 'increased']),
))

# Nominal (unordered) columns, each paired with the prefix used for its
# one-hot indicator columns.
nominal_features, prefixes = map(list, zip(
    ('mucous_membrane', 'MM'),
    ('abdomen', 'AB'),
    ('abdomo_appearance', 'AA'),
))

### Handling NULL values in all columns other than nominal columns.

- For every categorical feature except the nominal ones, NULLs are replaced with the mode of the column.
- For every numeric feature, NULLs are replaced with the mean of the column.

In [22]:
# Fill missing values in `df`, one column at a time:
#   * non-object (numeric) columns -> column mean
#   * object (categorical) columns -> column mode, except the nominal
#     columns, which are deliberately left with their NULLs
# The loop runs over every column of `df`, so 'outcome' is included.
# NOTE(review): only `df` is modified here; copies taken from it earlier
# (e.g. X / y) are not updated — confirm later cells use `df` or re-split.
for column in df.columns:
    if column not in df.select_dtypes('object').columns:
        df[column] = df[column].fillna(df[column].mean())
    elif column not in nominal_features:
        df[column] = df[column].fillna(df[column].mode()[0])

In [24]:
# Count the NULLs left in each column; only the nominal columns should be non-zero.
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities        0
peripheral_pulse           0
mucous_membrane           47
capillary_refill_time      0
pain                       0
peristalsis                0
abdominal_distention       0
nasogastric_tube           0
nasogastric_reflux         0
nasogastric_reflux_ph      0
rectal_exam_feces          0
abdomen                  118
packed_cell_volume         0
total_protein              0
abdomo_appearance        165
abdomo_protein             0
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

#### We have now filled the NULL values in every column except the nominal columns.