In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder #The features are converted to ordinal integers
from sklearn.model_selection import cross_val_score #Evaluation of score by cross validation
from sklearn.model_selection import KFold #KFold divides all the samples in  k groups of samples, called folds ,of equal sizes (if possible). The prediction function is learned using  folds, and the fold left out is used for test.
from sklearn.preprocessing import StandardScaler #Standardize features by removing the mean and scaling to unit variance
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression #Logistic regression is a statistical analysis method to predict a binary outcome, such as yes or no, based on prior observations of a data set.

df=pd.read_csv("horse.csv")
df

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,55.0,65.0,,,euthanized,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,44.0,,serosanguious,3.3,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,60.0,6.8,,,died,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,50.0,6.0,serosanguious,3.4,lived,yes,2208,0,0,yes


In [3]:
df.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object')

In [4]:
#Find the null count in each feature
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [5]:
#Dropped the features with highest number of null values and then for the remaining took out the statistics to see if any of those null values could be replaced but through the information about parameter and statistics found the prediction through such filled null values would be misleading hence removed those features and related features as well
df = df.drop(['pulse','respiratory_rate','lesion_1','lesion_2','lesion_3','hospital_number','nasogastric_reflux_ph','nasogastric_tube','nasogastric_reflux','abdomen','abdomo_protein','abdomo_appearance','rectal_exam_feces','cp_data'], axis=1)
df.columns

Index(['surgery', 'age', 'rectal_temp', 'temp_of_extremities',
       'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain',
       'peristalsis', 'abdominal_distention', 'packed_cell_volume',
       'total_protein', 'outcome', 'surgical_lesion'],
      dtype='object')

In [6]:
df.isnull().sum()

surgery                   0
age                       0
rectal_temp              60
temp_of_extremities      56
peripheral_pulse         69
mucous_membrane          47
capillary_refill_time    32
pain                     55
peristalsis              44
abdominal_distention     56
packed_cell_volume       29
total_protein            33
outcome                   0
surgical_lesion           0
dtype: int64

In [7]:
df.dtypes

surgery                   object
age                       object
rectal_temp              float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
packed_cell_volume       float64
total_protein            float64
outcome                   object
surgical_lesion           object
dtype: object

In [8]:
#Finding categorical data in each column
df.nunique()

surgery                   2
age                       2
rectal_temp              40
temp_of_extremities       4
peripheral_pulse          4
mucous_membrane           6
capillary_refill_time     3
pain                      5
peristalsis               4
abdominal_distention      4
packed_cell_volume       50
total_protein            80
outcome                   3
surgical_lesion           2
dtype: int64

In [9]:
df

Unnamed: 0,surgery,age,rectal_temp,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,packed_cell_volume,total_protein,outcome,surgical_lesion
0,no,adult,38.5,cool,reduced,,more_3_sec,extreme_pain,absent,severe,45.0,8.4,died,no
1,yes,adult,39.2,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,50.0,85.0,euthanized,no
2,no,adult,38.3,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,33.0,6.7,lived,no
3,yes,young,39.1,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,48.0,7.2,died,yes
4,no,adult,37.3,,,dark_cyanotic,more_3_sec,,,,74.0,7.4,died,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,,cold,,pale_cyanotic,more_3_sec,depressed,absent,,55.0,65.0,euthanized,no
295,no,adult,37.2,cool,increased,pale_cyanotic,more_3_sec,severe_pain,hypomotile,moderate,44.0,,euthanized,yes
296,yes,adult,37.5,cold,reduced,pale_cyanotic,less_3_sec,severe_pain,absent,moderate,60.0,6.8,died,yes
297,yes,adult,36.5,cool,reduced,pale_pink,less_3_sec,mild_pain,hypomotile,moderate,50.0,6.0,lived,yes


In [10]:
#Statistical data about the parameters that have float data type
df.describe()

Unnamed: 0,rectal_temp,packed_cell_volume,total_protein
count,239.0,270.0,266.0
mean,38.168619,46.307407,24.274436
std,0.733744,10.436743,27.364194
min,35.4,23.0,3.3
25%,37.8,38.0,6.5
50%,38.2,45.0,7.5
75%,38.5,52.0,56.75
max,40.8,75.0,89.0


In [11]:
df.columns

Index(['surgery', 'age', 'rectal_temp', 'temp_of_extremities',
       'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain',
       'peristalsis', 'abdominal_distention', 'packed_cell_volume',
       'total_protein', 'outcome', 'surgical_lesion'],
      dtype='object')

In [12]:
df.isnull().sum()

surgery                   0
age                       0
rectal_temp              60
temp_of_extremities      56
peripheral_pulse         69
mucous_membrane          47
capillary_refill_time    32
pain                     55
peristalsis              44
abdominal_distention     56
packed_cell_volume       29
total_protein            33
outcome                   0
surgical_lesion           0
dtype: int64

In [13]:
df.dtypes

surgery                   object
age                       object
rectal_temp              float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
packed_cell_volume       float64
total_protein            float64
outcome                   object
surgical_lesion           object
dtype: object

In [14]:
#Filling the null values with mean values since we have continuos data
df['rectal_temp'].fillna(value=38.2, inplace=True)
df['packed_cell_volume'].fillna(value=45, inplace=True)
df['total_protein'].fillna(value=7.5, inplace=True)

In [15]:
df.isnull().sum()

surgery                   0
age                       0
rectal_temp               0
temp_of_extremities      56
peripheral_pulse         69
mucous_membrane          47
capillary_refill_time    32
pain                     55
peristalsis              44
abdominal_distention     56
packed_cell_volume        0
total_protein             0
outcome                   0
surgical_lesion           0
dtype: int64

In [16]:
#We earlier found the unique categories present in some columns but the data type was object so here after identifying categories present in each feature we encoded it with a float value to change object datatype to float datatype
def enco(df):
    
    df['temp_of_extremities'] = df['temp_of_extremities'].map({'normal':2,'warm':1,'cool':4,'cold':5,'NaN':0}).astype('float64')
    df['temp_of_extremities']=df['temp_of_extremities'].replace(np.nan, 3)

    df['peripheral_pulse'] = df['peripheral_pulse'].map({'increased':1,'normal':2,'reduced':4,'absent':5,'NaN':0}).astype('float64')
    df['peripheral_pulse']=df['peripheral_pulse'].replace(np.nan, 3)

    df['mucous_membrane'] = df['mucous_membrane'].map({'dark_cyanotic':7,'bright_red':6,'pale_cyanotic':5,'pale_pink':3,'bright_pink':2,'normal_pink':1,'NaN':0}).astype('float64')
    df['mucous_membrane']=df['mucous_membrane'].replace(np.nan, 4)

    df['capillary_refill_time'] = df['capillary_refill_time'].map({'more_3_sec':3, '3':2, 'less_3_sec':1,'NaN':0}).astype('float64')
    df['capillary_refill_time']=df['capillary_refill_time'].replace(np.nan, 2)

    df['pain'] = df['pain'].map({'extreme_pain':6, 'severe_pain':5, 'mild_pain':4, 'depressed':2, 'alert':1,'NaN':0}).astype('float64')
    df['pain']=df['pain'].replace(np.nan, 3)

    df['peristalsis'] = df['peristalsis'].map({'absent':5, 'hypomotile':4, 'normal':2, 'hypermotile':1,'NaN':0}).astype('float64')
    df['peristalsis'] = df['peristalsis'].replace(np.nan,3)

    df['abdominal_distention'] = df['abdominal_distention'].map({'severe':5,'moderate':4,'slight':2,'none':1,'NaN':0}).astype('float64')
    df['abdominal_distention']=df['abdominal_distention'].replace(np.nan, 3)

    df['surgical_lesion']=df['surgical_lesion'].map({'yes':1,'no':0}).astype('float64') 

    df['surgery'] = df['surgery'].map({'yes':1,'no':2,'NaN':0}).astype('float64')

    df['age'] = df['age'].map({'adult':1,'young':2,'NaN':0}).astype('float64')
    
    df['outcome'] = df['outcome'].map({'euthanized':3, 'died':2, 'lived':1}).astype('float64')

    return df

In [17]:
enco(df)

Unnamed: 0,surgery,age,rectal_temp,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,packed_cell_volume,total_protein,outcome,surgical_lesion
0,2.0,1.0,38.5,4.0,4.0,4.0,3.0,6.0,5.0,5.0,45.0,8.4,2.0,0.0
1,1.0,1.0,39.2,3.0,3.0,5.0,1.0,4.0,5.0,2.0,50.0,85.0,3.0,0.0
2,2.0,1.0,38.3,2.0,2.0,3.0,1.0,4.0,4.0,1.0,33.0,6.7,1.0,0.0
3,1.0,2.0,39.1,5.0,2.0,7.0,3.0,2.0,5.0,5.0,48.0,7.2,2.0,1.0
4,2.0,1.0,37.3,3.0,3.0,7.0,3.0,3.0,3.0,3.0,74.0,7.4,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1.0,1.0,38.2,5.0,3.0,5.0,3.0,2.0,5.0,3.0,55.0,65.0,3.0,0.0
295,2.0,1.0,37.2,4.0,1.0,5.0,3.0,5.0,4.0,4.0,44.0,7.5,3.0,1.0
296,1.0,1.0,37.5,5.0,4.0,5.0,1.0,5.0,5.0,4.0,60.0,6.8,2.0,1.0
297,1.0,1.0,36.5,4.0,4.0,3.0,1.0,4.0,4.0,4.0,50.0,6.0,1.0,1.0


In [18]:
#All the columns that represent each of the parameters responsible for the horse's health are our input whereas the "outcome" is our target
Selected_features = ['surgery', 'age', 'rectal_temp', 'temp_of_extremities',
       'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain',
       'peristalsis', 'abdominal_distention', 'packed_cell_volume',
       'total_protein', 'outcome', 'surgical_lesion']
X = df[Selected_features]
y = df['outcome']
#We have randomly split the dataset in training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [19]:
df.dtypes

surgery                  float64
age                      float64
rectal_temp              float64
temp_of_extremities      float64
peripheral_pulse         float64
mucous_membrane          float64
capillary_refill_time    float64
pain                     float64
peristalsis              float64
abdominal_distention     float64
packed_cell_volume       float64
total_protein            float64
outcome                  float64
surgical_lesion          float64
dtype: object

In [20]:
#Implemented the LogisticRegression Model on this dataset
classifier = LogisticRegression(random_state=0) 
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test) 
confusion_matrix = confusion_matrix(y_test, y_pred)

#Checking the accuracy parameters of this put model
print(confusion_matrix)

print('Score: {:.2f}'.format(classifier.score(X_test,y_test)))

[[40  1  0]
 [ 0 18  0]
 [ 0  1 15]]
Score: 0.97


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Through this experiment we created a logistic regression model that could predict if a horse would make it or not depending on his various parameter readings .
# We studied the various features , their relevance and completed EDA
#We are able to achieve 97% accuracy through this model.