In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('horse.csv')

In [3]:
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [4]:
horse_df=df.drop(['outcome'],axis=1)

In [5]:
target=df['outcome']

In [6]:
horse_df.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3',
       'cp_data'],
      dtype='object')

In [7]:
horse_df[horse_df.duplicated()]

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data


In [8]:
horse_df.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [9]:
horse_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [10]:
horse_df.shape

(299, 27)

In [11]:
horse_df.drop(['hospital_number'],axis=1,inplace=True)

In [12]:
horse_df_numerical_features = horse_df.select_dtypes(include='number')
horse_df_categorical_features = horse_df.select_dtypes(include='object')

In [13]:
horse_df_numerical_features.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
0,38.5,66.0,28.0,,45.0,8.4,,11300,0,0
1,39.2,88.0,20.0,,50.0,85.0,2.0,2208,0,0
2,38.3,40.0,24.0,,33.0,6.7,,0,0,0
3,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,0
4,37.3,104.0,35.0,,74.0,7.4,,4300,0,0


In [14]:
horse_df_numerical_features.shape

(299, 10)

In [15]:
horse_df_numerical_features.columns

Index(['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
       'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
       'lesion_2', 'lesion_3'],
      dtype='object')

In [16]:
#let as impute numerical features with median value
from sklearn.impute import SimpleImputer
imp_median= SimpleImputer(missing_values=np.nan, strategy='median')

In [17]:
num_df=pd.DataFrame(imp_median.fit_transform(horse_df_numerical_features))
num_df.columns=horse_df_numerical_features.columns

In [18]:
num_df.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
0,38.5,66.0,28.0,5.0,45.0,8.4,2.3,11300.0,0.0,0.0
1,39.2,88.0,20.0,5.0,50.0,85.0,2.0,2208.0,0.0,0.0
2,38.3,40.0,24.0,5.0,33.0,6.7,2.3,0.0,0.0,0.0
3,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208.0,0.0,0.0
4,37.3,104.0,35.0,5.0,74.0,7.4,2.3,4300.0,0.0,0.0


In [19]:
horse_df_categorical_features.head()

Unnamed: 0,surgery,age,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data
0,no,adult,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,decreased,distend_large,,no,no
1,yes,adult,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,absent,other,cloudy,no,no
2,no,adult,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,normal,normal,,no,yes
3,yes,young,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,decreased,,serosanguious,yes,yes
4,no,adult,,,dark_cyanotic,more_3_sec,,,,,,,,,no,no


In [20]:
#let as impute categorical features with mode
imp_mode= SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [21]:
cat_df=pd.DataFrame(imp_mode.fit_transform(horse_df_categorical_features))
cat_df.columns=horse_df_categorical_features.columns

In [22]:
cat_df.head()

Unnamed: 0,surgery,age,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data
0,no,adult,cool,reduced,normal_pink,more_3_sec,extreme_pain,absent,severe,slight,none,decreased,distend_large,cloudy,no,no
1,yes,adult,cool,normal,pale_cyanotic,less_3_sec,mild_pain,absent,slight,slight,none,absent,other,cloudy,no,no
2,no,adult,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,slight,none,normal,normal,cloudy,no,yes
3,yes,young,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,decreased,distend_large,serosanguious,yes,yes
4,no,adult,cool,normal,dark_cyanotic,more_3_sec,mild_pain,hypomotile,none,slight,none,absent,distend_large,cloudy,no,no


In [23]:
def detect_outliers_iqr(data):
    outlier_list = []
    data = sorted(data)
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    #print("The Val of Q1 and Q2",q1, q3)
    IQR = q3-q1
    lwr_bound = q1-(1.5*IQR)
    upr_bound = q3+(1.5*IQR)
    #print("The lower & Upper Bound",lwr_bound, upr_bound)
    
    for i in data: 
        if (i<lwr_bound or i>upr_bound):
            outlier_list.append(i)
    return outlier_list # Driver code



for i in ['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
       'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
       'lesion_2', 'lesion_3']:
    outliers = detect_outliers_iqr(num_df[i])
    print("Outliers in",i,"attribute :", outliers)

Outliers in rectal_temp attribute : [35.4, 36.0, 36.1, 36.4, 36.5, 36.5, 36.6, 36.8, 36.9, 39.5, 39.5, 39.5, 39.5, 39.6, 39.7, 39.9, 40.0, 40.3, 40.3, 40.8]
Outliers in pulse attribute : [150.0, 150.0, 160.0, 164.0, 184.0]
Outliers in respiratory_rate attribute : [58.0, 60.0, 60.0, 60.0, 60.0, 66.0, 68.0, 68.0, 68.0, 70.0, 70.0, 72.0, 80.0, 80.0, 80.0, 84.0, 84.0, 88.0, 90.0, 90.0, 96.0, 96.0]
Outliers in nasogastric_reflux_ph attribute : [1.0, 1.0, 1.5, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.0, 4.0, 4.3, 4.4, 4.5, 4.5, 4.5, 5.3, 5.4, 5.5, 5.5, 5.5, 5.5, 5.7, 6.0, 6.2, 6.5, 6.5, 6.5, 6.5, 6.5, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.2, 7.5, 7.5]
Outliers in packed_cell_volume attribute : [67.0, 68.0, 68.0, 68.0, 68.0, 69.0, 69.0, 70.0, 71.0, 72.0, 73.0, 73.0, 74.0, 75.0, 75.0]
Outliers in total_protein attribute : []
Outliers in abdomo_protein attribute : [0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.3, 1.4, 1

In [24]:
def handle_outliers(data):

    tenth_percentile = np.percentile(data, 10)
    ninetieth_percentile = np.percentile(data, 90)

    b = np.where(data<tenth_percentile, tenth_percentile, data)

    b1 = np.where(b>ninetieth_percentile, ninetieth_percentile, b)
    return b1
    
    
for i in ['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
       'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
       'lesion_2', 'lesion_3']:
    num_df[i]=handle_outliers(num_df[i])


In [25]:
#feature scaling on numerical columns
from sklearn.preprocessing import StandardScaler
# -infinity to infinity
# z = (X - mean)/std deviation

scale = StandardScaler()
num_df[['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
       'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
       'lesion_2', 'lesion_3']]=scale.fit_transform(num_df[['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
       'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
       'lesion_2', 'lesion_3']])


In [26]:
cat_df=pd.get_dummies(cat_df)

In [27]:
cat_df.shape

(299, 56)

In [28]:
cat_df.head()

Unnamed: 0,surgery_no,surgery_yes,age_adult,age_young,temp_of_extremities_cold,temp_of_extremities_cool,temp_of_extremities_normal,temp_of_extremities_warm,peripheral_pulse_absent,peripheral_pulse_increased,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,0,1,1,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
2,1,0,1,0,0,0,1,0,0,0,...,0,1,0,0,1,0,1,0,0,1
3,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [29]:
new_horse_df=pd.concat([num_df,cat_df],axis=1)

In [30]:
new_horse_df.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,0.767667,-0.164801,0.031693,0.0,-0.091058,-0.538908,-0.279938,2.054163,0.0,0.0,...,0,0,0,0,1,0,1,0,1,0
1,1.706451,0.79881,-0.75799,0.0,0.540647,1.833083,-0.925449,-0.297853,0.0,0.0,...,0,0,1,0,1,0,1,0,1,0
2,0.298276,-1.216014,-0.363149,0.0,-1.354468,-0.60772,-0.279938,-1.357052,0.0,0.0,...,0,1,0,0,1,0,1,0,0,1
3,1.706451,1.850022,2.0059,0.0,0.287965,-0.587481,2.560314,-0.297853,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
4,-1.62623,1.499618,0.722666,0.0,1.804057,-0.579386,-0.279938,0.705699,0.0,0.0,...,0,0,0,0,1,0,1,0,1,0


In [31]:
new_horse_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 66 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   rectal_temp                       299 non-null    float64
 1   pulse                             299 non-null    float64
 2   respiratory_rate                  299 non-null    float64
 3   nasogastric_reflux_ph             299 non-null    float64
 4   packed_cell_volume                299 non-null    float64
 5   total_protein                     299 non-null    float64
 6   abdomo_protein                    299 non-null    float64
 7   lesion_1                          299 non-null    float64
 8   lesion_2                          299 non-null    float64
 9   lesion_3                          299 non-null    float64
 10  surgery_no                        299 non-null    uint8  
 11  surgery_yes                       299 non-null    uint8  
 12  age_adul

In [32]:
target.shape

(299,)

In [33]:
target.value_counts()

lived         178
died           77
euthanized     44
Name: outcome, dtype: int64

In [34]:
#apply smote
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
new_horse_df,target= oversample.fit_resample(new_horse_df,target)

In [45]:
target.value_counts()

died          178
euthanized    178
lived         178
Name: outcome, dtype: int64

In [35]:
#train test split
from sklearn.model_selection import train_test_split


In [36]:
X_train, X_test, y_train, y_test = train_test_split(new_horse_df,target, test_size=0.20, random_state=42)

In [37]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(427, 66)
(427,)
(107, 66)
(107,)


In [38]:
target.value_counts()

died          178
euthanized    178
lived         178
Name: outcome, dtype: int64

In [39]:
#buildimg machine learning pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

In [40]:
model_pipeline=[]
model_pipeline.append(LogisticRegression())
model_pipeline.append(DecisionTreeClassifier())
model_pipeline.append(RandomForestClassifier())
model_pipeline.append(SVC())
model_pipeline.append(KNeighborsClassifier())
model_pipeline.append(GaussianNB())
model_pipeline.append(BernoulliNB())

In [41]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_curve,auc

In [42]:
model_list=['Logistic Regression','Decision Tree','Random Forest','SVM','KNN','GuassianNB','BernoulliNB']
test_acc_list=[]
train_acc_list=[]

cm_list=[]


for model in model_pipeline:
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    y_train_pred=model.predict(X_train)
    test_acc_list.append(accuracy_score(y_test,y_pred))
    train_acc_list.append(accuracy_score(y_train,y_train_pred))
    
    
    cm_list.append(confusion_matrix(y_test,y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
result_df=pd.DataFrame({'model':model_list,'test_accuracy':test_acc_list,'train_accuracy':train_acc_list})

In [44]:
result_df

Unnamed: 0,model,test_accuracy,train_accuracy
0,Logistic Regression,0.785047,0.868852
1,Decision Tree,0.813084,1.0
2,Random Forest,0.859813,1.0
3,SVM,0.831776,0.946136
4,KNN,0.785047,0.875878
5,GuassianNB,0.598131,0.681499
6,BernoulliNB,0.747664,0.770492
