In [83]:
"""
Created on Tue Jul 10 13:46:33 2018
https://www.kaggle.com/uciml/indian-liver-patient-records
@author: devp
"""
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['indian_liver_patient.csv']


In [84]:
filepath = '../input/indian_liver_patient.csv'

In [85]:
data = pd.read_csv(filepath)

In [86]:
data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


**We need to separate the target values from the rest of the table**

In [87]:
X = data.iloc[:,:-1].values
t = data.iloc[:,-1].values

Output variable (target)

**1** means **having liver disease**

**2** means **not having liver disease**

***We need to convert all 2's into zeroes for confusion-matrix calulations***

In [88]:
for u in range(len(t)):
    if t[u] == 2:
        t[u] = 0

**Gender column has entries as Male and Female.  For a mathematical model to learn, we have to encode these into numbers.**

In [89]:
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
X[:,1] = lbl.fit_transform(X[:,1])

In [90]:
data.isnull().any()

Age                           False
Gender                        False
Total_Bilirubin               False
Direct_Bilirubin              False
Alkaline_Phosphotase          False
Alamine_Aminotransferase      False
Aspartate_Aminotransferase    False
Total_Protiens                False
Albumin                       False
Albumin_and_Globulin_Ratio     True
Dataset                       False
dtype: bool

**Let's check how many entries have a NaN (Not a Number) or missing values**

In [91]:
data['Albumin_and_Globulin_Ratio'].isnull().sum()

4

In [92]:
missing_values_rows = data[data.isnull().any(axis=1)]
print(missing_values_rows)

     Age  Gender   ...     Albumin_and_Globulin_Ratio  Dataset
209   45  Female   ...                            NaN        1
241   51    Male   ...                            NaN        1
253   35  Female   ...                            NaN        0
312   27    Male   ...                            NaN        0

[4 rows x 11 columns]


**Fill the missing rows with values**

*Here we fill it by **median** of the values of that corresponding column*

In [93]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
X[:,9:10] = imp.fit_transform(X[:,9:10])



Let's partition our dataset into **training data** and **testing data**

Here, we keep 25% data as testing data.

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X,t,random_state=0,test_size=0.25)

**Feature Scaling**

**Standardisation** is applied to all rows of all columns **except the age and the gender column**.


In [95]:
from sklearn. preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,2:] = sc.fit_transform(X_train[:,2:])
X_test[:,2:] = sc.transform(X_test[:,2:])



Importing Model Evaluation metrics

In [96]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

**Traning and Predictions**

**Logistic Regression**

In [97]:
from sklearn.linear_model import LogisticRegression
# creating object of LogisticRegression class
classifier_logis = LogisticRegression(random_state=0)
# fitting the model/ training the model on training data (X_train,t_train)
classifier_logis.fit(X_train,t_train)
# predicting whether the points (people/rows) in the test set (X_test) have the liver disease or not
y_pred_logis = classifier_logis.predict(X_test)
# evaluating model performance by confusion-matrix
cm_logis = confusion_matrix(t_test,y_pred_logis)
print(cm_logis)
# accuracy-result of LogisticRegression model
accuracy_logis = accuracy_score(t_test,y_pred_logis)
print('The accuracy of LogisticRegression is : ', str(accuracy_logis*100) , '%')

[[ 9 37]
 [ 7 93]]
The accuracy of LogisticRegression is :  69.86301369863014 %


**Support Vector Machine - Classification**

In [98]:
from sklearn.svm import SVC
# creating object of SVC class
classifier_svc = SVC(kernel='rbf', random_state=0, gamma='auto')
# fitting the model/ training the model on training data (X_train,t_train)
classifier_svc.fit(X_train,t_train)
# predicting whether the points (people/rows) in the test set (X_test) have the liver disease or not
y_pred_svc = classifier_svc.predict(X_test)
# evaluating model performance by confusion-matrix
cm_svc = confusion_matrix(t_test,y_pred_svc)
print(cm_svc)
# accuracy-result of SVC model
accuracy_svc = accuracy_score(t_test,y_pred_svc)
print('The accuracy of SupportVectorClassification is : ', str(accuracy_svc*100) , '%')

[[ 4 42]
 [ 5 95]]
The accuracy of SupportVectorClassification is :  67.8082191780822 %


**Random Forest Classification**

In [99]:
from sklearn.ensemble import RandomForestClassifier
# creating object of RandomForestClassifier class
classifier_rfc = RandomForestClassifier(n_estimators=250, criterion='entropy',random_state=0 )
# fitting the model/ training the model on training data (X_train,t_train)
classifier_rfc.fit(X_train,t_train)
# predicting whether the points (people/rows) in the test set (X_test) have the liver disease or not
y_pred_rfc = classifier_rfc.predict(X_test)
# evaluating model performance by confusion-matrix
cm_rfc = confusion_matrix(t_test,y_pred_rfc)
print(cm_rfc)
# accuracy-result of RandomForestClassifier model
accuracy_rfc = accuracy_score(t_test,y_pred_rfc)
print('The accuracy of RandomForestClassifier is : ', str(accuracy_rfc*100) , '%')

[[14 32]
 [11 89]]
The accuracy of RandomForestClassifier is :  70.54794520547945 %
