# Indian Liver Patients

Examine the data of patients living in North East of Andhra Pradesh, India with and without liver disease. The dataset is from the UCI Machine Learning Repository.

In [202]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import functions to compute accuarcy and split data
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

# Import models, including VotingClassifier meta-model
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

# Set seed for reproducibility 
SEED = 1

## Exploratory Data Analysis

Get to know what is included in the dataset

In [203]:
df = pd.read_csv('Indian_Liver_Patient_Dataset.csv')
df.head()

Unnamed: 0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.3,1


We see here that the attribute names are not in the dataset, so we need to add them. They are listed on the UCI page:

Attribute Information:

1. Age of the patient
2. Gender of the patient
3. TB (Total Bilirubin)
4. DB (Direct Bilirubin)
5. Alkphos Alkaline Phosphotase
6. Sgpt Alamine Aminotransferase
7. Sgot Aspartate Aminotransferase
8. TP (Total Protiens)
9. ALB (Albumin)
10. A/G (Ratio Albumin and Globulin Ratio)
11. Selector field used to split the data into two sets (labeled by the experts)

In [204]:
df.columns = ['Age','Gender','TB','DB','AAP','Sgpt Alamine Aminotransferase','Sgot Aspartate Aminotransferase','TP','ALB','A_G','Selector']

In [205]:
df.head()

Unnamed: 0,Age,Gender,TB,DB,AAP,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,TP,ALB,A_G,Selector
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.3,1


In [206]:
df = pd.read_csv('Indian_Liver_Patient_Dataset.csv',names=['Age','Gender','TB','DB','AAP','SGPT','SGOT','TP','ALB','A_G','Liver_Disease'])
#df.reset_index(df, level=None, drop=False, inplace=False, col_level=0, col_fill='')

# always remember to do the df =, becuase otherwise you are not saving it 
df = df.rename_axis('Patient_ID').reset_index()
#df.reset_index().set_index('Unnamed', drop=False)

df.head()

Unnamed: 0,Patient_ID,Age,Gender,TB,DB,AAP,SGPT,SGOT,TP,ALB,A_G,Liver_Disease
0,0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


Okay, good, have the correct first row.

The selctor column is whether the patient has liver disease. The 2 refers to no, or False, so will be changed to a 0.

To keep track of the patients, will be assinging the index number as an ID.

In [207]:
df.shape

(583, 12)

In [208]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 12 columns):
Patient_ID       583 non-null int64
Age              583 non-null int64
Gender           583 non-null object
TB               583 non-null float64
DB               583 non-null float64
AAP              583 non-null int64
SGPT             583 non-null int64
SGOT             583 non-null int64
TP               583 non-null float64
ALB              583 non-null float64
A_G              579 non-null float64
Liver_Disease    583 non-null int64
dtypes: float64(5), int64(6), object(1)
memory usage: 54.7+ KB


# Need to preprocess the data before anything can be done with it.

This means that the null values need to be taken care of. There are only 4 (583 vs 579), so it is probably okay to just drop the rows that have them.

The Gender column needs to be transformed into a number.

The other columns have numbers that are all at different magnitudes, therefore they will need to be standardized. Standardization is chosen so that we can compare the features with different scales and units.

In [209]:
df = df.dropna()

In [210]:
from sklearn.preprocessing import LabelEncoder

gender_encoder = LabelEncoder()
gender_categories = df['Gender'].unique()

print('Gender categories are: \n',gender_categories,'\n')

# A subtlety here is that the encoder is being fit first to these gender cats, then it can transform the gender column
encoded_gender_cats = gender_encoder.fit_transform(gender_categories)

df['is_male'] = gender_encoder.transform(df['Gender'])
#df['Gender'] = gender_encoder.transform(df['Gender'])

# duh, need to reassign it to the column, instead of just putting it there
df['Liver_Disease'] = df['Liver_Disease'].replace(to_replace = 2,value=0)

df.head(10)

Gender categories are: 
 ['Female' 'Male'] 



Unnamed: 0,Patient_ID,Age,Gender,TB,DB,AAP,SGPT,SGOT,TP,ALB,A_G,Liver_Disease,is_male
0,0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1,0
1,1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1,1
2,2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1,1
3,3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1,1
4,4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1,1
5,5,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.3,1,1
6,6,26,Female,0.9,0.2,154,16,12,7.0,3.5,1.0,1,0
7,7,29,Female,0.9,0.3,202,14,11,6.7,3.6,1.1,1,0
8,8,17,Male,0.9,0.3,202,22,19,7.4,4.1,1.2,0,1
9,9,55,Male,0.7,0.2,290,53,58,6.8,3.4,1.0,1,1


In [211]:
df = df.drop('Gender',axis=1)

df.head(20)

Unnamed: 0,Patient_ID,Age,TB,DB,AAP,SGPT,SGOT,TP,ALB,A_G,Liver_Disease,is_male
0,0,65,0.7,0.1,187,16,18,6.8,3.3,0.9,1,0
1,1,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1,1
2,2,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1,1
3,3,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1,1
4,4,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1,1
5,5,46,1.8,0.7,208,19,14,7.6,4.4,1.3,1,1
6,6,26,0.9,0.2,154,16,12,7.0,3.5,1.0,1,0
7,7,29,0.9,0.3,202,14,11,6.7,3.6,1.1,1,0
8,8,17,0.9,0.3,202,22,19,7.4,4.1,1.2,0,1
9,9,55,0.7,0.2,290,53,58,6.8,3.4,1.0,1,1


Time to standardize the columns so we can actually look at some modeling.

In [212]:
features = np.array(df.columns)
print(features)
features = np.delete(features,10)
print(features)

X = df[features]
print('The X data frame looks like this: \n', X.shape,'\n',X.head())

y = df['Liver_Disease']
print('The y data looks like this: \n',y.shape,'\n',y.head())


['Patient_ID' 'Age' 'TB' 'DB' 'AAP' 'SGPT' 'SGOT' 'TP' 'ALB' 'A_G'
 'Liver_Disease' 'is_male']
['Patient_ID' 'Age' 'TB' 'DB' 'AAP' 'SGPT' 'SGOT' 'TP' 'ALB' 'A_G'
 'is_male']
The X data frame looks like this: 
 (579, 11) 
    Patient_ID  Age    TB   DB  AAP  SGPT  SGOT   TP  ALB   A_G  is_male
0           0   65   0.7  0.1  187    16    18  6.8  3.3  0.90        0
1           1   62  10.9  5.5  699    64   100  7.5  3.2  0.74        1
2           2   62   7.3  4.1  490    60    68  7.0  3.3  0.89        1
3           3   58   1.0  0.4  182    14    20  6.8  3.4  1.00        1
4           4   72   3.9  2.0  195    27    59  7.3  2.4  0.40        1
The y data looks like this: 
 (579,) 
 0    1
1    1
2    1
3    1
4    1
Name: Liver_Disease, dtype: int64


In [213]:

standard_features = np.delete(features,0)
standard_features = np.delete(standard_features,9)

print(standard_features)

['Age' 'TB' 'DB' 'AAP' 'SGPT' 'SGOT' 'TP' 'ALB' 'A_G']


In [214]:
# Standardizing the X data, hopefully this does what we want re: columns

standardize_X = X
print(standardize_X.head())

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
standardize_X[standard_features] = scaler.fit_transform(standardize_X[standard_features])
#standardize_X = pd.DataFrame(standardize_X,columns=features)

# don't want Patient ID or Liver_Disease to be standardized, so reassign those to original values
#standardize_X[['Patient_ID','is_male']] = X[['Patient_ID','is_male']]
#something is happening with 4 values

#standardize_X['Patient_ID'] = X['Patient_ID']
#standardize_X['is_male'] = X['is_male']
standardize_X.head(20)

   Patient_ID  Age    TB   DB  AAP  SGPT  SGOT   TP  ALB   A_G  is_male
0           0   65   0.7  0.1  187    16    18  6.8  3.3  0.90        0
1           1   62  10.9  5.5  699    64   100  7.5  3.2  0.74        1
2           2   62   7.3  4.1  490    60    68  7.0  3.3  0.89        1
3           3   58   1.0  0.4  182    14    20  6.8  3.4  1.00        1
4           4   72   3.9  2.0  195    27    59  7.3  2.4  0.40        1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Patient_ID,Age,TB,DB,AAP,SGPT,SGOT,TP,ALB,A_G,is_male
0,0,1.247403,-0.42032,-0.495414,-0.42887,-0.355832,-0.319111,0.293722,0.203446,-0.14739,0
1,1,1.062306,1.218936,1.423518,1.675083,-0.093573,-0.035962,0.939655,0.077462,-0.648461,1
2,2,1.062306,0.640375,0.926017,0.816243,-0.115428,-0.146459,0.478274,0.203446,-0.178707,1
3,3,0.815511,-0.372106,-0.388807,-0.449416,-0.36676,-0.312205,0.293722,0.329431,0.16578,1
4,4,1.679294,0.093956,0.179766,-0.395996,-0.295731,-0.177537,0.755102,-0.930414,-1.713237,1
5,5,0.075125,-0.243537,-0.282199,-0.342575,-0.339441,-0.332923,1.031931,1.589276,1.105288,1
6,6,-1.15885,-0.388178,-0.459878,-0.564476,-0.355832,-0.33983,0.478274,0.455416,0.16578,0
7,7,-0.973754,-0.388178,-0.424343,-0.367231,-0.36676,-0.343283,0.201446,0.5814,0.478949,0
8,8,-1.71414,-0.388178,-0.424343,-0.367231,-0.32305,-0.315658,0.847378,1.211323,0.792118,1
9,9,0.630415,-0.42032,-0.459878,-0.005614,-0.153674,-0.18099,0.293722,0.329431,0.16578,1


In [215]:
print(standardize_X.info())
print(X.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 582
Data columns (total 11 columns):
Patient_ID    579 non-null int64
Age           579 non-null float64
TB            579 non-null float64
DB            579 non-null float64
AAP           579 non-null float64
SGPT          579 non-null float64
SGOT          579 non-null float64
TP            579 non-null float64
ALB           579 non-null float64
A_G           579 non-null float64
is_male       579 non-null int64
dtypes: float64(9), int64(2)
memory usage: 54.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 582
Data columns (total 11 columns):
Patient_ID    579 non-null int64
Age           579 non-null float64
TB            579 non-null float64
DB            579 non-null float64
AAP           579 non-null float64
SGPT          579 non-null float64
SGOT          579 non-null float64
TP            579 non-null float64
ALB           579 non-null float64
A_G           579 non-null float64
is_ma

In [216]:
# make sure to use the standarized data

# splitting the data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(standardize_X,y,test_size=0.3,random_state=SEED)

In [217]:
from sklearn.tree import DecisionTreeClassifier
# Instantiate Logistic Regression
lr = LogisticRegression(random_state=SEED)

# Instantiate knn
knn = KNN(n_neighbors=27)

# Instantiate dt (each leaf contains at least 13% of the data)
dt = DecisionTreeClassifier(min_samples_leaf=0.13,random_state=SEED)

# Define list of classifiers
classifiers = [('Logistic Regression',lr),('K Nearest Neighbors',knn),('Classification Tree',dt)]

In [218]:
# Iterate over the classifiers
for clf_name, clf in classifiers:
    
    # fit clf to training set
    clf.fit(X_train,y_train)
    
    # predict y_pred
    y_pred = clf.predict(X_test)
    
    # compute accuracy
    accuracy = accuracy_score(y_test,y_pred)
    
    # Evaluate clf's accuracy
    print('{:s}:{:.3f}'.format(clf_name,accuracy))

Logistic Regression:0.747
K Nearest Neighbors:0.724
Classification Tree:0.730




We see that the accuarcy of the three different classfiers is slightly different, ranging from 0.747 to 0.724. Let's see if we can do better by using a **Voting Classifier**. The Voting Classifier takes outputs of models defined in classifiers list and assigns labels based on a majority vote.

In [219]:
# Now to put it all together using a Voting Classifier

vc = VotingClassifier(estimators=classifiers)
 
# fit the data
vc.fit(X_train,y_train)

# evaluate test predictions
y_pred = vc.predict(X_test)

# calculate accuaracy score
accuracy = accuracy_score(y_test,y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

Voting Classifier: 0.753




The Voting Classifier has increased the accuracy from the three classifiers to 75.3%. This is larger than the value achieved by Logistic Regression at 74.7%.

## Now we will try Bagging on this dataset

In [220]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)

Let's re-split the data into 80% train and 20% test

In [221]:
X_train, X_test, y_train, y_test = train_test_split(standardize_X,y,test_size=0.2,random_state=SEED)

In [222]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463 entries, 445 to 37
Data columns (total 11 columns):
Patient_ID    463 non-null int64
Age           463 non-null float64
TB            463 non-null float64
DB            463 non-null float64
AAP           463 non-null float64
SGPT          463 non-null float64
SGOT          463 non-null float64
TP            463 non-null float64
ALB           463 non-null float64
A_G           463 non-null float64
is_male       463 non-null int64
dtypes: float64(9), int64(2)
memory usage: 43.4 KB


In [223]:
# Fit bc to the training set
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate acc_test
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test)) 

Test set accuracy of bc: 0.73


This is the same value we got above when we trained with one decision tree...hmmm. I thought this was supposed to be different? The previous tree had more parameters in it, maybe that is why? 

Yes, I do believe it is due to the min_samples_leaf parameter.

Let's look at the OOB (the training data that is never looked at) score of this

In [224]:
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50,oob_score=True,random_state=1)

In [225]:
# Fit bc to the training set 
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate test set accuracy
acc_test = accuracy_score(y_test, y_pred)

# Evaluate OOB accuracy
acc_oob = bc.oob_score_

# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))

Test set accuracy: 0.681, OOB accuracy: 0.674


These are roughly the same value. This shows the usefulness of using the OOB score since it does not need to be cross-validated.

## Okay, now let's see what adding a booster to our decision tree does

In [226]:
# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Instantiate dt
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)

In [227]:
# Fit ada to the training set
ada.fit(X_train,y_train)

# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:,1]

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

ROC AUC score: 0.64


Tune Hyperparameters of decision tree (should go above probably)

In [228]:
# Define params_dt
params_dt = {'max_depth':[2,3,4],
    'min_samples_leaf':[0.12,0.14,0.16,0.18]
}
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Instantiate grid_dt
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)

grid_dt.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=1,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]},
             pre_dispat

In [229]:
# Import roc_auc_score from sklearn.metrics
from sklearn.metrics import roc_auc_score

# Extract the best estimator
best_model = grid_dt.best_estimator_

# Predict the test set probabilities of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:,1]

# Compute test_roc_auc
test_roc_auc = roc_auc_score(y_test,y_pred_proba)

# Print test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))

Test set ROC AUC score: 0.731


In [230]:
print(best_model)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.14, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')
