In [73]:
#Import Modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as pyplot
import seaborn as sns

SEED = 200

In [22]:
#Read data into pandas DataFrame
df = pd.read_csv('cirrhosis.csv')
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [23]:
#Explore data and see null values / data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Status         418 non-null    object 
 3   Drug           312 non-null    object 
 4   Age            418 non-null    int64  
 5   Sex            418 non-null    object 
 6   Ascites        312 non-null    object 
 7   Hepatomegaly   312 non-null    object 
 8   Spiders        312 non-null    object 
 9   Edema          418 non-null    object 
 10  Bilirubin      418 non-null    float64
 11  Cholesterol    284 non-null    float64
 12  Albumin        418 non-null    float64
 13  Copper         310 non-null    float64
 14  Alk_Phos       312 non-null    float64
 15  SGOT           312 non-null    float64
 16  Tryglicerides  282 non-null    float64
 17  Platelets      407 non-null    float64
 18  Prothrombi

In [24]:
#Explore null values in columns
df.isna().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [25]:
#Create list of columns with numerical data types
num_columns = []
for col in df.columns:
  if df[col].dtype == 'int64' or df[col].dtype == 'float64':
    num_columns.append(col)

print(num_columns)

['ID', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']


In [26]:
#Imputer missing values in numerical columns with column mean
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit(df[num_columns])
df[num_columns] = imputer.transform(df[num_columns])
df.isna().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol        0
Albumin            0
Copper             0
Alk_Phos           0
SGOT               0
Tryglicerides      0
Platelets          0
Prothrombin        0
Stage              0
dtype: int64

In [27]:
#Create list of columns with object data types
obj_columns = []
for col in df.columns:
  if df[col].dtype == 'object':
    obj_columns.append(col)

print(obj_columns)

['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']


In [28]:
#Imputer missing values in non-numerical columns with column most frequent
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer.fit(df[obj_columns])
df[obj_columns] = imputer.transform(df[obj_columns])
df.isna().sum()

ID               0
N_Days           0
Status           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64

In [29]:
#Exploring unique values of categorical columns
for col in obj_columns:
  print(col, df[col].unique())

Status ['D' 'C' 'CL']
Drug ['D-penicillamine' 'Placebo']
Sex ['F' 'M']
Ascites ['Y' 'N']
Hepatomegaly ['Y' 'N']
Spiders ['Y' 'N']
Edema ['Y' 'N' 'S']


In [30]:
#Dropping certain columns that are irrelevant/will cause data leakage
df.drop(['Status', 'N_Days', 'ID'], axis=1, inplace=True)
df.columns

Index(['Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema',
       'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT',
       'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage'],
      dtype='object')

In [31]:
#Creating dummt varaibles for the categorical columns.
df_dummy = pd.get_dummies(df, drop_first=True)
df_dummy.head()
#Our data is now ready for modelling.

Unnamed: 0,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo,Sex_M,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Edema_S,Edema_Y
0,21464.0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,0,0,1,1,1,0,1
1,20617.0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,0,0,0,1,1,0,0
2,25594.0,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,0,1,0,0,0,1,0
3,19994.0,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,0,0,0,1,1,1,0
4,13918.0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,1,0,0,1,1,0,0


In [32]:
#Only histology stage 4 is defined as cirrhosis
df_dummy['Stage'] = np.where(df_dummy['Stage'] == 4.0, 1, 0)

In [33]:
#Creating feature and target variables
X = df_dummy.drop('Stage', axis=1)
y= df_dummy['Stage']

In [34]:
#Creating train and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [35]:
knn = KNeighborsClassifier()
gbc = GradientBoostingClassifier()
rf = RandomForestClassifier()

models = [('KNeighborsClassifier', knn), ('GradientBoostingClassifier', gbc), ('RandomForestClassifier', rf)]

In [72]:
for name, model in models:
  model.fit(X_train, y_train)
  y_pred = model.predict_proba(X_test)[:,1]
  auc_score = roc_auc_score(y_test, y_pred)
  print(name + ' - AUC score is: ' + str(auc_score))

#RandomForestClassifier has the best ROC score

KNeighborsClassifier - AUC score is: 0.4720552884615385
GradientBoostingClassifier - AUC score is: 0.7878605769230769
RandomForestClassifier - AUC score is: 0.8506610576923077


In [37]:
#RF hypertuning
#Checking adjustable parameters
rf = RandomForestClassifier()
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [64]:
#Parameter grid to hypertune selected model parameters.
param_grid = {
  'max_depth' : list(range(1,11)),
  'n_estimators' : [50,100,150,200],
  'min_samples_leaf' : list(range(1,6)),
  'min_samples_split' : list(range(2,6))
}

In [65]:
gridCV = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, scoring='roc_auc')
gridCV.fit(X_train, y_train)
y_pred = gridCV.predict_proba(X_test)

In [69]:
gridCV.best_params_

{'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 100}

In [70]:
gridCV.best_score_

0.7858965145526411

In [71]:
gridCV.best_estimator_

RandomForestClassifier(max_depth=3, min_samples_split=3)

RandomForrestClassifier was the classification model to use and with hypertuned parameters provided and AUC score of 79%.

Best Parameters:
- max_depth=3,
- min_samples_split=3