In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/indian_liver_patient_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [4]:
x = df.drop('Dataset', axis=1)
y = df['Dataset']

In [82]:
# splitting & scalling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### model selection

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

#### logistic regression

In [106]:
# using unscaled data
model_lr = LogisticRegression(C=0.1, penalty='l1')

model_lr.fit(X_train, y_train)
print('Logistic Regression Classifier on unscaled test data:')
print('Accuracy:', model_lr.score(X_test, y_test))
print('Precision:', precision_score(y_test, model_lr.predict(X_test)))
print('Recall:', recall_score(y_test, model_lr.predict(X_test)))
print('F-1 score:', f1_score(y_test, model_lr.predict(X_test)))

Logistic Regression Classifier on unscaled test data:
Accuracy: 0.7280701754385965
Precision: 0.75
Recall: 0.9529411764705882
F-1 score: 0.8393782383419688


In [107]:
kfold = KFold(n_splits=5,random_state=42)
logmodel = LogisticRegression(C=0.1, penalty='l1')
results = cross_val_score(logmodel, X_train,y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.69565217 0.7032967  0.71428571 0.67032967 0.73626374]
Accuracy:  70.39655996177736


In [108]:
# using scaled data
model_lr_scaled = LogisticRegression(C=0.1, penalty='l1')

model_lr_scaled.fit(X_train_scaled, y_train)
print('Logistic Regression Classifier on scaled test data:')
print('Accuracy:', model_lr_scaled.score(X_test_scaled, y_test))
print('Precision:', precision_score(y_test, model_lr_scaled.predict(X_test_scaled)))
print('Recall:', recall_score(y_test, model_lr_scaled.predict(X_test_scaled)))
print('F-1 score:', f1_score(y_test, model_lr_scaled.predict(X_test_scaled)))

Logistic Regression Classifier on scaled test data:
Accuracy: 0.7456140350877193
Precision: 0.7456140350877193
Recall: 1.0
F-1 score: 0.8542713567839196


In [99]:
kfold = KFold(n_splits=5,random_state=42)
logmodel = LogisticRegression(C=0.1, penalty='l1')
results = cross_val_score(logmodel, X_train_scaled, y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.75       0.68131868 0.67032967 0.71428571 0.7032967 ]
Accuracy:  70.38461538461539


#### random forest classifier

In [85]:
# using unscaled data
rfc = RandomForestClassifier(n_estimators = 20)

rfc.fit(X_train, y_train)
print('Random Forest Classifier on unscaled test data:')
print('Accuracy:', rfc.score(X_test, y_test))
print('Precision:', precision_score(y_test, rfc.predict(X_test)))
print('Recall:', recall_score(y_test, rfc.predict(X_test)))
print('F-1 score:', f1_score(y_test, rfc.predict(X_test)))

Random Forest Classifier on unscaled test data:
Accuracy: 0.7105263157894737
Precision: 0.8095238095238095
Recall: 0.8
F-1 score: 0.8047337278106509


In [100]:
kfold = KFold(n_splits=5,random_state=42)
rfc_ = RandomForestClassifier(n_estimators = 20)
results = cross_val_score(rfc_, X_train,y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.70652174 0.69230769 0.74725275 0.7032967  0.72527473]
Accuracy:  71.49307214524606


In [86]:
# using scaled data
rfc_scaled = RandomForestClassifier(n_estimators = 20)

rfc_scaled.fit(X_train_scaled, y_train)
print('Random Forest Classifier on scaled test data:')
print('Accuracy:', rfc_scaled.score(X_test_scaled, y_test))
print('Precision:', precision_score(y_test, rfc_scaled.predict(X_test_scaled)))
print('Recall:', recall_score(y_test, rfc_scaled.predict(X_test_scaled)))
print('F-1 score:', f1_score(y_test, rfc_scaled.predict(X_test_scaled)))

Random Forest Classifier on scaled test data:
Accuracy: 0.6842105263157895
Precision: 0.8024691358024691
Recall: 0.7647058823529411
F-1 score: 0.7831325301204819


In [101]:
kfold = KFold(n_splits=5,random_state=42)
rfc_ = RandomForestClassifier(n_estimators = 20)
results = cross_val_score(rfc_, X_train_scaled, y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.7173913  0.73626374 0.72527473 0.71428571 0.68131868]
Accuracy:  71.49068322981368


#### KNN

In [87]:
# using unscaled data
knn = KNeighborsClassifier(n_neighbors = 5)

knn.fit(X_train, y_train)
print('k-NN Classifier on unscaled test data:')
print('Accuracy:', knn.score(X_test, y_test))
print('Precision:', precision_score(y_test, knn.predict(X_test)))
print('Recall:', recall_score(y_test, knn.predict(X_test)))
print('F-1 score:', f1_score(y_test, knn.predict(X_test)))

k-NN Classifier on unscaled test data:
Accuracy: 0.6754385964912281
Precision: 0.7926829268292683
Recall: 0.7647058823529411
F-1 score: 0.7784431137724551


In [104]:
kfold = KFold(n_splits=5,random_state=42)
knn_ = KNeighborsClassifier(n_neighbors = 5)
results = cross_val_score(knn_, X_train,y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.70652174 0.71428571 0.68131868 0.68131868 0.71428571]
Accuracy:  69.95461060678453


In [88]:
# using scaled data
knn_scaled = KNeighborsClassifier(n_neighbors = 5)

knn_scaled.fit(X_train_scaled, y_train)
print('k-NN Classifier on scaled test data:')
print('Accuracy:', knn_scaled.score(X_test_scaled, y_test))
print('Precision:', precision_score(y_test, knn_scaled.predict(X_test_scaled)))
print('Recall:', recall_score(y_test, knn_scaled.predict(X_test_scaled)))
print('F-1 score:', f1_score(y_test, knn_scaled.predict(X_test_scaled)))

k-NN Classifier on scaled test data:
Accuracy: 0.6666666666666666
Precision: 0.7701149425287356
Recall: 0.788235294117647
F-1 score: 0.7790697674418604


In [103]:
kfold = KFold(n_splits=5,random_state=42)
knn_ = KNeighborsClassifier(n_neighbors = 5)
results = cross_val_score(knn_, X_train_scaled, y_train,cv=kfold)
print(results)
print('Accuracy: ',results.mean()*100)

[0.66304348 0.64835165 0.64835165 0.67032967 0.65934066]
Accuracy:  65.78834209268992


#### comparing model

In [109]:
a = [['model_lr',model_lr.score(X_test, y_test),precision_score(y_test, model_lr.predict(X_test)), recall_score(y_test, model_lr.predict(X_test)),f1_score(y_test, model_lr.predict(X_test))], ['model_lr_scaled',model_lr_scaled.score(X_test_scaled, y_test),precision_score(y_test, model_lr_scaled.predict(X_test_scaled)),recall_score(y_test, model_lr_scaled.predict(X_test_scaled)),f1_score(y_test, model_lr_scaled.predict(X_test_scaled))], ['rfc', rfc.score(X_test, y_test),precision_score(y_test, rfc.predict(X_test)),recall_score(y_test, rfc.predict(X_test)),f1_score(y_test, rfc.predict(X_test))], ['rfc_scaled',rfc_scaled.score(X_test_scaled, y_test), precision_score(y_test, rfc_scaled.predict(X_test_scaled)),recall_score(y_test, rfc_scaled.predict(X_test_scaled)), f1_score(y_test, rfc_scaled.predict(X_test_scaled)) ], ['knn', knn.score(X_test, y_test), precision_score(y_test, knn.predict(X_test)),recall_score(y_test, knn.predict(X_test)), f1_score(y_test, knn.predict(X_test))], ['knn_scaled',knn_scaled.score(X_test_scaled, y_test), precision_score(y_test, knn_scaled.predict(X_test_scaled)),recall_score(y_test, knn_scaled.predict(X_test_scaled)), f1_score(y_test, knn_scaled.predict(X_test_scaled))]]

dfcompare = pd.DataFrame(a, columns=['model', 'Accuracy','Precision','Recall','F-1 score'])
dfcompare


Unnamed: 0,model,Accuracy,Precision,Recall,F-1 score
0,model_lr,0.72807,0.75,0.952941,0.839378
1,model_lr_scaled,0.745614,0.745614,1.0,0.854271
2,rfc,0.710526,0.809524,0.8,0.804734
3,rfc_scaled,0.684211,0.802469,0.764706,0.783133
4,knn,0.675439,0.792683,0.764706,0.778443
5,knn_scaled,0.666667,0.770115,0.788235,0.77907


Dari pengujian tersebut saya memutuskan untuk memilih menggunakan Logistic Regression dengan scaled data. Berikutnya adalah untuk mencari best param dari model ini menggunakan GridSearch

In [152]:
logistic = LogisticRegression()
hyperparameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                   'max_iter':[100,500,1000]}

clf = GridSearchCV(logistic, hyperparameters, cv=5)
    
best_clf = clf.fit(X_train_scaled, y_train)

In [154]:
print('Best Penalty:', best_clf.best_estimator_.get_params()['penalty'])
print('Best C:', best_clf.best_estimator_.get_params()['C'])
print('Best Max_iter:', best_clf.best_estimator_.get_params()['max_iter'])

Best Penalty: l2
Best C: 100
Best Max_iter: 100


In [155]:
# using scaled data
model_lr_scaled = LogisticRegression(C=100, penalty='l2', max_iter=100)

model_lr_scaled.fit(X_train_scaled, y_train)
print('Logistic Regression Classifier on scaled test data:')
print('Accuracy:', model_lr_scaled.score(X_test_scaled, y_test))
print('Precision:', precision_score(y_test, model_lr_scaled.predict(X_test_scaled)))
print('Recall:', recall_score(y_test, model_lr_scaled.predict(X_test_scaled)))
print('F-1 score:', f1_score(y_test, model_lr_scaled.predict(X_test_scaled)))

Logistic Regression Classifier on scaled test data:
Accuracy: 0.7543859649122807
Precision: 0.7878787878787878
Recall: 0.9176470588235294
F-1 score: 0.8478260869565216


In [156]:
best_clf.predict(x)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [157]:
import pickle

pickle.dump(model_lr_scaled, open('model_ilp', 'wb'))

In [158]:
loadModel = pickle.load(open('model_ilp', 'rb'))

In [159]:
loadModel

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)