In [16]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import RocCurveDisplay
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("preprocesseddata.csv",sep=',')
data = data.drop(["Unnamed: 0"],axis=1)
data

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,32,1,38.5,52.500000,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0,32,1,38.5,70.300000,18.0,24.7,3.9,11.17,4.80,74.0,15.6,76.5
2,0,32,1,46.9,74.700000,36.2,52.6,6.1,8.84,5.20,86.0,33.2,79.3
3,0,32,1,43.2,52.000000,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0,32,1,39.2,74.100000,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,3,62,0,32.0,416.600000,5.9,110.3,50.0,5.57,6.30,55.7,650.9,68.5
597,3,64,0,24.0,102.800000,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3
598,3,64,0,29.0,87.300000,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0
599,3,46,0,33.0,93.220833,39.0,62.0,20.0,3.56,4.20,52.0,50.0,71.0


In [4]:
y = data['Category']
X = data.drop(['Category'],axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, stratify=y, random_state=42)

In [6]:
# normalization
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.transform(X_test)

array([[-2.46637843,  0.79028396,  0.98373106, ..., -0.0584446 ,
         1.00056649, -0.41674888],
       [ 0.5185273 , -1.26536796, -0.48396611, ...,  1.32826723,
         2.00051881,  2.10468503],
       [ 0.5185273 , -1.26536796,  1.80931072, ...,  0.09446279,
        -0.42163057,  2.00462813],
       ...,
       [ 0.41559952,  0.79028396,  0.85530756, ..., -0.02856615,
        -0.49840234,  0.34368357],
       [ 0.10681617, -1.26536796,  0.37830598, ..., -0.3273507 ,
        -0.4772901 ,  0.18359253],
       [-1.02538945,  0.79028396, -1.41962306, ...,  0.21749172,
        -0.15292937,  0.50377461]])

In [10]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test,y_test)

0.9602649006622517

In [12]:
y_pred = gnb.predict(X_test)
print(classification_report(y_test,y_pred))
print(balanced_accuracy_score(y_test,y_pred))
print("----------")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       132
           1       0.50      0.33      0.40         6
           2       0.67      0.80      0.73         5
           3       0.89      1.00      0.94         8

    accuracy                           0.96       151
   macro avg       0.76      0.78      0.77       151
weighted avg       0.96      0.96      0.96       151

0.781439393939394
----------
[[131   1   0   0]
 [  1   2   2   1]
 [  0   1   4   0]
 [  0   0   0   8]]


In [13]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test,y_test)

0.9536423841059603

In [14]:
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))
print(balanced_accuracy_score(y_test,y_pred))
print("----------")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       132
           1       1.00      0.33      0.50         6
           2       0.43      0.60      0.50         5
           3       0.88      0.88      0.88         8

    accuracy                           0.95       151
   macro avg       0.82      0.70      0.72       151
weighted avg       0.96      0.95      0.95       151

0.7020833333333333
----------
[[132   0   0   0]
 [  0   2   3   1]
 [  2   0   3   0]
 [  0   0   1   7]]


In [17]:
svc = svm.SVC()
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':range(1,10)}
svc = GridSearchCV(svc, parameters)
svc.fit(X_train, y_train)
svc.score(X_test,y_test)

0.9668874172185431

In [19]:
svc.best_params_

{'C': 4, 'kernel': 'poly'}

In [20]:
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))
print(balanced_accuracy_score(y_test,y_pred))
print("----------")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       132
           1       1.00      0.33      0.50         6
           2       0.43      0.60      0.50         5
           3       0.88      0.88      0.88         8

    accuracy                           0.95       151
   macro avg       0.82      0.70      0.72       151
weighted avg       0.96      0.95      0.95       151

0.7020833333333333
----------
[[132   0   0   0]
 [  0   2   3   1]
 [  2   0   3   0]
 [  0   0   1   7]]
