In [301]:
from pprint import pprint

import numpy as np  # noqa
import pandas as pd
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split

In [302]:
df = pd.read_csv("../data/hepatitis.csv", na_values="?")

In [303]:
df.drop(["ID"], axis=1, inplace=True)

In [304]:
df.head()

Unnamed: 0,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [305]:
df.isna().sum()

target         0
age            0
gender         0
steroid        1
antivirals     0
fatigue        1
malaise        1
anorexia       1
liverBig      10
liverFirm     11
spleen         5
spiders        5
ascites        5
varices        5
bili           6
alk           29
sgot           4
albu          16
protime       67
histology      0
dtype: int64

# Data cleaning

In [306]:
df = df.apply(pd.to_numeric, errors="coerce")
df = df.dropna()

In [307]:
df.isna().sum()

target        0
age           0
gender        0
steroid       0
antivirals    0
fatigue       0
malaise       0
anorexia      0
liverBig      0
liverFirm     0
spleen        0
spiders       0
ascites       0
varices       0
bili          0
alk           0
sgot          0
albu          0
protime       0
histology     0
dtype: int64

# Data separation

In [308]:
X = df.drop(["target"], axis=1)
y = df["target"]

# Training

In [309]:
from typing import Literal
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, mean_absolute_error


def classifiers_metrics(
    classifier_type: Literal["svc", "gridsearch"],
    kernel_types,
    X_train,
    y_train,
    X_test,
    y_test,
    random_state=0
):
    performance_metrics = {}
    classifier = None
    param_grid = None
    for kernel in kernel_types:
        if classifier_type == "svc":
            classifier = SVC(kernel=kernel, random_state=random_state)
        elif classifier_type == "gridsearch":
            param_grid = {
                "kernel": kernel_types,
                "C": [0.01, 0.1, 1],
                "gamma": [0.01, 0.1, 1],
            }
            classifier = GridSearchCV(SVC(kernel=kernel), param_grid, cv=5)

        classifier.fit(X_train, y_train)

        def metrics(type, actual, pred):
            precision = precision_score(actual, pred, average="weighted", zero_division=0)
            recall = recall_score(actual, pred, average="weighted", zero_division=0)
            f1 = f1_score(actual, pred, average="weighted", zero_division=0)
            total_precision = classifier.score(X_test, y_test)
            cv_scores = cross_val_score(classifier, X, y)
            mean_cv_score = np.mean(cv_scores)
            mae = mean_absolute_error(actual, pred)

            performance_metrics[kernel] = {
                "Data Type": type,
                "Total Precision (Accuracy)": total_precision,
                "Precision": precision,
                "Recall": recall,
                "F1-measure": f1,
                "Mean Cross-Validation Score": mean_cv_score,
                "MAE": mae,
                "model": classifier,
            }
            performance_metrics[kernel].update(
                {
                    "num_support_vectors": len(classifier.support_),
                }
            ) if classifier_type == "svc" else None


        y_test_pred = classifier.predict(X_test)
        metrics("Test", y_test, y_test_pred)
    return classifier, performance_metrics

# Train and evaluate SVC and GridSearchCV classifiers

In [310]:
from collections import defaultdict

logs = defaultdict(list)
gridsearch_logs = defaultdict(list)
classifiers = []
for ts in range(1, 4):
    test_size = ts / 10
    print("\nTest Size:", test_size)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    kernel_types = ["linear", "rbf", "poly", "sigmoid"]
    for clf in ["svc", "gridsearch"]:
        print("\n\tClassifier Type:", clf)
        classifier, metrics = classifiers_metrics(
            clf, kernel_types, X_train, y_train, X_test, y_test
        )
        for kernel, metrics_dict in metrics.items():
            print("\t\t\tKernel Type:", kernel)
            for metric, value in metrics_dict.items():
                # print(metric + ":", value)
                logs[test_size, clf, kernel].append({metric: value})
        if clf == "gridsearch":
            gridsearch_logs.update(
                {
                    "best_estimator_": classifier.best_estimator_,
                    "best_params_": classifier.best_params_,
                    "best_score_": classifier.best_score_,
                }
            )


Test Size: 0.1

	Classifier Type: svc
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid

	Classifier Type: gridsearch
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid

Test Size: 0.2

	Classifier Type: svc
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid

	Classifier Type: gridsearch
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid

Test Size: 0.3

	Classifier Type: svc
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid

	Classifier Type: gridsearch
			Kernel Type: linear
			Kernel Type: rbf
			Kernel Type: poly
			Kernel Type: sigmoid


In [311]:
print("Logs:")
pprint(logs)
print('Gridsearch Best Params:')
pprint(gridsearch_logs)

Logs:
defaultdict(<class 'list'>,
            {(0.1, 'gridsearch', 'linear'): [{'Data Type': 'Test'},
                                             {'Total Precision (Accuracy)': 0.875},
                                             {'Precision': 0.765625},
                                             {'Recall': 0.875},
                                             {'F1-measure': 0.8166666666666667},
                                             {'Mean Cross-Validation Score': 0.825},
                                             {'MAE': 0.125},
                                             {'model': GridSearchCV(cv=5, estimator=SVC(kernel='linear'),
             param_grid={'C': [0.01, 0.1, 1], 'gamma': [0.01, 0.1, 1],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']})}],
             (0.1, 'gridsearch', 'poly'): [{'Data Type': 'Test'},
                                           {'Total Precision (Accuracy)': 0.875},
                                           {'Precis

In [312]:
sorted_keys = sorted(logs.keys(), key=lambda k: logs[k][1]['Total Precision (Accuracy)'], reverse=True)
top_keys = sorted_keys[:4]
top_classifiers = list(zip(top_keys, [logs[key] for key in top_keys]))
top_models_obj = {}
for key, values in top_classifiers:
    top_models_obj.update({key: v for x in values for k, v in x.items() if k == 'model'})
    print("Classifier:", key)
    print("Metrics:")
    pprint(values)
    print()

Classifier: (0.1, 'svc', 'linear')
Metrics:
[{'Data Type': 'Test'},
 {'Total Precision (Accuracy)': 0.875},
 {'Precision': 0.765625},
 {'Recall': 0.875},
 {'F1-measure': 0.8166666666666667},
 {'Mean Cross-Validation Score': 0.85},
 {'MAE': 0.125},
 {'model': SVC(kernel='linear', random_state=0)},
 {'num_support_vectors': 20}]

Classifier: (0.1, 'svc', 'rbf')
Metrics:
[{'Data Type': 'Test'},
 {'Total Precision (Accuracy)': 0.875},
 {'Precision': 0.765625},
 {'Recall': 0.875},
 {'F1-measure': 0.8166666666666667},
 {'Mean Cross-Validation Score': 0.8375},
 {'MAE': 0.125},
 {'model': SVC(random_state=0)},
 {'num_support_vectors': 30}]

Classifier: (0.1, 'svc', 'poly')
Metrics:
[{'Data Type': 'Test'},
 {'Total Precision (Accuracy)': 0.875},
 {'Precision': 0.765625},
 {'Recall': 0.875},
 {'F1-measure': 0.8166666666666667},
 {'Mean Cross-Validation Score': 0.825},
 {'MAE': 0.125},
 {'model': SVC(kernel='poly', random_state=0)},
 {'num_support_vectors': 29}]

Classifier: (0.1, 'gridsearch', 'l