In [None]:
from pprint import pprint

import numpy as np  # noqa
import pandas as pd
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split

In [None]:
# Read the hepatitis CSV; cells containing "?" are parsed as missing values.
df = pd.read_csv(
    "../data/hepatitis.csv",
    na_values="?",
)

In [None]:
# Rebind instead of mutating in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once "ID" has been removed.
df = df.drop(columns=["ID"], errors="ignore")

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum(axis=0)

# Data cleaning

In [None]:
# Convert every column to a numeric dtype (values that cannot be parsed become
# NaN), then keep only the rows that have no missing value at all.
df = df.apply(pd.to_numeric, errors="coerce").dropna()

In [None]:
# Re-count missing values per column now that incomplete rows were dropped.
df.isnull().sum()

# Split features and target

In [None]:
# The "target" column is the label; every remaining column is a feature.
y = df["target"]
X = df.drop(columns=["target"])

# Training

In [None]:
from typing import Literal
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, mean_absolute_error


def classifiers_metrics(
    classifier_type: Literal["svc", "gridsearch"],
    kernel_types,
    X_train,
    y_train,
    X_test,
    y_test,
    random_state=0
):
    """Fit one SVM per kernel and collect its test-set metrics.

    Parameters
    ----------
    classifier_type : {"svc", "gridsearch"}
        ``"svc"`` fits a plain ``SVC`` for each kernel. ``"gridsearch"`` wraps
        the SVC in a 5-fold ``GridSearchCV`` over ``C`` and ``gamma`` that is
        restricted to the kernel of the current iteration.
    kernel_types : iterable of str
        Kernel names handed to ``SVC(kernel=...)``.
    X_train, y_train
        Data the models are fitted and cross-validated on.
    X_test, y_test
        Held-out data the reported metrics are computed on.
    random_state : int, default 0
        Forwarded to every ``SVC`` that is created.

    Returns
    -------
    tuple
        ``(classifier, performance_metrics)``. ``performance_metrics`` maps
        each kernel to a dict of metrics plus the fitted ``"model"``. For
        ``"svc"`` the returned classifier is the last one fitted; for
        ``"gridsearch"`` it is the per-kernel search with the highest
        ``best_score_``, so its ``best_*`` attributes describe the best
        configuration over all kernels. It is ``None`` when ``kernel_types``
        is empty.

    Raises
    ------
    ValueError
        If ``classifier_type`` is not one of the supported values.
    """
    if classifier_type not in ("svc", "gridsearch"):
        raise ValueError(
            f"classifier_type must be 'svc' or 'gridsearch', got {classifier_type!r}"
        )

    use_grid_search = classifier_type == "gridsearch"
    performance_metrics = {}
    selected_classifier = None

    for kernel in kernel_types:
        if use_grid_search:
            # Search only the current kernel. Passing the full kernel list
            # here would make every iteration run the same search and label
            # identical results with different kernel names.
            param_grid = {
                "kernel": [kernel],
                "C": [0.01, 0.1, 1],
                "gamma": [0.01, 0.1, 1],
            }
            classifier = GridSearchCV(
                SVC(kernel=kernel, random_state=random_state), param_grid, cv=5
            )
        else:
            classifier = SVC(kernel=kernel, random_state=random_state)

        classifier.fit(X_train, y_train)
        y_test_pred = classifier.predict(X_test)

        if use_grid_search:
            # The search already cross-validated on the training data;
            # best_score_ is the mean CV score of the chosen parameters, so
            # the whole search does not need to be re-run inside another CV.
            mean_cv_score = classifier.best_score_
        else:
            # Cross-validate on the training split passed in, not on notebook
            # globals, so no test row influences the score.
            mean_cv_score = np.mean(cross_val_score(classifier, X_train, y_train))

        # Key order matters: downstream cells index the logged entries by
        # position, so new keys must only be appended at the end.
        kernel_metrics = {
            "Data Type": "Test",
            "Total Precision (Accuracy)": classifier.score(X_test, y_test),
            "Precision": precision_score(y_test, y_test_pred, average="weighted"),
            "Recall": recall_score(y_test, y_test_pred, average="weighted"),
            "F1-measure": f1_score(y_test, y_test_pred, average="weighted"),
            "Mean Cross-Validation Score": mean_cv_score,
            "MAE": mean_absolute_error(y_test, y_test_pred),
            "model": classifier,
        }
        if not use_grid_search:
            kernel_metrics["num_support_vectors"] = len(classifier.support_)
        performance_metrics[kernel] = kernel_metrics

        if (
            not use_grid_search
            or selected_classifier is None
            or classifier.best_score_ > selected_classifier.best_score_
        ):
            selected_classifier = classifier

    return selected_classifier, performance_metrics

# Train and evaluate SVC and GridSearchCV classifiers

In [290]:
from collections import defaultdict

# logs maps (test_size, classifier type, kernel) to a list of single-entry
# {metric: value} dicts. gridsearch_logs is overwritten on every split, so it
# ends up holding the grid-search summary of the last split only.
logs = defaultdict(list)
gridsearch_logs = defaultdict(list)
classifiers = []
for ts in (1, 2, 3):
    test_size = ts / 10
    print("Test Size:", test_size)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=test_size
    )
    kernel_types = ["linear", "rbf", "poly", "sigmoid"]
    for clf in ("svc", "gridsearch"):
        print("Classifier Type:", clf)
        classifier, metrics = classifiers_metrics(
            clf, kernel_types, X_train, y_train, X_test, y_test
        )
        for kernel, metrics_dict in metrics.items():
            print("Kernel Type:", kernel)
            # One single-key dict per metric, in the order the metrics arrive.
            logs[test_size, clf, kernel].extend(
                [{metric: value} for metric, value in metrics_dict.items()]
            )
            print()
        if clf != "gridsearch":
            continue
        gridsearch_logs["best_estimator_"] = classifier.best_estimator_
        gridsearch_logs["best_params_"] = classifier.best_params_
        gridsearch_logs["best_score_"] = classifier.best_score_

KeyboardInterrupt: 

In [None]:
# Dump the collected metrics, then the stored grid-search summary.
for heading, payload in (
    ("Logs:", logs),
    ("Gridsearch Best Params:", gridsearch_logs),
):
    print(heading)
    pprint(payload)

In [None]:
# Each logs[key] is a list of single-entry {metric: value} dicts. Merge them
# into one dict per key so metrics are looked up by name rather than by list
# position, which would silently break if the metric order ever changed.
merged_logs = {
    key: {name: value for entry in entries for name, value in entry.items()}
    for key, entries in logs.items()
}

# Rank every (test_size, classifier type, kernel) run by test accuracy and
# keep the four best.
sorted_keys = sorted(
    logs.keys(),
    key=lambda k: merged_logs[k]['Total Precision (Accuracy)'],
    reverse=True,
)
top_keys = sorted_keys[:4]
top_classifiers = [(key, logs[key]) for key in top_keys]

# Fitted model object for each of the top runs.
top_models_obj = {}
for key, values in top_classifiers:
    if 'model' in merged_logs[key]:
        top_models_obj[key] = merged_logs[key]['model']
    print("Classifier:", key)
    print("Metrics:")
    pprint(values)
    print()