# Training an RBF kernel and classifying the binary Iris dataset
This notebook produces the figures used for visualization purposes in the paper.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader

from ohqk.data import LabelledDataset
from ohqk.kta_classical import KernelTargetAlignmentLoss, rbf_kernel
from ohqk.project_directories import GRAPHICS_DIR
from ohqk.train import train
from ohqk.utils import relabel_to_m1p1, running_average_filter

# Render figure text through LaTeX.
plt.rcParams["text.usetex"] = True
# Hide the tick marks on the left y axis but keep the tick labels visible.
plt.rcParams["ytick.left"] = False
plt.rcParams["ytick.labelleft"] = True

## Load and rescale the data
Note that here the kernel is trained on the full dataset, and the data is split only afterwards for classifier selection, training, and testing. For the paper results, the split happens before kernel training, which is the better ML practice because it keeps the test data from influencing the trained kernel parameter.

In [None]:
# Seed the NumPy and PyTorch random number generators for reproducibility.
np.random.seed(42)
torch.manual_seed(42)

# Restrict Iris to the samples with label 0 or 1 to obtain a binary problem,
# then relabel them with relabel_to_m1p1 (presumably to -1/+1 -- confirm in ohqk.utils).
X, y = load_iris(return_X_y=True)
binary_mask = y < 2
X, y = X[binary_mask], relabel_to_m1p1(y[binary_mask])

# Standardise the features before wrapping them in the project dataset type.
scaler = StandardScaler()
X = scaler.fit_transform(X)

ds = LabelledDataset(X, y)

In [None]:
# Hyperparameters for the kernel training run.
num_epochs, num_checkpoints = 100, 50
batch_size = 50
lr = 1e-1

# Initial gamma: a single value drawn uniformly from [0, 10), tracked by autograd
# so that the optimizer can update it.
gamma = (10 * torch.rand(1)).requires_grad_()

# Shuffled mini-batches, an Adam optimizer over gamma only, and the
# kernel-target-alignment loss built on the RBF kernel.
dl = DataLoader(ds, batch_size=batch_size, shuffle=True)
opt = torch.optim.Adam([gamma], lr=lr)
loss_function = KernelTargetAlignmentLoss(rbf_kernel)

## Kernel training

In [None]:
# Report the starting value of gamma before it is optimised.
print("initial gamma:", gamma.item())

# Run the training loop; the call is unpacked into the trained gamma and the
# losses that are plotted further below.
train_args = (gamma, loss_function, opt, num_epochs, dl)
trained_gamma, losses = train(*train_args, num_checkpoints=num_checkpoints)

print("trained gamma:", trained_gamma.item())

In [None]:
# Smooth the recorded losses, then flip their sign: the loss is the negative
# KTA, so the plotted curve shows the KTA itself.
smooth_losses = running_average_filter(losses, factor=0.6)
kta_curve = [-value for value in smooth_losses]

plt.plot(kta_curve)
plt.xlabel("epoch")
plt.ylabel("KTA")
plt.savefig(GRAPHICS_DIR / "rbf_kta_opt_iris.pdf")

## Model selection

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# RBF support vector classifier whose gamma is fixed to the trained value;
# only the regularisation parameter C is searched.
svc = SVC(kernel="rbf", gamma=trained_gamma.item())
param_grid = {"C": [0.1, 1, 10, 100]}

# 5-fold cross-validated grid search on the training split, scored by accuracy
# and run on all available cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

print("best C", grid_search.best_params_)

## Model testing

In [None]:
from ohqk.model_testing import produce_clf_learning_curve

In [None]:
# Fractions 0.1, 0.2, ..., 1.0 handed to produce_clf_learning_curve together
# with the best estimator found by the grid search. Only the second returned
# value (the test scores) is kept.
train_fractions = np.linspace(0.1, 1, num=10)
best_clf = grid_search.best_estimator_
_, test_scores = produce_clf_learning_curve(
    best_clf,
    X_train,
    X_test,
    y_train,
    y_test,
    train_fractions=train_fractions,
)

In [None]:
# One bar per training fraction; the bars sit at integer positions 0..n-1.
sns.barplot(x=train_fractions, y=test_scores)

# Label every second bar. The tick positions are derived from the number of
# fractions instead of being hard-coded, so they stay aligned with the labels
# if `train_fractions` changes length.
tick_positions = np.arange(0, len(train_fractions), 2)
# With text.usetex enabled a bare "%" starts a LaTeX comment and breaks the
# rendering of the label, so it has to be escaped as "\%".
percent_sign = r"\%" if plt.rcParams["text.usetex"] else "%"
tick_labels = [f"{t:.0%}".replace("%", percent_sign) for t in train_fractions[::2]]
plt.xticks(ticks=tick_positions, labels=tick_labels)

plt.xlabel("train fraction")
plt.ylabel("test score")
plt.savefig(GRAPHICS_DIR / "rbf_kta_opt_iris_learning_curve.pdf")