# Imports

In [22]:
from scipy.io import loadmat
import numpy as np
import kernels
import gpflow
from sklearn.model_selection import train_test_split
import json

# Load in data

In [None]:
# Everything stored in the FeGaPd .mat file, as returned by scipy's loadmat.
full_data = loadmat("FeGaPd_full_data_220104a.mat")

# 'True', human-assigned labels: entry [0][1] of "labels_col", flattened to
# 1-D and shifted down by one (presumably 1-based -> 0-based; confirm against
# the .mat file).
_labels_entry = full_data["labels_col"][0][1]
full_true_labels = _labels_entry.flatten() - 1

# Labels used by the rest of the notebook: one 0/1 entry per data point.
# The hard-coded array below holds the engineered labels; to use the
# human-assigned labels instead, replace it with:
# true_labels = full_true_labels[kernels.DATA_INDICES]
_engineered_labels = [
    1, 1, 1, 0, 0,
    0, 0, 1, 0, 0,
    1, 1, 1, 0, 1,
    0, 1, 1, 1, 0,
]
true_labels = np.array(_engineered_labels)

# Data Generation

In [24]:
# Number of train/test splits evaluated for each training-set size.
NUM_SAMPLES = 20

# Output location for the per-training-size JSON result files. The trailing
# slash matters: file names are appended to this string directly.
FOLDER = "./FMI_results/gp_comparison/"

# Kernel suffix appended to every output JSON file name.
FILE_SUFFIX = "quantum"

In [None]:
# Kernel shared by every GP model fitted below. The matrix comes from the
# measured-quantum-kernel getter; swap in a different getter from kernels.py
# to compare another kernel.
_kernel_matrix = kernels.get_measured_quantum_kernel_matrix()
kernel = kernels.FixedPrecomputedGPKernel(_kernel_matrix)

# invlink = gpflow.likelihoods.RobustMax(2)
# likelihood = gpflow.likelihoods.MultiClass(num_classes=2, invlink=invlink)


def get_GP_labels(train_indices, train_labels, x_test):
    """Fit a variational GP classifier and return hard labels for ``x_test``.

    A new ``gpflow.models.VGP`` with a Bernoulli likelihood is built on the
    module-level ``kernel`` and optimised with gpflow's Scipy optimiser on
    every call.

    Args:
        train_indices: Training inputs (the X half of the model data). The
            caller in this file passes data-point indices as an (N, 1)
            float array.
        train_labels: Training targets (the Y half of the model data). The
            caller in this file passes 0/1 labels as an (N, 1) float array.
        x_test: Inputs to predict at, in the same format as
            ``train_indices``.

    Returns:
        A 1-D integer NumPy array: the predictive mean from ``predict_y``,
        flattened and rounded to the nearest integer.
    """
    # NOTE: a multi-class variant is kept for reference only. It would pass
    #   likelihood=likelihood, num_latent_gps=2
    # to VGP (with the MultiClass/RobustMax likelihood) and return
    #   np.argmax(y_mean.numpy(), axis=1)
    classifier = gpflow.models.VGP(
        data=(train_indices, train_labels),
        kernel=kernel,
        likelihood=gpflow.likelihoods.Bernoulli(),
    )

    # Fit the GP. compile=False skips wrapping the loss in a tf.function.
    # Optional extras left disabled: method="TNC", options={"maxiter": 1000}.
    optimizer = gpflow.optimizers.Scipy()
    optimizer.minimize(
        closure=classifier.training_loss,
        variables=classifier.trainable_variables,  # type: ignore
        compile=False,
    )

    # predict_y returns (mean, variance); only the mean is needed here.
    predictive_mean, _variance = classifier.predict_y(x_test)
    flat_mean = predictive_mean.numpy().flatten()  # type: ignore
    return np.round(flat_mean).astype(int)

In [26]:
import os

# Dataset size is taken from the labels rather than hard-coded, so the loop
# bounds, the index pool and the leave-one-out check always agree.
n_points = len(true_labels)
all_indices = np.arange(n_points)

# Create the output folder up front so a missing directory does not surface
# only after a full batch of GP fits.
os.makedirs(FOLDER, exist_ok=True)

for num_training_points in range(5, n_points):
    all_test_points = []
    all_predicted_labels = []

    # With all but one point used for training there are only n_points
    # distinct splits, so enumerate them instead of sampling at random.
    leave_one_out = num_training_points == n_points - 1
    num_splits = n_points if leave_one_out else NUM_SAMPLES

    for sample_number in range(num_splits):
        if leave_one_out:
            # hold out exactly one point; train on every other point
            x_test = np.array([sample_number])
            x_train = np.delete(all_indices, sample_number)

            y_train = true_labels[x_train]
            y_test = true_labels[x_test]
        else:
            # otherwise randomly choose a test / train split
            # (stratify=true_labels is available but left disabled)
            x_train, x_test, y_train, y_test = train_test_split(
                all_indices,
                true_labels,
                train_size=num_training_points,
            )

        # get GP performance; the model expects (N, 1) float arrays
        gp_labels = get_GP_labels(
            *[arg.reshape((-1, 1)).astype(float) for arg in [x_train, y_train, x_test]],
        )

        # json cannot serialise NumPy integers; tolist() yields plain ints
        all_test_points.append(x_test.tolist())
        all_predicted_labels.append(gp_labels.tolist())

    # one result file per training-set size
    with open(f"{FOLDER}{num_training_points}_points_{FILE_SUFFIX}.json", "w") as f:
        json.dump(
            {"test_points": all_test_points, "predicted_labels": all_predicted_labels},
            f,
        )