In [1]:
import numpy as np
import scipy
from scipy.linalg import cho_solve
from itertools import product
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler

arff_file = arff.loadarff('MagicTelescope.arff')
data = pd.DataFrame(arff_file[0])


In [2]:
data['labels'] = (data['class:'] == b'g').astype(int)
labels = data['labels'].to_numpy()
features = data.iloc[:,1:-2].to_numpy()

scaler = MinMaxScaler()
scaler.fit(features)
features = scaler.transform(features)

In [4]:
def KRR_global(X_train,Y_train,X_test,best_params):
    '''
    Returns the Kernel Ridge Regression predictions using a Gaussian kernel.
    '''
    lam = best_params['lambda']
    sigma = best_params['length']
       
    dist = scipy.spatial.distance_matrix(X_train, X_train)
    exponent = dist/sigma

    K = np.exp(-(exponent**2)/2)

    K += np.eye(K.shape[0])*lam

    try:
        L = scipy.linalg.cholesky(K, lower=True)
    except:
        return 'Gram Matrix is not positive definite'
    else:
        try:
            alphas = cho_solve((L,True),Y_train)
        except:
            return 'Cholesky decomposition failed, check distance matrices'
        else:
            dist2 = scipy.spatial.distance_matrix(X_train, X_test)
            exponent2 = dist2/sigma

            K_test = np.exp(-(exponent2**2)/2)

            scores = np.dot(K_test.T,alphas)
            # Apply sigmoid to get probabilities (binary classification)
            probabilities = 1 / (1 + np.exp(-scores))

            # Classify based on threshold (e.g., 0.5)
            predictions = (probabilities > 0.5).astype(int)
            return predictions

def vector_std(X):
    X_mean = np.mean(X, axis=0)
    X_var = np.mean((X - X_mean) ** 2, axis=0)
    return np.sqrt(np.sum(X_var))

def GridSearchCV(X, Y, params, cv=4):
    kf = KFold(n_splits=cv)
    X_train, X_test = [], []
    Y_train, Y_test = [], []
    for train, test in kf.split(X):
        X_train.append(X[train])
        Y_train.append(Y[train])
        X_test.append(X[test])
        Y_test.append(Y[test])

    best_score = -np.inf  # Maximizing accuracy
    for lam, sigma in product(params['lambda'], params['length']):
        print(lam, sigma)
        accuracies = []
        for k in range(cv):
            preds = KRR_global(X_train[k], Y_train[k], X_test[k], {'lambda': lam, 'length': sigma})
            if type(preds) == str:
                accuracy = 0
            else:
                accuracy = np.mean(preds == Y_test[k])
                print(f"accuracy: {accuracy}")
            accuracies.append(accuracy)
        mean_accuracy = np.mean(accuracies)
        if mean_accuracy > best_score:
            best_score = mean_accuracy
            best_lambda = lam
            best_sigma = sigma
    
    return {'accuracy': best_score, 'lambda': best_lambda, 'length': best_sigma}

X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2)

initial_sigma_guess = vector_std(X_train)
param_grid = {'length': [initial_sigma_guess*10**i for i in range(-2,3)],
              'lambda': [10**i for i in [-14, -10, -6]]}

best_params = GridSearchCV(X_train, Y_train, param_grid)

1e-14 0.004835921817014586
accuracy: 0.39931650893796006
accuracy: 0.4195583596214511
accuracy: 0.4116719242902208
accuracy: 0.39668769716088326
1e-14 0.04835921817014586
accuracy: 0.7342271293375394
accuracy: 0.7234490010515248
accuracy: 0.7284437434279706
accuracy: 0.7386961093585699
1e-14 0.48359218170145857


  probabilities = 1 / (1 + np.exp(-scores))


accuracy: 0.6611461619348055
accuracy: 0.6619348054679285
accuracy: 0.655099894847529
accuracy: 0.6529968454258676
1e-14 4.835921817014586
1e-14 48.35921817014586
1e-10 0.004835921817014586
accuracy: 0.39931650893796006
accuracy: 0.4195583596214511
accuracy: 0.4116719242902208
accuracy: 0.39668769716088326
1e-10 0.04835921817014586
accuracy: 0.7334384858044164
accuracy: 0.7226603575184016
accuracy: 0.7268664563617245
accuracy: 0.7381703470031545
1e-10 0.48359218170145857
accuracy: 0.7384332281808622
accuracy: 0.729758149316509
accuracy: 0.723186119873817
accuracy: 0.7292323869610936
1e-10 4.835921817014586
accuracy: 0.7365930599369085
accuracy: 0.7179284963196635
accuracy: 0.7281808622502629
accuracy: 0.7260778128286015
1e-10 48.35921817014586
accuracy: 0.7145110410094637
accuracy: 0.7047844374342797
accuracy: 0.7042586750788643
accuracy: 0.711093585699264
1e-06 0.004835921817014586
accuracy: 0.39931650893796006
accuracy: 0.4195583596214511
accuracy: 0.4116719242902208
accuracy: 0.3966

In [5]:
best_params

{'accuracy': 0.7495399579390116,
 'lambda': 1e-06,
 'length': 0.48359218170145857}

In [44]:
preds = KRR_global(X_train, Y_train, X_test, best_params)
accuracy = np.mean(preds == Y_test)
accuracy

np.float64(0.6819137749737119)