In [2]:
import numpy as np
import scipy
from scipy.linalg import cho_solve, cho_factor, eigh
from itertools import product
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from scipy.linalg import LinAlgError
arff_file = arff.loadarff('dataset/MagicTelescope.arff')
data = pd.DataFrame(arff_file[0])


In [3]:
data['labels'] = (data['class:'] == b'g').astype(int)
labels = data['labels'].to_numpy()
features = data.iloc[:,1:-2].to_numpy()

scaler = MinMaxScaler()
scaler.fit(features)
features = scaler.transform(features)

In [4]:
def KRR_global(X_train,Y_train,X_test,best_params):
    '''
    Returns the Kernel Ridge Regression predictions using a Gaussian kernel.
    '''
    lam = best_params['lambda']
    sigma = best_params['length']
       
    dist = scipy.spatial.distance_matrix(X_train, X_train)
    exponent = dist/sigma

    K = np.exp(-(exponent**2)/2)

    K += np.eye(K.shape[0])*lam

    try:
        L = scipy.linalg.cholesky(K, lower=True)
    except:
        return 'Gram Matrix is not positive definite'
    else:
        try:
            alphas = cho_solve((L,True),Y_train)
        except:
            return 'Cholesky decomposition failed, check distance matrices'
        else:
            dist2 = scipy.spatial.distance_matrix(X_train, X_test)
            exponent2 = dist2/sigma

            K_test = np.exp(-(exponent2**2)/2)

            scores = np.dot(K_test.T,alphas)
            # Apply sigmoid to get probabilities (binary classification)
            probabilities = 1 / (1 + np.exp(-scores))

            # Classify based on threshold (e.g., 0.5)
            predictions = (probabilities > 0.5).astype(int)
            return predictions


def vector_std(X):
    X_mean = np.mean(X, axis=0)
    X_var = np.mean((X - X_mean) ** 2, axis=0)
    return np.sqrt(np.sum(X_var))

def GridSearchCV(X, Y, params, cv=4):
    kf = KFold(n_splits=cv)
    X_train, X_test = [], []
    Y_train, Y_test = [], []
    for train, test in kf.split(X):
        X_train.append(X[train])
        Y_train.append(Y[train])
        X_test.append(X[test])
        Y_test.append(Y[test])

    best_score = -np.inf  # Maximizing accuracy
    for lam, sigma in product(params['lambda'], params['length']):
        print(lam, sigma)
        accuracies = []
        for k in range(cv):
            preds = KRR_global(X_train[k], Y_train[k], X_test[k], {'lambda': lam, 'length': sigma})
            if type(preds) == str:
                accuracy = 0
            else:
                accuracy = np.mean(preds == Y_test[k])
            accuracies.append(accuracy)
        mean_accuracy = np.mean(accuracies)
        if mean_accuracy > best_score:
            best_score = mean_accuracy
            best_lambda = lam
            best_sigma = sigma
    
    return {'accuracy': best_score, 'lambda': best_lambda, 'length': best_sigma}

X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2)
print(X_train.shape)
# initial_sigma_guess = vector_std(X_train)
# param_grid = {'length': [initial_sigma_guess*10**i for i in range(-2,3)],
#               'lambda': [10**i for i in [-14, -10, -6]]}

# best_params = GridSearchCV(X_train, Y_train, param_grid)

(15216, 10)


In [6]:
best_params = {
    "lambda" : 10**-2,
    "length" : vector_std(X_train)**-2
}
print(f"best_params:\n{best_params}")
preds = KRR_global(X_train, Y_train, X_test, best_params)
# print(f"preds: {preds}")
accuracy = np.mean(preds == Y_test)
print(f'accuracy: {accuracy}')

best_params:
{'lambda': 0.01, 'length': 4.290673174692527}
chol on K


LinAlgError: 1318-th leading minor of the array is not positive definite

In [4]:
# best_params

In [5]:
# preds = KRR_global(X_train, Y_train, X_test, best_params)
# accuracy = np.mean(preds == Y_test)
# accuracy

In [7]:
x = [[0,0],[1,0],[2,0]]
y = [[0,0],[0,0],[0,0]]

dist = scipy.spatial.distance_matrix(x,x)

In [None]:
print(dist)