In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
from utilities import *
from random import sample
import scipy
import warnings
warnings.filterwarnings('ignore')

In [2]:
def l_cal_sample(X, y, ell):
    index_pos = [x for x, z in enumerate(y) if z == 1] 
    index_neg = [x for x, z in enumerate(y) if z == -1]
    index_l_cal = sample(index_pos, ell) + sample(index_neg, ell)
    return index_l_cal

def adjacency_weight_matrix(X, y):
    m = X.shape[0]
    W = np.zeros((m,m))
    tree = KDTree(X)
    for i in range(m):
        neighbors = tree.query([X[i,:]],k=4)[1][0][1:]
        for j in neighbors:
            W[i,j] = 1
            W[j,i] = 1
    return W

def degree_matrix(adjacency_weight_matrix):
    diag_values = np.sum(adjacency_weight_matrix, axis=1)
    return np.diag(diag_values)

def laplacian_matrix(D, W):
    return D - W

def solver_LI(D, W, y):
    return np.linalg.inv(D) @ W @ np.reshape(y, (y.shape[0],1))

def kernel_matrix(L, l_cal):
    L_pinv = np.linalg.pinv(L)
    index_matrix = np.zeros(L_pinv.shape)

    for i in l_cal:
        for j in l_cal:
            index_matrix[i,j] = 1
    # print(index_matrix)
    full_matrix = np.multiply(L_pinv, index_matrix)
    K_tmp = [] # len(K) = m
    for i in range(full_matrix.shape[0]):
        K_tmp.append(full_matrix[i, full_matrix[i,:] != 0])

    K = [ele for ele in K_tmp if ele.size > 0]
    return np.asarray(K)

def solver_LKI(L, l_cal, y):
    K = kernel_matrix(L, l_cal)
    y_l_cal = np.reshape(np.asarray(y[l_cal]), (len(l_cal), 1))
    alpha_star = np.linalg.pinv(K) @ y_l_cal
    alpha_star_full = np.asarray([0 if i not in l_cal else alpha_star[np.where(np.asarray(l_cal) == i)]  for i in range(len(y))])

    L_pinv = np.linalg.pinv(L)
    e_matrix = np.zeros((len(y), len(y)))
    [e_matrix[i,i] == 1 for i in range(len(y)) if i in l_cal]
    for i in range(len(y)):
        if i in l_cal:
            e_matrix[i,i] = 1
            
    V_matrix = np.reshape(alpha_star_full, (1, alpha_star_full.size)) @ e_matrix @ L_pinv
    V_vector = np.sum(V_matrix, 0)

    return V_vector

def sign(X):
    # assert type(x) == 'float'
    res = []
    for x in X:
        if x >= 0:
            res.append(1)
        elif x < 0:
            res.append(-1)
    return np.asarray(res)

def empirical_gene_error(y, y_hat, l_cal):
    assert len(y) == len(y_hat)
    num_all = len(y) - len(l_cal)
    error = y - y_hat
    for i in l_cal:
        error[i] = 0
    num_error = np.count_nonzero(error)
    return num_error/num_all

def summary(summary_dict:dict, ell_set) -> pd.DataFrame:
    df = pd.DataFrame(columns=ell_set)
    for file, value in summary_dict.items():
        row = []
        for _, error_list in value.items():
            row.append(str(round(np.mean(error_list), 4)) + ' +_ ' + str(round(np.std(error_list), 4)))
        df.loc[file[14:-4]] = row
    return df

In [3]:
error_summary_LI, error_summary_LKI = {}, {}
for dataset_path in ['data/dtrain13_50.dat', 'data/dtrain13_100.dat', 'data/dtrain13_200.dat', 'data/dtrain13_400.dat']:
    X, y = data_reader(dataset_path)
    y = y - 2
    error_file_LI, error_file_LKI = {}, {}
    ell_set = [1, 2, 4, 8, 16]
    for ell in ell_set:
        error_ell_LI, error_ell_LKI = [], []
        for _ in range(20):
            l_cal = l_cal_sample(X, y, ell)
            W = adjacency_weight_matrix(X,y)
            D = degree_matrix(W)
            L = laplacian_matrix(D, W)
            # LI
            V_LI = solver_LI(D, W, y)
            # LKI
            V_LKI = solver_LKI(L,l_cal, y)
            
            error_LI = empirical_gene_error(y, sign(V_LI), l_cal)
            error_LKI = empirical_gene_error(y, sign(V_LKI), l_cal)

            error_ell_LI.append(error_LI)
            error_ell_LKI.append(error_LKI)

        error_file_LI[ell] = error_ell_LI
        error_file_LKI[ell] = error_ell_LKI
    
    error_summary_LI[dataset_path] = error_file_LI
    error_summary_LKI[dataset_path] = error_file_LKI

df1 = summary(error_summary_LI, ell_set)
df2 = summary(error_summary_LKI, ell_set)
            
            

In [4]:
df1

Unnamed: 0,1,2,4,8,16
50,0.0301 +_ 0.0022,0.0312 +_ 0.0,0.0299 +_ 0.0047,0.0304 +_ 0.0059,0.0287 +_ 0.0098
100,0.0197 +_ 0.0015,0.0191 +_ 0.0022,0.0201 +_ 0.0019,0.0209 +_ 0.0019,0.0205 +_ 0.0044
200,0.0074 +_ 0.0005,0.0074 +_ 0.0006,0.0074 +_ 0.0008,0.0074 +_ 0.0009,0.0077 +_ 0.001
400,0.0063 +_ 0.0,0.0063 +_ 0.0,0.0062 +_ 0.0003,0.0063 +_ 0.0003,0.0063 +_ 0.0005


In [5]:
df2

Unnamed: 0,1,2,4,8,16
50,0.1515 +_ 0.0924,0.0719 +_ 0.0422,0.0777 +_ 0.0589,0.0643 +_ 0.0329,0.05 +_ 0.022
100,0.0952 +_ 0.1918,0.0434 +_ 0.0215,0.0409 +_ 0.0047,0.0375 +_ 0.0069,0.0369 +_ 0.0111
200,0.0313 +_ 0.0341,0.0176 +_ 0.0059,0.0184 +_ 0.0033,0.0156 +_ 0.0033,0.0166 +_ 0.0042
400,0.0305 +_ 0.0753,0.012 +_ 0.0025,0.0125 +_ 0.0024,0.0119 +_ 0.0018,0.0114 +_ 0.002
