In [10]:
import numpy as np
import sklearn.utils.extmath as sm
from numpy.linalg import inv
from numpy.linalg import eig
from numpy import dot, diag
from scipy.linalg import sqrtm
from scipy.spatial.distance import euclidean
import random, math
np.random.seed(42)
from scipy.optimize import minimize
import matplotlib.pyplot as plt

# Cluster kernel

## K Matrix Paper

In [52]:
def fill_diag(M, a):
    """
    M: square matrix
    a: array of length number of rows
    ----
    fill the diagonal of M with values of array a
    """
    s = M.shape
    D = np.zeros(s)
    for i in range(s[0]):
        D[i,i] = a[i]
    return D

def rbf_kernel(X, sigma=1):
    K = np.zeros((len(X), len(X)))
    for a in range(len(X)):
        for b in range(len(X)):
            K[a, b] = rbf_function(X[a], X[b],sigma)
    return K
            
def rbf_function(x, y, sigma=1):
    exponent = - (euclidean(x, y) ** 2) / (2 * (sigma ** 2))
    return np.exp(exponent)


def diagonal_row_sum_matrix(M):
    rows_sum = M.sum(axis = 1)
    return fill_diag(M,rows_sum)

def computeL(D,K):
    Dinv = inv(D)
    return sqrtm(Dinv).dot(K).dot(sqrtm(Dinv))

def pick_eigs(eigen_vals, eigen_vect, k):
    if k > len(eigen_vals):
        k = len(eigen_vals)
    eig_vals = list(eigen_vals)
    new_eigs = []
    new_eigen_vector = []
    last_max_value = 0
    for i in range(0, k):
        argmax = eig_vals.index(max(eig_vals))
        new_eigen_vector.append(eigen_vect[argmax])
        new_eig = eig_vals.pop(argmax)
        new_eigs.append(new_eig)
        last_max_value = new_eig
        
    argmax = eig_vals.index(max(eig_vals))
    new_eig = eig_vals[argmax]
        
    while(new_eig == last_max_value):
        argmax = eig_vals.index(max(eig_vals))
        new_eigen_vector.append(eigen_vect[argmax])
        new_eig = eig_vals.pop(argmax)
        new_eigs.append(new_eig)
        
    return np.array(new_eigs), np.array(new_eigen_vector)      
    

def build_K(lambdaCut, transfer, X, n_clusters, sigma=5):
    
    #Step 1 - K matrix
    K = rbf_kernel(X, sigma)
    D = diagonal_row_sum_matrix(K)
    
    #Step 2 - L matrix
    L = computeL(D, K)
    eigen_vals, U = eig(L)
    eigen_vals, U = pick_eigs(eigen_vals, U, n_clusters)   
    
    Q = diag(eigen_vals)
    
    #Step 3 - Transfer Function
    #choosing lambdacut
    newEigen = transfer(eigen_vals, lambdaCut)
    newEigen = diag(newEigen)
    
    #Step 4 - New Kernel matrix
#     print(U.shape)26
#     print(newEigen.shape)22   6222 62 26
    newL = (U.T).dot(newEigen).dot((U))
    newD = inv(diag(diag(L)))
    newK = sqrtm(newD).dot(newL).dot(sqrtm(newD))
    return newK
    

#TRANSFER FUNCTION
def linear(vals, lambdaCut):
    return vals

def step(vals,lambdaCut):
    return [ 1 if x >= lambdaCut else 0 for x in vals ]

def linear_step(vals, lambdaCut):
    return [ x if x >= lambdaCut else 0 for x in vals ]

def polynomial(vals, exponent):
    return [ np.power(x, exponent) for x in vals ]

def polystep(vals, lambdaCut):
    return [ np.power(x, 2) if x > lambdaCut else np.power(x, 2) for x in vals ]

In [53]:
#dummy example
nb_samples = 6 #nb of samples
dim_sample = 4
X = np.random.rand(nb_samples,dim_sample)
print(X)
lambdaCut = 1
K = build_K(lambdaCut, linear, X, 2)
print(K)

[[0.78517596 0.19967378 0.51423444 0.59241457]
 [0.04645041 0.60754485 0.17052412 0.06505159]
 [0.94888554 0.96563203 0.80839735 0.30461377]
 [0.09767211 0.68423303 0.44015249 0.12203823]
 [0.49517691 0.03438852 0.9093204  0.25877998]
 [0.66252228 0.31171108 0.52006802 0.54671028]]
[[ 0.99772467 -0.90724979 -0.56719876  1.18768366 -1.51683295 -0.31833761]
 [-0.90724979  0.82497928  0.51576449 -1.07998307  1.3792847   0.28947037]
 [-0.56719876  0.51576449  0.32244811 -0.67518898  0.86230781  0.18097247]
 [ 1.18768366 -1.07998307 -0.67518898  1.41380937 -1.80562611 -0.37894661]
 [-1.51683295  1.3792847   0.86230781 -1.80562611  2.30602918  0.48396616]
 [-0.31833761  0.28947037  0.18097247 -0.37894661  0.48396616  0.10156994]]


# Text dataset

In [54]:
import pickle
import random
from sklearn import svm

In [55]:
# Text Classification Test
with open("../Dataset/textDataset.pickle", "rb") as fp:
    text = pickle.load(fp, encoding="latin-1")

In [56]:
vectors = []
labels = []
for s in text:
    instance = s[0].toarray()
    vectors.append(instance[0])
    labels.append(s[1])

In [9]:
# Dividing in the 2 classes
X1 = []
label_X1 = []
X2 = []
label_X2 = []
for i in range(0, len(labels)):
    if labels[i] == 0:
        X1.append(vectors[i])
        label_X1.append(labels[i])
    else:
        X2.append(vectors[i])
        label_X2.append(labels[i])
length_X1 = len(label_X1)
length_X2 = len(label_X2)

print(length_X1)
print(length_X2)

963
988


# SVM with K matrix

In [64]:
lambdaCut_or_polyDegree = 5
K = build_K(lambdaCut_or_polyDegree, polynomial, vectors, 2, sigma=0.5)

In [72]:
acc = {}
test_indexes = list(range(length_labels))
testX = K[np.ix_(test_indexes, train_indexes)]
testY = labels.copy()
n_labeled = 16
length_labels = len(labels)
length_labeled_over_2 = n_labeled//2
for i in range(0,10):
    #select training data
    trainingX=[]
    trainingY=[]
    train_indexes = []
    counter = 0
    #add n_labeled//2 points from class1
    while counter < length_labeled_over_2:
        index = random.randint(0,length_labels-1)
        label = labels[index]
        if label == 0:
            train_indexes.append(index)
            trainingY.append(label)
            counter+=1
    #add n_labeled//2 points from class2
    counter = 0
    while counter < length_labeled_over_2:
        index = random.randint(0,length_labels-1)
        label = labels[index]
        if label == 1:
            train_indexes.append(index)
            trainingY.append(label)
            counter+=1
    trainingX = K[np.ix_(train_indexes, train_indexes)]

    #train
    clf = svm.SVC(gamma='scale')
    clf.fit(trainingX, trainingY)
    #test

    pred = clf.predict(testX)
    accuracy = np.sum(testY == pred)/len(testY)
    print(accuracy)
    acc[fold] = accuracy
print('Mean of acc: ', sum(v for k,v in acc.items())/len(acc))

0.5064069707842133
0.5079446437724244
0.5033316248077909
0.5053818554587391
0.5064069707842133
0.4935930292157868
0.4935930292157868
0.49410558687852385
0.4925679138903127
0.5007688364941056
Mean of acc:  0.5007688364941056


# Norm

In [8]:
#test
acc = {}

for fold in range(0,100):
    #train samples
    n_labeled = 16
    length2 = n_labeled//2
    trainingX=[]
    trainingY=[]
    for i in range(0, length2):
        index = random.randint(0,length_X1-1)
        trainingX.append(X1[index])
        trainingY.append(label_X1[index])
    for i in range(0, length2):
        index = random.randint(0,length_X2-1)
        trainingX.append(X2[index])
        trainingY.append(label_X2[index])            
    #train
    clf = svm.SVC(gamma='scale')
    clf.fit(trainingX, trainingY)
    
    pred = clf.predict(vectors)
    accuracy = np.sum(labels == pred)/len(labels)
    print(accuracy)
    acc[fold] = accuracy
print('Mean of acc: ', sum(v for k,v in acc.items())/len(acc))

0.6289082521783701
0.5991799077396207
0.7068170169144029
0.7924141465914916
0.5366478728856996
0.7221937467965146
0.787801127626858
0.5658636596617119
0.7626858021527422
0.796514607893388
0.7057919015889288
0.7391081496668375
0.8195797027165556
0.5474115838031779
0.6078933880061507
0.8124038954382368
0.8021527421834956
0.7852383393131728
0.8042029728344439
0.6268580215274219
0.690927729369554
0.7549974372116863
0.7770374167093798
0.7647360328036904
0.779600205023065
0.8329062019477191
0.6868272680676576
0.4971809328549462
0.7785750896975909
0.8472578165043567
0.5361353152229625
0.5786776012301383
0.7811378780112763
0.7396207073295745
0.8011276268580215
0.5207585853408508
0.7734495130702204
0.7027165556125065
0.7662737057919016
0.7011788826242953
0.7186058431573552
0.7437211686314711
0.5771399282419272
0.7514095335725269
0.5966171194259354
0.7754997437211686
0.7406458226550487
0.8293182983085597
0.7042542286007176
0.8165043567401332
0.6217324449000513
0.8026652998462327
0.78831368528959