In [23]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 
import regex as re
import time
from csv import reader

from classifiers import *
from metrics import *
from kernels import *

In [20]:
def features_into_array(path):
    """Read the second column of a CSV file into a NumPy array.

    The first line of the file is treated as a header and discarded. For each
    remaining non-blank row, the value at column index 1 is collected.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with the column-1 string of every data row, in
        file order (NumPy chooses a fixed-width unicode dtype wide enough for
        the longest entry). If the file is empty or only contains a header,
        an empty array is returned.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than two columns.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(path, 'r', newline='') as read_obj:
        csv_reader = reader(read_obj)
        # A default of None avoids StopIteration on a completely empty file.
        header = next(csv_reader, None)
        if header is None:
            return np.array([])
        # csv.reader yields [] for blank lines; skip them so that a trailing
        # empty line does not make row[1] fail.
        X = [row[1] for row in csv_reader if row]

    return np.array(X)

In [176]:
def voting_pred(X, list_K, list_alpha):
    """Combine several kernel classifiers by strict majority vote.

    Classifier ``i`` votes 1 for a sample when the matching entry of
    ``list_K[i] @ list_alpha[i]`` (coefficients flattened to 1-D) is
    non-negative, and 0 otherwise. The ensemble predicts 1 for a sample only
    when more than half of the classifiers voted 1, so a tie yields 0.

    Parameters
    ----------
    X : array-like
        Only the size of its first dimension is used (number of samples).
    list_K : sequence of 2-D arrays
        One matrix per classifier; multiplied by the flattened coefficients
        it must give one score per sample.
    list_alpha : sequence of array-like
        Coefficients of each classifier, indexed like ``list_K``.

    Returns
    -------
    numpy.ndarray
        Float vector with one 0/1 entry per sample.
    """
    n_models = len(list_K)
    n_points = np.shape(X)[0]

    # Row i holds the 0/1 votes of classifier i for every sample.
    ballots = np.zeros((n_models, n_points))
    for idx, gram in enumerate(list_K):
        coeffs = np.reshape(list_alpha[idx], -1)
        ballots[idx] = gram @ coeffs >= 0

    # Count the votes for class 1 per sample and keep strict majorities only.
    ones_per_point = (ballots == 1).sum(axis=0)
    return np.where(ones_per_point > n_models / 2, 1.0, 0.0)

# DATASET 0 

In [24]:
## Precomputed kernel files for dataset 0 (K_train / K_val / K_te), loaded
## once per kernel family: suffix _s = spectrum, suffix _m = mismatch.
PATH = 'data/spectrum/'
Ktr0_s, Kval0_s, Kte0_s = [
    np.load(PATH + fname)
    for fname in ('K_train0.npy', 'K_val0.npy', 'K_te0.npy')
]

PATH = 'data/mismatch/'
Ktr0_m, Kval0_m, Kte0_m = [
    np.load(PATH + fname)
    for fname in ('K_train0.npy', 'K_val0.npy', 'K_te0.npy')
]


In [68]:
# Column 1 of the feature CSVs, plus the label table, for dataset 0.
Xtr0_ = features_into_array("data/Xtr0.csv")
Xte0 = features_into_array("data/Xte0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

# Saved train/validation split, cast to int so it can be used for indexing.
train_idx_0, val_idx_0 = (
    np.load(split_file).astype(int)
    for split_file in (
        'data/train_test_split/train_idx_0.npy',
        'data/train_test_split/val_idx_0.npy',
    )
)

# Column 1 of the label table is the target handed to the classifiers; the
# same row selections are applied to the features.
ytr0, yval0 = Ytr0[train_idx_0][:, 1], Ytr0[val_idx_0][:, 1]
Xtr0, Xval0 = Xtr0_[train_idx_0], Xtr0_[val_idx_0]


In [165]:
# Join both kernel families along the first axis, spectrum first, which is
# the same ordering as the rows of alphas_tr0.
Kval0_all = np.concatenate([Kval0_s, Kval0_m], axis=0)
print(Kval0_all.shape)

(12, 400, 1600)


In [166]:
# One regularisation value per kernel of each family (shown as "lambda" in
# the training log printed below).
C0_s = [0.002, 0.003, 0.009, 0.01, 0.02, 0.02]
C0_m = [0.0004, 0.0002, 0.0001, 0.0004, 0.0003, 0.0006]
nb_classifier_s, nb_classifier_m = len(C0_s), len(C0_m)
nb_classifier_tot = nb_classifier_s + nb_classifier_m

# Row layout of both result arrays: spectrum classifiers first, then mismatch.
alphas_tr0 = np.zeros((nb_classifier_tot, ytr0.shape[0]))
accuracies_val0 = np.zeros(nb_classifier_tot)

# Spectrum kernels fill rows 0 .. nb_classifier_s - 1.
for i, lam in enumerate(C0_s):
    C = [lam]
    alpha_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(
        Ktr0_s[i], ytr0, Kval0_s[i], yval0, C
    )
    alphas_tr0[i], accuracies_val0[i] = alpha_tr0[0], acc_val0[0]

# Mismatch kernels fill the remaining rows.
for i, lam in enumerate(C0_m):
    C = [lam]
    alpha_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(
        Ktr0_m[i], ytr0, Kval0_m[i], yval0, C
    )
    alphas_tr0[nb_classifier_s + i] = alpha_tr0[0]
    accuracies_val0[nb_classifier_s + i] = acc_val0[0]

---------------  lambda = 0.002  ---------------
Training: loss = 0.686072, accuracy = 0.727500
Validation: loss = 0.798497, accuracy = 0.637500
---------------  lambda = 0.003  ---------------
Training: loss = 0.587996, accuracy = 0.821875
Validation: loss = 0.797911, accuracy = 0.647500
---------------  lambda = 0.009  ---------------
Training: loss = 0.227732, accuracy = 0.970625
Validation: loss = 0.795442, accuracy = 0.657500
---------------  lambda = 0.01  ---------------
Training: loss = 0.126440, accuracy = 0.988125
Validation: loss = 0.842382, accuracy = 0.635000
---------------  lambda = 0.02  ---------------
Training: loss = 0.008491, accuracy = 0.997500
Validation: loss = 0.884466, accuracy = 0.610000
---------------  lambda = 0.02  ---------------
Training: loss = 0.006651, accuracy = 0.998750
Validation: loss = 0.898766, accuracy = 0.592500
---------------  lambda = 0.0004  ---------------
Training: loss = 0.632025, accuracy = 0.718750
Validation: loss = 0.791007, accurac

## Majority voting over all spectrum kernels : 

In [170]:
# Vote among the spectrum classifiers only: they occupy the first
# nb_classifier_s rows of alphas_tr0.
alphas_vote = alphas_tr0[:nb_classifier_s]
prediction_voting = voting_pred(X=Xval0, list_K=Kval0_s, list_alpha=alphas_vote)
acc_vote = accuracy(yval0, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6525


## Majority voting over all mismatch kernels : 

In [175]:
# Vote among the mismatch classifiers only: their coefficients are the rows
# of alphas_tr0 that follow the spectrum ones.
alphas_vote = alphas_tr0[nb_classifier_s:]
prediction_voting = voting_pred(X=Xval0, list_K=Kval0_m, list_alpha=alphas_vote)
acc_vote = accuracy(yval0, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6775


## Majority voting over all kernels : 


In [173]:
# Vote among every classifier; Kval0_all and alphas_tr0 are both ordered
# spectrum first, mismatch second.
prediction_voting = voting_pred(X=Xval0, list_K=Kval0_all, list_alpha=alphas_tr0)
acc_vote = accuracy(yval0, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6675


## Majority voting over all mismatch kernels except k = 5

In [174]:
# Leave out the first mismatch classifier on both sides: kernel 0 of Kval0_m
# and its coefficient row (index nb_classifier_s) in alphas_tr0.
kernels_vote = Kval0_m[1:]
alphas_vote = alphas_tr0[nb_classifier_s + 1:]
prediction_voting = voting_pred(X=Xval0, list_K=kernels_vote, list_alpha=alphas_vote)
acc_vote = accuracy(yval0, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6775


# DATASET 1

In [181]:
## Precomputed kernel files for dataset 1 (K_train / K_val / K_te), loaded
## once per kernel family: suffix _s = spectrum, suffix _m = mismatch.
PATH = 'data/spectrum/'
Ktr1_s, Kval1_s, Kte1_s = [
    np.load(PATH + fname)
    for fname in ('K_train1.npy', 'K_val1.npy', 'K_te1.npy')
]

PATH = 'data/mismatch/'
Ktr1_m, Kval1_m, Kte1_m = [
    np.load(PATH + fname)
    for fname in ('K_train1.npy', 'K_val1.npy', 'K_te1.npy')
]

In [182]:
# Column 1 of the feature CSVs, plus the label table, for dataset 1.
Xtr1_ = features_into_array("data/Xtr1.csv")
Xte1 = features_into_array("data/Xte1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

# Saved train/validation split, cast to int so it can be used for indexing.
train_idx_1, val_idx_1 = (
    np.load(split_file).astype(int)
    for split_file in (
        'data/train_test_split/train_idx_1.npy',
        'data/train_test_split/val_idx_1.npy',
    )
)

# The stored indices are shifted by 2000 before they are used on this
# dataset's own arrays (presumably they are global ids -- TODO confirm).
rows_tr1, rows_val1 = train_idx_1 - 2000, val_idx_1 - 2000

# Column 1 of the label table is the target handed to the classifiers; the
# same row selections are applied to the features.
ytr1, yval1 = Ytr1[rows_tr1][:, 1], Ytr1[rows_val1][:, 1]
Xtr1, Xval1 = Xtr1_[rows_tr1], Xtr1_[rows_val1]

In [184]:
# Join both kernel families along the first axis, spectrum first, which is
# the same ordering as the rows of alphas_tr1.
Kval1_all = np.concatenate([Kval1_s, Kval1_m], axis=0)
print(Kval1_all.shape)

(12, 400, 1600)


In [185]:
# One regularisation value per kernel of each family (shown as "lambda" in
# the training log printed below).
C1_s = [0.005, 0.004, 0.007, 0.009, 0.01, 0.02]
C1_m = [0.0003, 0.0004, 0.0002, 0.0005, 0.0004, 0.0003]
nb_classifier_s, nb_classifier_m = len(C1_s), len(C1_m)
nb_classifier_tot = nb_classifier_s + nb_classifier_m

# Row layout of both result arrays: spectrum classifiers first, then mismatch.
alphas_tr1 = np.zeros((nb_classifier_tot, ytr1.shape[0]))
accuracies_val1 = np.zeros(nb_classifier_tot)

# Spectrum kernels fill rows 0 .. nb_classifier_s - 1.
for i, lam in enumerate(C1_s):
    C = [lam]
    alpha_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(
        Ktr1_s[i], ytr1, Kval1_s[i], yval1, C
    )
    alphas_tr1[i], accuracies_val1[i] = alpha_tr1[0], acc_val1[0]

# Mismatch kernels fill the remaining rows.
for i, lam in enumerate(C1_m):
    C = [lam]
    alpha_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(
        Ktr1_m[i], ytr1, Kval1_m[i], yval1, C
    )
    alphas_tr1[nb_classifier_s + i] = alpha_tr1[0]
    accuracies_val1[nb_classifier_s + i] = acc_val1[0]

---------------  lambda = 0.005  ---------------
Training: loss = 0.559067, accuracy = 0.791875
Validation: loss = 0.791559, accuracy = 0.642500
---------------  lambda = 0.004  ---------------
Training: loss = 0.524851, accuracy = 0.867500
Validation: loss = 0.809016, accuracy = 0.645000
---------------  lambda = 0.007  ---------------
Training: loss = 0.316770, accuracy = 0.974375
Validation: loss = 0.831072, accuracy = 0.660000
---------------  lambda = 0.009  ---------------
Training: loss = 0.168295, accuracy = 0.998125
Validation: loss = 0.866640, accuracy = 0.667500
---------------  lambda = 0.01  ---------------
Training: loss = 0.091733, accuracy = 1.000000
Validation: loss = 0.910978, accuracy = 0.617500
---------------  lambda = 0.02  ---------------
Training: loss = 0.000018, accuracy = 1.000000
Validation: loss = 0.935424, accuracy = 0.585000
---------------  lambda = 0.0003  ---------------
Training: loss = 0.655350, accuracy = 0.730000
Validation: loss = 0.818788, accura

## Majority voting over all spectrum kernels : 

In [186]:
# Vote among the spectrum classifiers only: they occupy the first
# nb_classifier_s rows of alphas_tr1.
alphas_vote = alphas_tr1[:nb_classifier_s]
prediction_voting = voting_pred(X=Xval1, list_K=Kval1_s, list_alpha=alphas_vote)
acc_vote = accuracy(yval1, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6575


## Majority voting over all mismatch kernels : 


In [187]:
# Vote among the mismatch classifiers only: their coefficients are the rows
# of alphas_tr1 that follow the spectrum ones.
alphas_vote = alphas_tr1[nb_classifier_s:]
prediction_voting = voting_pred(X=Xval1, list_K=Kval1_m, list_alpha=alphas_vote)
acc_vote = accuracy(yval1, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6825


## Majority voting over all kernels : 

In [188]:
# Vote among every classifier; Kval1_all and alphas_tr1 are both ordered
# spectrum first, mismatch second.
prediction_voting = voting_pred(X=Xval1, list_K=Kval1_all, list_alpha=alphas_tr1)
acc_vote = accuracy(yval1, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6775


## Majority voting over all mismatch kernels except k = 5

In [189]:
# Leave out the first mismatch classifier on both sides: kernel 0 of Kval1_m
# and its coefficient row (index nb_classifier_s) in alphas_tr1.
kernels_vote = Kval1_m[1:]
alphas_vote = alphas_tr1[nb_classifier_s + 1:]
prediction_voting = voting_pred(X=Xval1, list_K=kernels_vote, list_alpha=alphas_vote)
acc_vote = accuracy(yval1, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.6775


# DATASET 2

In [193]:
## Precomputed kernel files for dataset 2 (K_train / K_val / K_te), loaded
## once per kernel family: suffix _s = spectrum, suffix _m = mismatch.
PATH = 'data/spectrum/'
Ktr2_s, Kval2_s, Kte2_s = [
    np.load(PATH + fname)
    for fname in ('K_train2.npy', 'K_val2.npy', 'K_te2.npy')
]

PATH = 'data/mismatch/'
Ktr2_m, Kval2_m, Kte2_m = [
    np.load(PATH + fname)
    for fname in ('K_train2.npy', 'K_val2.npy', 'K_te2.npy')
]

In [194]:
# Column 1 of the feature CSVs, plus the label table, for dataset 2.
Xtr2_ = features_into_array("data/Xtr2.csv")
Xte2 = features_into_array("data/Xte2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

# Saved train/validation split, cast to int so it can be used for indexing.
train_idx_2, val_idx_2 = (
    np.load(split_file).astype(int)
    for split_file in (
        'data/train_test_split/train_idx_2.npy',
        'data/train_test_split/val_idx_2.npy',
    )
)

# The stored indices are shifted by 4000 before they are used on this
# dataset's own arrays (presumably they are global ids -- TODO confirm).
rows_tr2, rows_val2 = train_idx_2 - 4000, val_idx_2 - 4000

# Column 1 of the label table is the target handed to the classifiers; the
# same row selections are applied to the features.
ytr2, yval2 = Ytr2[rows_tr2][:, 1], Ytr2[rows_val2][:, 1]
Xtr2, Xval2 = Xtr2_[rows_tr2], Xtr2_[rows_val2]

In [195]:
# Join both kernel families along the first axis, spectrum first, which is
# the same ordering as the rows of alphas_tr2.
Kval2_all = np.concatenate([Kval2_s, Kval2_m], axis=0)
print(Kval2_all.shape)

(12, 400, 1600)


In [196]:
# One regularisation value per kernel of each family (shown as "lambda" in
# the training log printed below).
C2_s = [0.0002, 0.0005, 0.0005, 0.0004, 0.0004, 0.001]
C2_m = [0.006, 0.008, 0.01, 0.08, 0.02, 0.02]
nb_classifier_s, nb_classifier_m = len(C2_s), len(C2_m)
nb_classifier_tot = nb_classifier_s + nb_classifier_m

# Row layout of both result arrays: spectrum classifiers first, then mismatch.
alphas_tr2 = np.zeros((nb_classifier_tot, ytr2.shape[0]))
accuracies_val2 = np.zeros(nb_classifier_tot)

# Spectrum kernels fill rows 0 .. nb_classifier_s - 1.
for i, lam in enumerate(C2_s):
    C = [lam]
    alpha_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(
        Ktr2_s[i], ytr2, Kval2_s[i], yval2, C
    )
    alphas_tr2[i], accuracies_val2[i] = alpha_tr2[0], acc_val2[0]

# Mismatch kernels fill the remaining rows.
for i, lam in enumerate(C2_m):
    C = [lam]
    alpha_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(
        Ktr2_m[i], ytr2, Kval2_m[i], yval2, C
    )
    alphas_tr2[nb_classifier_s + i] = alpha_tr2[0]
    accuracies_val2[nb_classifier_s + i] = acc_val2[0]

---------------  lambda = 0.0002  ---------------
Training: loss = 0.854027, accuracy = 0.699375
Validation: loss = 0.867939, accuracy = 0.667500
---------------  lambda = 0.0005  ---------------
Training: loss = 0.780414, accuracy = 0.719375
Validation: loss = 0.812058, accuracy = 0.657500
---------------  lambda = 0.0005  ---------------
Training: loss = 0.829778, accuracy = 0.839375
Validation: loss = 0.857525, accuracy = 0.675000
---------------  lambda = 0.0004  ---------------
Training: loss = 0.883438, accuracy = 0.956875
Validation: loss = 0.903911, accuracy = 0.702500
---------------  lambda = 0.0004  ---------------
Training: loss = 0.900441, accuracy = 0.987500
Validation: loss = 0.923077, accuracy = 0.707500
---------------  lambda = 0.001  ---------------
Training: loss = 0.814258, accuracy = 0.992500
Validation: loss = 0.880315, accuracy = 0.697500
---------------  lambda = 0.006  ---------------
Training: loss = 0.341788, accuracy = 0.861250
Validation: loss = 0.760811, 

## Majority voting over all spectrum kernels : 

In [197]:
# Vote among the spectrum classifiers only: they occupy the first
# nb_classifier_s rows of alphas_tr2.
alphas_vote = alphas_tr2[:nb_classifier_s]
prediction_voting = voting_pred(X=Xval2, list_K=Kval2_s, list_alpha=alphas_vote)
acc_vote = accuracy(yval2, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.705


## Majority voting over all mismatch kernels : 



In [198]:
# Vote among the mismatch classifiers only: their coefficients are the rows
# of alphas_tr2 that follow the spectrum ones.
alphas_vote = alphas_tr2[nb_classifier_s:]
prediction_voting = voting_pred(X=Xval2, list_K=Kval2_m, list_alpha=alphas_vote)
acc_vote = accuracy(yval2, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.7575


## Majority voting over all kernels : 

In [199]:
# Vote among every classifier; Kval2_all and alphas_tr2 are both ordered
# spectrum first, mismatch second.
prediction_voting = voting_pred(X=Xval2, list_K=Kval2_all, list_alpha=alphas_tr2)
acc_vote = accuracy(yval2, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.73


## Majority voting over all mismatch kernels except k = 5

In [200]:
# Leave out the first mismatch classifier on both sides: kernel 0 of Kval2_m
# and its coefficient row (index nb_classifier_s) in alphas_tr2.
kernels_vote = Kval2_m[1:]
alphas_vote = alphas_tr2[nb_classifier_s + 1:]
prediction_voting = voting_pred(X=Xval2, list_K=kernels_vote, list_alpha=alphas_vote)
acc_vote = accuracy(yval2, prediction_voting, mode='blabla')
print("Accuracy of voting ", acc_vote)

Accuracy of voting  0.77
