In [50]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 
import regex as re
import time

from classifiers import *
from metrics import *
from kernels import *

from sklearn.model_selection import train_test_split # lui il va partir mais pour l'instant c'est pratique


# Load the data

In [51]:
from csv import reader

def features_into_array(path):
    with open(path, 'r') as read_obj:
        csv_reader = reader(read_obj)
        header = next(csv_reader)
        X = list()
        if header != None:
            for row in csv_reader:
                # row variable is a list that represents a row in csv
                X.append(np.array(row[1]))
                
    X = np.array(X) ## dtype might be changed in something more convenient. For now, dtype = "<U1"
    return X

In [52]:
Xtr0 = features_into_array("data/Xtr0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1 = features_into_array("data/Xtr1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2 = features_into_array("data/Xtr2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

In [53]:
Xtr0[0]

'TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGCACCACCCAGTGGGAGACGACAGAGGTGGCAATGGGGTGTCGGCTCTGACGCCTCC'

## Spectrum kernel

For a fixed value k (that needs to be tuned), the k-spectrum kernel is defined as : 


\begin{align*}
K(x,x^{\prime}) := \sum_{u \in \mathcal{A}^k} \phi_{u}(x) \phi_{u}(x^{\prime})
\end{align*}

In [54]:
def all_possible_substrings(k):
    """
    With a k spectrum kernel, let us find all the possible combinations of chars of size k in the sequence x
    This way, we could index them in the sequence x
    """
    char_list = list(['A', 'C','G','T'])
    alphabet_tuples = list(product(char_list,repeat=k))
    alphabet = dict()
    idx=0
    for i in alphabet_tuples:
        alphabet[functools.reduce(operator.add, (i))] = idx
        idx += 1
        #alphabet.append(functools.reduce(operator.add, (i)))
    return alphabet

In [55]:
dict6 = all_possible_substrings(3)

In [56]:
def pre_indexing(X, k, alphabet=None):
    """
    Outputs a sparse matrix of shape Transforms an input array into a sparse matrix encoding the number of occurences of each letter of
    the alphabet composed of substrings of size k
    """
    i = 0
    n = X.shape[0]
    if alphabet is None:
        alphabet = all_possible_substrings(k)
    D = np.zeros((n,len(alphabet)))
    
    for i in range(X.shape[0]):
        idx=0
        while idx + k < len(X[i]):
            D[i, alphabet[X[i][idx:idx+k]]] += 1
            idx += 1
    """
    for x in X:
        d = dict((letter, len(re.findall(letter, x, overlapped=True))) 
                             for letter in alphabet)
        data = np.array(list(d.items()))
        D[i] = data[:,1]
        i+=1
    """
    D = csr_matrix(D, dtype = int)
    return D

In [57]:
k = 6
start_time = time.time()
alphabet_6 = all_possible_substrings(k)
mm = pre_indexing(Xtr0, 6, alphabet=alphabet_6)
print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))


--- Found alphabet in 1.0961740016937256 seconds ---


In [58]:
def spectrum_kernel(X_train, X_val, X_test, k, alphabet=None):
    # Kill two birds with one stone and compute K_train, K_val and K_test all at once.
    """
    Computes the spectrum kernels for X_train (n_train x n_train), X_validation and X_test
    (on the RKHS generated by X_train's samples) which is of shape n_validation x n_train (resp n_test x n_train)
    """
    if alphabet is None:
        #D_train = pre_indexing(X_train,k).toarray()
        #D_val = pre_indexing(X_val,k).toarray()
        alphabet = all_possible_substrings(k)
   
    D_train = pre_indexing(X_train,k,alphabet)
    D_val = pre_indexing(X_val,k,alphabet)
    D_test = pre_indexing(X_test,k,alphabet)
        
        
    #K_val = np.inner(D_val, D_train)
    #K_val = K_val.astype('float')
    
    K_train = D_train.dot(D_train.transpose())
    K_train = K_train.toarray().astype('float')
    
    K_val = D_val.dot(D_train.transpose())
    K_val = K_val.toarray().astype('float')
    
    K_test = D_test.dot(D_train.transpose())
    K_test = K_test.toarray().astype('float')
    
        
    return(K_train, K_val, K_test)

# Compute the spectrum-kernels for our data

Use the precomputed train_test_split that was used to compute the mismatch kernels in the other notebook.

In [59]:
#Xtr0_, Xval0_, ytr0, yval0 = train_test_split(Xtr0, Ytr0, test_size=0.2, random_state=42)
#Xtr1_, Xval1_, ytr1, yval1 = train_test_split(Xtr1, Ytr1, test_size=0.2, random_state=42)
#Xtr2_, Xval2_, ytr2, yval2 = train_test_split(Xtr2, Ytr2, test_size=0.2, random_state=42)
#
train_idx_0 = np.load("train_test_split/train_idx_0.npy").astype(int)
train_idx_1 = np.load("train_test_split/train_idx_1.npy").astype(int)
train_idx_2 = np.load("train_test_split/train_idx_2.npy").astype(int)

val_idx_0 = np.load("train_test_split/val_idx_0.npy").astype(int)
val_idx_1 = np.load("train_test_split/val_idx_1.npy").astype(int)
val_idx_2 = np.load("train_test_split/val_idx_2.npy").astype(int)

Xtr0_ = Xtr0[train_idx_0]
Xtr1_ = Xtr1[train_idx_1 - 2000]
Xtr2_ = Xtr2[train_idx_2 - 4000]

ytr0 = Ytr0[train_idx_0]
ytr1 = Ytr1[train_idx_1 - 2000]
ytr2 = Ytr2[train_idx_2 - 4000]


Xval0_ = Xtr0[val_idx_0]
Xval1_ = Xtr1[val_idx_1 - 2000]
Xval2_ = Xtr2[val_idx_2 - 4000]

yval0 = Ytr0[val_idx_0]
yval1 = Ytr1[val_idx_1 - 2000]
yval2 = Ytr2[val_idx_2 - 4000]


In [60]:
Xte0 = features_into_array("data/Xte0.csv")
Xte1 = features_into_array("data/Xte1.csv")
Xte2 = features_into_array("data/Xte2.csv")

In [62]:
k = 8
start_time = time.time()
alphabet_8 = all_possible_substrings(k)
print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))
start_time = time.time()

K_tr0, K_val0, K_te0 = spectrum_kernel(Xtr0_, Xval0_, Xte0, k, alphabet=alphabet_8,)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))
start_time = time.time()
K_tr1, K_val1, K_te1 = spectrum_kernel(Xtr1_, Xval1_, Xte1, k, alphabet=alphabet_8)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))
start_time = time.time()
K_tr2, K_val2, K_te2 = spectrum_kernel(Xtr2_, Xval2_, Xte2, k, alphabet=alphabet_8)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))

--- Found alphabet in 0.24821901321411133 seconds ---
--- Computed kernel in 6.105154037475586 seconds ---
--- Computed kernel in 6.009521245956421 seconds ---
--- Computed kernel in 7.061125040054321 seconds ---


# Training-Validation
(runs to make sure everything is ok)

In [17]:
lambdas = [0] + [10**i for i in range(-10,2)]

print("************* KRR for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KRR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KRR(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KRR(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas)


************* KRR for dataset 0*************

***********lambda = 0***********
Training: loss = 803.2332, accuracy = 1.000000
Validation: loss = 121.7379, accuracy = 0.585000
***********lambda = 1e-10***********
Training: loss = 803.2332, accuracy = 1.000000
Validation: loss = 121.7379, accuracy = 0.585000
***********lambda = 1e-09***********
Training: loss = 803.2332, accuracy = 1.000000
Validation: loss = 121.7379, accuracy = 0.585000
***********lambda = 1e-08***********
Training: loss = 803.2330, accuracy = 1.000000
Validation: loss = 121.7379, accuracy = 0.585000
***********lambda = 1e-07***********
Training: loss = 803.2318, accuracy = 1.000000
Validation: loss = 121.7378, accuracy = 0.585000
***********lambda = 1e-06***********
Training: loss = 803.2193, accuracy = 1.000000
Validation: loss = 121.7375, accuracy = 0.585000
***********lambda = 1e-05***********
Training: loss = 803.0939, accuracy = 1.000000
Validation: loss = 121.7344, accuracy = 0.585000
***********lambda = 0.0001*

In [18]:
lambdas = [10**i for i in range(-4,1)]

print("*************KLR for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KLR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas, tresh=1e-8)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KLR(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas, tresh=1e-8)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KLR(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas, tresh= 1e-8)


*************KLR for dataset 0*************

***********lambda = 0.0001***********
Training: loss = 0.3155, accuracy = 1.000000
Validation: loss = 0.6535, accuracy = 0.595000
***********lambda = 0.001***********
Training: loss = 0.3338, accuracy = 1.000000
Validation: loss = 0.6525, accuracy = 0.602500
***********lambda = 0.01***********
Training: loss = 0.4359, accuracy = 0.996875
Validation: loss = 0.6513, accuracy = 0.622500
***********lambda = 0.1***********
Training: loss = 0.6083, accuracy = 0.978750
Validation: loss = 0.6642, accuracy = 0.632500
***********lambda = 1***********
Training: loss = 0.6782, accuracy = 0.951875
Validation: loss = 0.6850, accuracy = 0.610000


************* SVM for dataset 1 *************

***********lambda = 0.0001***********
Training: loss = 0.3156, accuracy = 1.000000
Validation: loss = 0.6436, accuracy = 0.640000
***********lambda = 0.001***********
Training: loss = 0.3343, accuracy = 1.000000
Validation: loss = 0.6448, accuracy = 0.637500
********

In [19]:
lambdas = [10**i for i in range(-4, 3)]
print("************* SVM for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas)


************* SVM for dataset 0*************

---------------  lambda = 0.0001  ---------------
Training: loss = 0.977640, accuracy = 0.936250
Validation: loss = 0.986505, accuracy = 0.605000
---------------  lambda = 0.001  ---------------
Training: loss = 0.813766, accuracy = 0.956250
Validation: loss = 0.902269, accuracy = 0.617500
---------------  lambda = 0.01  ---------------
Training: loss = 0.126440, accuracy = 0.988125
Validation: loss = 0.842382, accuracy = 0.635000
---------------  lambda = 0.1  ---------------
Training: loss = 0.000000, accuracy = 1.000000
Validation: loss = 0.881470, accuracy = 0.602500
---------------  lambda = 1  ---------------
Training: loss = 0.000000, accuracy = 1.000000
Validation: loss = 0.881467, accuracy = 0.602500
---------------  lambda = 10  ---------------
Training: loss = 0.000000, accuracy = 1.000000
Validation: loss = 0.881474, accuracy = 0.602500
---------------  lambda = 100  ---------------
Training: loss = 0.000000, accuracy = 1.000000

## Gridsearch on SVM

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# sanity check to see that our SVM does a good job
# C = 1/(2*n*lambda) 
from sklearn import svm
clf = svm.SVC(kernel='precomputed', C =1.)
print("*"*15  + "Sklearn SVM on dataset 0 " + "*"*15)
clf.fit(K_tr0, ytr0[:,1])
print(clf.score(K_tr0, ytr0[:,1]))
print(clf.score(K_val0, yval0[:,1]))
lambd_C = [1]
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambd_C)

print("")
print("")
print("")

print("*"*15  + "Sklearn SVM on dataset 1 " + "*"*15)
clf.fit(K_tr1, ytr1[:,1])
print(clf.score(K_tr1, ytr1[:,1]))
print(clf.score(K_val1, yval1[:,1]))
lambd_C = [1]
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(K_tr1, ytr1[:,1], K_val1, yval1[:,1], lambd_C)

print("")
print("")
print("")

print("*"*15  + "Sklearn SVM on dataset 2 " + "*"*15)
clf.fit(K_tr2, ytr2[:,1])
print(clf.score(K_tr2, ytr2[:,1]))
print(clf.score(K_val2, yval2[:,1]))
lambd_C = [1]
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(K_tr2, ytr2[:,1], K_val2, yval2[:,1], lambd_C)


***************Sklearn SVM on dataset 0 ***************


NameError: name 'K_tr0' is not defined

### Okay cool almost there let's do gridsearch

#### For dataset 1

In [25]:
Kernels_tr_0 = []
Kernels_val_0 = []
Kernels_te_0 = []

Kernels_tr_1 = []
Kernels_val_1 = []
Kernels_te_1 = []

Kernels_tr_2 = []
Kernels_val_2 = []
Kernels_te_2 = []


for k in range(3,11):
    print("*"*15 + "treating k=" + str(k) + 15*"*")
    start_time = time.time()
    alphabet_k = all_possible_substrings(k)
    print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    K_tr0, K_val0, K_te0 = spectrum_kernel(Xtr0_, Xval0_, Xte0, k, alphabet=alphabet_k)
    Kernels_tr_0 += [K_tr0]
    Kernels_val_0 += [K_val0]
    Kernels_te_0 += [K_te0]
    
    K_tr1, K_val1, K_te1 = spectrum_kernel(Xtr1_, Xval1_, Xte1, k, alphabet=alphabet_k)
    Kernels_tr_1 += [K_tr1]
    Kernels_val_1 += [K_val1]
    Kernels_te_1 += [K_te1]
    
    K_tr2, K_val2, K_te2 = spectrum_kernel(Xtr2_, Xval2_, Xte2, k, alphabet=alphabet_k)
    Kernels_tr_2 += [K_tr2]
    Kernels_val_2 += [K_val2]
    Kernels_te_2 += [K_te2]
    
    print("--- Computed all the kernels in %s seconds ---" % (time.time() - start_time))
    print("")
    print("")
    print("")
    

***************treating k=3***************
--- Found alphabet in 0.0001838207244873047 seconds ---
--- Computed all the kernels in 4.689929246902466 seconds ---



***************treating k=4***************
--- Found alphabet in 0.0005118846893310547 seconds ---
--- Computed all the kernels in 4.623594045639038 seconds ---



***************treating k=5***************
--- Found alphabet in 0.0011050701141357422 seconds ---
--- Computed all the kernels in 4.013540029525757 seconds ---



***************treating k=6***************
--- Found alphabet in 0.005137205123901367 seconds ---
--- Computed all the kernels in 4.9780778884887695 seconds ---



***************treating k=7***************
--- Found alphabet in 0.03323197364807129 seconds ---
--- Computed all the kernels in 6.995905876159668 seconds ---



***************treating k=8***************
--- Found alphabet in 0.16201400756835938 seconds ---
--- Computed all the kernels in 16.233572721481323 seconds ---



***************trea

In [49]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

values_C = [j*10**i for i in range(-5,3) for j in range(1,10)]
parameters = {'C': values_C}
svm__ = svm.SVC(kernel='precomputed')
gs_k = GridSearchCV(svm__, param_grid=parameters, refit=True, verbose=0)

for k in range(2,9):
    
        
    print("*"*15 + "treating k = " + str(k+3) + 15*"*")
    print("")
    print("-"*15 + " treating dataset 0 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_0[k], ytr0[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_0[k], ytr0[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_0[k], yval0[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 1 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_1[k], ytr1[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_1[k], ytr1[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_1[k], yval1[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 2 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_2[k], ytr2[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_2[k], ytr2[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_2[k], yval2[:,1]))
    
    print("")
    print("")

***************treating k = 5***************

--------------- treating dataset 0 ---------------
SVC(C=0.002, kernel='precomputed')
training score for k = 5  0.72
validation score for k = 5  0.65

--------------- treating dataset 1 ---------------
SVC(C=0.005, kernel='precomputed')
training score for k = 5  0.791875
validation score for k = 5  0.64

--------------- treating dataset 2 ---------------
SVC(C=0.006, kernel='precomputed')
training score for k = 5  0.836875
validation score for k = 5  0.7075


***************treating k = 6***************

--------------- treating dataset 0 ---------------
SVC(C=0.003, kernel='precomputed')
training score for k = 6  0.791875
validation score for k = 6  0.6425

--------------- treating dataset 1 ---------------
SVC(C=0.004, kernel='precomputed')
training score for k = 6  0.86625
validation score for k = 6  0.6375

--------------- treating dataset 2 ---------------
SVC(C=0.008, kernel='precomputed')
training score for k = 6  0.926875
validation

IndexError: list index out of range

Saving the train kernels

In [37]:
KTR0 = np.asarray(Kernels_tr_0[2:])
KTR1 = np.asarray(Kernels_tr_1[2:])
KTR2 = np.asarray(Kernels_tr_2[2:])

In [38]:
print(KTR0.shape)
print(KTR1.shape)
print(KTR2.shape)

(6, 1600, 1600)
(6, 1600, 1600)
(6, 1600, 1600)


In [39]:
np.save("spectrum/K_train0.npy", KTR0)
np.save("spectrum/K_train1.npy", KTR1)
np.save("spectrum/K_train2.npy", KTR2)

Saving the validation kernels

In [40]:
KVAL0 = np.asarray(Kernels_val_0[2:])
KVAL1 = np.asarray(Kernels_val_1[2:])
KVAL2 = np.asarray(Kernels_val_2[2:])

In [41]:
print(KVAL0.shape)
print(KVAL1.shape)
print(KVAL2.shape)

(6, 400, 1600)
(6, 400, 1600)
(6, 400, 1600)


In [42]:
np.save("spectrum/K_val0.npy", KVAL0)
np.save("spectrum/K_val1.npy", KVAL1)
np.save("spectrum/K_val2.npy", KVAL2)

Saving the test kernels

In [46]:
KTE0 = np.asarray(Kernels_te_0[2:])
KTE1 = np.asarray(Kernels_te_1[2:])
KTE2 = np.asarray(Kernels_te_2[2:])

In [47]:
print(KTE0.shape)
print(KTE1.shape)
print(KTE2.shape)

(6, 1000, 1600)
(6, 1000, 1600)
(6, 1000, 1600)


In [48]:
np.save("spectrum/K_te0.npy", KTE0)
np.save("spectrum/K_te1.npy", KTE1)
np.save("spectrum/K_te2.npy", KTE2)

Let's do GridSearch using KernelRidge

In [25]:
from sklearn.kernel_ridge import KernelRidge

values_alpha = [j*10**i for i in range(-5,3) for j in range(1,10)]
parameters = {'alpha': values_alpha}
krr = KernelRidge(kernel='precomputed')
gs_krr = GridSearchCV(krr, param_grid=parameters, refit=True, verbose=0)

for k in range(2,8):
    
        
    print("*"*15 + "treating k = " + str(k+3) + 15*"*")
    print("")
    print("-"*15 + " treating dataset 0 "+ 15*"-")
    
    gs_krr.fit(Kernels_tr_0[k], ytr0[:,1])
    print(gs_krr.best_estimator_)
    print(f"training score for k = {k+3} ", gs_krr.score(Kernels_tr_0[k], ytr0[:,1]))
    print(f"validation score for k = {k+3} ", gs_krr.score(Kernels_val_0[k], yval0[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 1 "+ 15*"-")
    
    gs_krr.fit(Kernels_tr_1[k], ytr1[:,1])
    print(gs_krr.best_estimator_)
    print(f"training score for k = {k+3} ", gs_krr.score(Kernels_tr_1[k], ytr1[:,1]))
    print(f"validation score for k = {k+3} ", gs_krr.score(Kernels_val_1[k], yval1[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 2 "+ 15*"-")
    
    gs_krr.fit(Kernels_tr_2[k], ytr2[:,1])
    print(gs_krr.best_estimator_)
    print(f"training score for k = {k+3} ", gs_krr.score(Kernels_tr_2[k], ytr2[:,1]))
    print(f"validation score for k = {k+3} ", gs_krr.score(Kernels_val_2[k], yval2[:,1]))
    
    print("")
    print("")

***************treating k = 5***************

--------------- treating dataset 0 ---------------
KernelRidge(alpha=500, kernel='precomputed')
training score for k = 5  0.28300186354446866
validation score for k = 5  0.1223365457749519

--------------- treating dataset 1 ---------------
KernelRidge(alpha=300, kernel='precomputed')
training score for k = 5  0.36037269519040716
validation score for k = 5  0.1328436796250635

--------------- treating dataset 2 ---------------
KernelRidge(alpha=400, kernel='precomputed')
training score for k = 5  0.42481870027422874
validation score for k = 5  0.2579091857473338


***************treating k = 6***************

--------------- treating dataset 0 ---------------
KernelRidge(alpha=300, kernel='precomputed')
training score for k = 6  0.42659522546433337
validation score for k = 6  0.11325856461831074

--------------- treating dataset 1 ---------------
KernelRidge(alpha=200, kernel='precomputed')
training score for k = 6  0.517923299627612
valida

forget it its trash

# lets do for k = 11 and 12 too

    

In [None]:
Kernels_tr_0_12 = []
Kernels_val_0_12 = []
Kernels_te_0_12 = []

Kernels_tr_1_12 = []
Kernels_val_1_12 = []
Kernels_te_1_12 = []

Kernels_tr_2_12 = []
Kernels_val_2_12 = []
Kernels_te_2_12 = []


for k in range(11,13):
    print("*"*15 + "treating k=" + str(k) + 15*"*")
    start_time = time.time()
    alphabet_k = all_possible_substrings(k)
    print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    K_tr0, K_val0, K_te0 = spectrum_kernel(Xtr0_, Xval0_, Xte0, k, alphabet=alphabet_k)
    Kernels_tr_0_12 += [K_tr0]
    Kernels_val_0_12 += [K_val0]
    Kernels_te_0_12 += [K_te0]
    
    K_tr1, K_val1, K_te1 = spectrum_kernel(Xtr1_, Xval1_, Xte1, k, alphabet=alphabet_k)
    Kernels_tr_1_12 += [K_tr1]
    Kernels_val_1_12 += [K_val1]
    Kernels_te_1_12 += [K_te1]
    
    K_tr2, K_val2, K_te2 = spectrum_kernel(Xtr2_, Xval2_, Xte2, k, alphabet=alphabet_k)
    Kernels_tr_2_12 += [K_tr2]
    Kernels_val_2_12 += [K_val2]
    Kernels_te_2_12 += [K_te2]
    
    print("--- Computed all the kernels in %s seconds ---" % (time.time() - start_time))
    print("")
    print("")
    print("")

***************treating k=11***************
--- Found alphabet in 11.176947116851807 seconds ---


In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

values_C = [j*10**i for i in range(-5,3) for j in range(1,10)]
parameters = {'C': values_C}
svm__ = svm.SVC(kernel='precomputed')
gs_k = GridSearchCV(svm__, param_grid=parameters, refit=True, verbose=0)

for k in range(2):
    
        
    print("*"*15 + "treating k = " + str(k+11) + 15*"*")
    print("")
    print("-"*15 + " treating dataset 0 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_0_12[k], ytr0[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_0_12[k], ytr0[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_0_12[k], yval0[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 1 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_1_12[k], ytr1[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_1_12[k], ytr1[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_1_12[k], yval1[:,1]))
    
    print("")
    print("-"*15 + " treating dataset 2 "+ 15*"-")
    
    gs_k.fit(Kernels_tr_2_12[k], ytr2[:,1])
    print(gs_k.best_estimator_)
    print(f"training score for k = {k+3} ", gs_k.score(Kernels_tr_2_12[k], ytr2[:,1]))
    print(f"validation score for k = {k+3} ", gs_k.score(Kernels_val_2_12[k], yval2[:,1]))
    
    print("")
    print("")

# Save the best model

In [20]:
start_time = time.time()
alphabet_7 = all_possible_substrings(7)
print("--- Found alphabet in 7%s seconds ---" % (time.time() - start_time))
start_time = time.time()
alphabet_9 = all_possible_substrings(9)
print("--- Found alphabet in 9%s seconds ---" % (time.time() - start_time))
start_time = time.time()

K_tr0, K_val0, K_te0 = spectrum_kernel(Xtr0_, Xval0_, Xte0, 7, alphabet=alphabet_7)
K_tr1, K_val1, K_te1 = spectrum_kernel(Xtr1_, Xval1_, Xte1, 7, alphabet=alphabet_7)
K_tr2, K_val2, K_te2 = spectrum_kernel(Xtr2_, Xval2_, Xte2, 9, alphabet=alphabet_9)
    

--- Found alphabet in 70.059677839279174805 seconds ---
--- Found alphabet in 90.7294759750366211 seconds ---


In [21]:
# sanity check to see that our SVM does a good job
from sklearn import svm
clf0 = svm.SVC(kernel='precomputed', C = 0.009)
clf1 = svm.SVC(kernel='precomputed', C = 0.007)
clf2 = svm.SVC(kernel='precomputed', C = 0.08)

print("*"*15  + "Sklearn SVM on dataset 0 " + "*"*15)
clf0.fit(K_tr0, ytr0[:,1])
print(clf0.score(K_tr0, ytr0[:,1]))
print(clf0.score(K_val0, yval0[:,1]))
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], [0.009])

print("")
print("")
print("")

print("*"*15  + "Sklearn SVM on dataset 1 " + "*"*15)
clf1.fit(K_tr1, ytr1[:,1])
print(clf1.score(K_tr1, ytr1[:,1]))
print(clf1.score(K_val1, yval1[:,1]))
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(K_tr1, ytr1[:,1], K_val1, yval1[:,1], [0.007])

print("")
print("")
print("")

print("*"*15  + "Sklearn SVM on dataset 2 " + "*"*15)
clf2.fit(K_tr2, ytr2[:,1])
print(clf2.score(K_tr2, ytr2[:,1]))
print(clf2.score(K_val2, yval2[:,1]))
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(K_tr2, ytr2[:,1], K_val2, yval2[:,1], [0.08])


***************Sklearn SVM on dataset 0 ***************
0.968125
0.6725
---------------  lambda = 0.009  ---------------
Training: loss = 0.227732, accuracy = 0.970625
Validation: loss = 0.795442, accuracy = 0.657500



***************Sklearn SVM on dataset 1 ***************
0.97375
0.6625
---------------  lambda = 0.007  ---------------
Training: loss = 0.316770, accuracy = 0.974375
Validation: loss = 0.831072, accuracy = 0.660000



***************Sklearn SVM on dataset 2 ***************
1.0
0.7225
---------------  lambda = 0.08  ---------------
Training: loss = 0.000222, accuracy = 1.000000
Validation: loss = 0.732678, accuracy = 0.717500


In [50]:
import numpy as np
import pandas as pd

def write_predictions_csv_good(test_kernels, test_alphas, path, mode="SVM"):
    
    n = test_kernels[0].shape[0]
    print(n)
    predictions = np.zeros(3*n, dtype=int)
    
    for i in range(3):
        y_pred = test_kernels[i] @ test_alphas[i]
        if mode == 'SVM':
            print("entered mode SVM")
            y_pred_ = np.ones(n)
            y_pred_[y_pred < 0] = 0
        else:
            y_pred_ = np.zeros(n)
            y_pred_[y_pred >= 0.5] = 1
   
        predictions[n*i:n*(i+1)] = y_pred_
    
    #predictions = predictions.astype(int)
    pred = pd.DataFrame({"Bound" : predictions})
    print("saving predictions")
    pred.to_csv(path, index=True,index_label="Id")
    #np.savetxt("data/Ytest_KRR.csv", predictions, header = "Id, Bound", delimiter =",")
    print("saved predictions")
    return(predictions)