In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 
import regex as re
import time

from classifiers import *
from metrics import *
from kernels import *

from sklearn.model_selection import train_test_split # lui il va partir mais pour l'instant c'est pratique


# Load the data

In [2]:
from csv import reader

def features_into_array(path):
    with open(path, 'r') as read_obj:
        csv_reader = reader(read_obj)
        header = next(csv_reader)
        X = list()
        if header != None:
            for row in csv_reader:
                # row variable is a list that represents a row in csv
                X.append(np.array(row[1]))
                
    X = np.array(X) ## dtype might be changed in something more convenient. For now, dtype = "<U1"
    return X

In [3]:
Xtr0 = features_into_array("data/Xtr0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1 = features_into_array("data/Xtr1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2 = features_into_array("data/Xtr2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

In [4]:
Xtr0[0]

'TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGCACCACCCAGTGGGAGACGACAGAGGTGGCAATGGGGTGTCGGCTCTGACGCCTCC'

## Spectrum kernel

For a fixed value k (that needs to be tuned), the k-spectrum kernel is defined as : 


\begin{align*}
K(x,x^{\prime}) := \sum_{u \in \mathcal{A}^k} \phi_{u}(x) \phi_{u}(x^{\prime})
\end{align*}

In [21]:
def all_possible_substrings(k):
    """
    With a k spectrum kernel, let us find all the possible combinations of chars of size k in the sequence x
    This way, we could index them in the sequence x
    """
    char_list = list(['A', 'C','G','T'])
    alphabet_tuples = list(product(char_list,repeat=k))
    alphabet = dict()
    idx=0
    for i in alphabet_tuples:
        alphabet[functools.reduce(operator.add, (i))] = idx
        idx += 1
        #alphabet.append(functools.reduce(operator.add, (i)))
    return alphabet

In [52]:
## example

len(all_possible_substrings(6))

4096

In [8]:
## TODO : a function that computes occurences 
## with overlapping option without calling regex if we have remaining time (lol)

def pre_indexing_by_sequence(x,k):
    alphabet = all_possible_substrings(k)
    return dict((letter, len(re.findall(letter, x, overlapped=True))) for letter in alphabet)

In [38]:
def pre_indexing(X, k, alphabet=None):
    """
    Transforms an input array into a sparse matrix encoding the number of occurences of each letter of
    the alphabet composed of substrings of size k
    """
    i = 0
    n = X.shape[0]
    if alphabet is None:
        alphabet = all_possible_substrings(k)
    D = np.zeros((n,len(alphabet)))
    
    for i in range(X.shape[0]):
        idx=0
        while idx + k < len(X[i]):
            D[i, alphabet[X[i][idx:idx+k]]] += 1
            idx += 1
    """
    for x in X:
        d = dict((letter, len(re.findall(letter, x, overlapped=True))) 
                             for letter in alphabet)
        data = np.array(list(d.items()))
        D[i] = data[:,1]
        i+=1
    """
    D = csr_matrix(D, dtype = int)
    return D

In [48]:
## example
pre_indexing(Xtr0,8).toarray().shape

(2000, 65536)

In [42]:
k = 6
start_time = time.time()
alphabet_6 = all_possible_substrings(k)
mm = pre_indexing(Xtr0, 6, alphabet=alphabet_6)
print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))


--- Found alphabet in 0.760915994644165 seconds ---


In [79]:
def spectrum_kernel(X_train, X_val, X_test, k, alphabet=None):
    # Kill two birds with one stone and compute K_train, K_val and K_test all at once.
    """
    Computes the spectrum kernels for X_train (n_train x n_train), X_validation and X_test
    (on the RKHS generated by X_train's samples) which is of shape n_validation x n_train (resp n_test x n_train)
    """
    if alphabet is None:
        #D_train = pre_indexing(X_train,k).toarray()
        #D_val = pre_indexing(X_val,k).toarray()
        D_train = pre_indexing(X_train,k)
        D_val = pre_indexing(X_val,k)
        D_test = pre_indexing(X_test,k)
        
    else:
        #D_train = pre_indexing(X_train,k,alphabet).toarray()
        #D_val = pre_indexing(X_val,k,alphabet).toarray()
        D_train = pre_indexing(X_train,k,alphabet)
        D_val = pre_indexing(X_val,k,alphabet)
        D_test = pre_indexing(X_test,k,alphabet)
        
        
    #K_val = np.inner(D_val, D_train)
    #K_val = K_val.astype('float')
    
    K_train = D_train.dot(D_train.transpose())
    K_train = K_train.toarray().astype('float')
    
    K_val = D_val.dot(D_train.transpose())
    K_val = K_val.toarray().astype('float')
    
    K_test = D_test.dot(D_train.transpose())
    K_test = K_test.toarray().astype('float')
    
        
    return(K_train, K_val, K_test)

# Compute the spectrum-kernels for our data

In [80]:
Xtr0_, Xval0_, ytr0, yval0 = train_test_split(Xtr0, Ytr0, test_size=0.5, random_state=42)
Xtr1_, Xval1_, ytr1, yval1 = train_test_split(Xtr1, Ytr1, test_size=0.5, random_state=42)
Xtr2_, Xval2_, ytr2, yval2 = train_test_split(Xtr2, Ytr2, test_size=0.5, random_state=42)

Xte0 = features_into_array("data/Xte0.csv")
Xte1 = features_into_array("data/Xte1.csv")
Xte2 = features_into_array("data/Xte2.csv")

In [82]:
k = 6
start_time = time.time()
alphabet_8 = all_possible_substrings(k)
print("--- Found alphabet in %s seconds ---" % (time.time() - start_time))
start_time = time.time()

K_tr0, K_val0, K_te0 = spectrum_kernel(Xtr0_, Xval0_, Xte0, k, alphabet=alphabet_8,)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))
start_time = time.time()
K_tr1, K_val1, K_te1 = spectrum_kernel(Xtr1_, Xval1_, Xte1, k, alphabet=alphabet_8)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))
start_time = time.time()
K_tr2, K_val2, K_te2 = spectrum_kernel(Xtr2_, Xval2_, Xte2, k, alphabet=alphabet_8)
print("--- Computed kernel in %s seconds ---" % (time.time() - start_time))

--- Found alphabet in 0.054116010665893555 seconds ---
--- Computed kernel in 1.334479808807373 seconds ---
--- Computed kernel in 1.1869618892669678 seconds ---
--- Computed kernel in 1.1559290885925293 seconds ---


# Training-Validation
(runs to make sure everything is ok)

In [83]:
lambdas = [0] + [10**i for i in range(-10,2)]

print("************* KRR for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KRR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KRR(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KRR(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas)


************* KRR for dataset 0*************

***********lambda = 0***********
Training: loss = 502.1897, accuracy = 1.000000
Validation: loss = 436.9339, accuracy = 0.583000
***********lambda = 1e-10***********
Training: loss = 502.1897, accuracy = 1.000000
Validation: loss = 436.9339, accuracy = 0.583000
***********lambda = 1e-09***********
Training: loss = 502.1897, accuracy = 1.000000
Validation: loss = 436.9339, accuracy = 0.583000
***********lambda = 1e-08***********
Training: loss = 502.1896, accuracy = 1.000000
Validation: loss = 436.9338, accuracy = 0.583000
***********lambda = 1e-07***********
Training: loss = 502.1888, accuracy = 1.000000
Validation: loss = 436.9328, accuracy = 0.583000
***********lambda = 1e-06***********
Training: loss = 502.1803, accuracy = 1.000000
Validation: loss = 436.9220, accuracy = 0.583000
***********lambda = 1e-05***********
Training: loss = 502.0951, accuracy = 1.000000
Validation: loss = 436.8145, accuracy = 0.583000
***********lambda = 0.0001*

In [85]:
lambdas = [10**i for i in range(-4,1)]

print("*************KLR for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KLR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas, tresh=1e-8)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KLR(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas, tresh=1e-8)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KLR(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas, tresh= 1e-8)


*************KLR for dataset 0*************

***********lambda = 0.0001***********
Training: loss = 0.3157, accuracy = 1.000000
Validation: loss = 0.6977, accuracy = 0.581000
***********lambda = 0.001***********
Training: loss = 0.3340, accuracy = 1.000000
Validation: loss = 0.6853, accuracy = 0.583000
***********lambda = 0.01***********
Training: loss = 0.4236, accuracy = 0.996000
Validation: loss = 0.6581, accuracy = 0.605000
***********lambda = 0.1***********
Training: loss = 0.5839, accuracy = 0.925000
Validation: loss = 0.6577, accuracy = 0.604000
***********lambda = 1***********
Training: loss = 0.6700, accuracy = 0.788000
Validation: loss = 0.6813, accuracy = 0.569000


************* SVM for dataset 1 *************

***********lambda = 0.0001***********
Training: loss = 0.3156, accuracy = 1.000000
Validation: loss = 0.6967, accuracy = 0.581000
***********lambda = 0.001***********
Training: loss = 0.3338, accuracy = 1.000000
Validation: loss = 0.6831, accuracy = 0.583000
********

In [86]:
lambdas = [10**i for i in range(-4, 3)]
print("************* SVM for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print("")
print("")
print("************* SVM for dataset 1 *************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas)

print("")
print("")
print("************* SVM for dataset 2 *************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas)


************* SVM for dataset 0*************

---------------  lambda = 0.0001  ---------------
Training: loss = 0.000000, accuracy = 1.000000
Validation: loss = 0.893657, accuracy = 0.601000
---------------  lambda = 0.001  ---------------
Training: loss = 0.000000, accuracy = 1.000000
Validation: loss = 0.893657, accuracy = 0.601000
---------------  lambda = 0.01  ---------------
Training: loss = 0.011567, accuracy = 0.996000
Validation: loss = 0.882224, accuracy = 0.594000
---------------  lambda = 0.1  ---------------
Training: loss = 0.453866, accuracy = 0.907000
Validation: loss = 0.830064, accuracy = 0.615000
---------------  lambda = 1  ---------------
Training: loss = 0.878176, accuracy = 0.622000
Validation: loss = 0.931827, accuracy = 0.520000
---------------  lambda = 10  ---------------
Training: loss = 0.987214, accuracy = 0.613000
Validation: loss = 0.992732, accuracy = 0.521000
---------------  lambda = 100  ---------------
Training: loss = 0.998721, accuracy = 0.612000

# Save the best model

In [None]:
test_kernels = [K_te0, K_te1, K_te2]
#test_alphas = [alphas_tr0[-4], alphas_tr1[-4], alphas_tr2[-3]] # il faut choisir l'alpha associé à un bon lambda!
test_alphas = [alphas_tr0[0], alphas_tr1[0], alphas_tr2[0]]
write_predictions_csv(test_kernels, test_alphas, path ="data/Ytest_sequences.csv", mode="SVM")