In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 
import regex as re

from classifiers import *
from metrics import *
from kernels import *

from sklearn.model_selection import train_test_split # lui il va partir mais pour l'instant c'est pratique


# Load the data

In [3]:
from csv import reader

def features_into_array(path):
    """
    Read the second column of a CSV file into a 1-D NumPy array of strings.

    The first row is treated as a header and skipped. For every following
    non-blank row, the field at index 1 is kept.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        1-D string array with one entry per data row. The array is empty
        (shape ``(0,)``) when the file is empty or contains only a header.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than two fields.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(path, 'r', newline='') as read_obj:
        csv_reader = reader(read_obj)
        # A default of None avoids StopIteration on a completely empty file.
        header = next(csv_reader, None)
        if header is None:
            sequences = []
        else:
            # csv.reader yields [] for blank lines; skip them instead of crashing.
            sequences = [row[1] for row in csv_reader if row]

    # dtype=str keeps a string dtype even when there are no rows.
    return np.array(sequences, dtype=str)

In [4]:
# Load the three training sets. Sequences (Xtr*) are read as strings with
# features_into_array; label files (Ytr*) are parsed as numeric arrays with
# their header row skipped.
Xtr0 = features_into_array("data/Xtr0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1 = features_into_array("data/Xtr1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2 = features_into_array("data/Xtr2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

In [5]:
# Peek at the first training sequence of dataset 0.
Xtr0[0]

'TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGCACCACCCAGTGGGAGACGACAGAGGTGGCAATGGGGTGTCGGCTCTGACGCCTCC'

## Spectrum kernel

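For a fixed substring length $k$ (a hyperparameter that needs to be tuned), the $k$-spectrum kernel is defined as:


\begin{align*}
K(x,x^{\prime}) := \sum_{u \in \mathcal{A}^k} \phi_{u}(x) \phi_{u}(x^{\prime})
\end{align*}

where $\mathcal{A} = \{A, C, G, T\}$ is the alphabet and $\phi_{u}(x)$ is the number of (possibly overlapping) occurrences of the substring $u$ in the sequence $x$.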

In [6]:
def all_possible_substrings(k, chars=('A', 'C', 'G', 'T')):
    """
    Enumerate every string of length k over the given alphabet.

    These strings are the feature indices of the k-spectrum kernel: each one
    is a substring whose occurrences can be counted in a sequence x.

    Parameters
    ----------
    k : int
        Substring length; must be at least 1.
    chars : iterable of str, optional
        Symbols of the alphabet. Defaults to the four DNA bases.

    Returns
    -------
    list of str
        All ``len(chars) ** k`` strings, in the order produced by
        ``itertools.product`` (lexicographic with respect to ``chars``).

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # ''.join is the idiomatic way to concatenate the symbols of each tuple.
    return [''.join(symbols) for symbols in product(chars, repeat=k)]

In [7]:
## Example: the 4**3 = 64 possible substrings of length 3 over {A, C, G, T}

all_possible_substrings(3)

['AAA',
 'AAC',
 'AAG',
 'AAT',
 'ACA',
 'ACC',
 'ACG',
 'ACT',
 'AGA',
 'AGC',
 'AGG',
 'AGT',
 'ATA',
 'ATC',
 'ATG',
 'ATT',
 'CAA',
 'CAC',
 'CAG',
 'CAT',
 'CCA',
 'CCC',
 'CCG',
 'CCT',
 'CGA',
 'CGC',
 'CGG',
 'CGT',
 'CTA',
 'CTC',
 'CTG',
 'CTT',
 'GAA',
 'GAC',
 'GAG',
 'GAT',
 'GCA',
 'GCC',
 'GCG',
 'GCT',
 'GGA',
 'GGC',
 'GGG',
 'GGT',
 'GTA',
 'GTC',
 'GTG',
 'GTT',
 'TAA',
 'TAC',
 'TAG',
 'TAT',
 'TCA',
 'TCC',
 'TCG',
 'TCT',
 'TGA',
 'TGC',
 'TGG',
 'TGT',
 'TTA',
 'TTC',
 'TTG',
 'TTT']

In [8]:
## Overlapping occurrences are counted with a single sliding-window pass over
## the sequence, so no regular-expression search is needed.

def pre_indexing_by_sequence(x, k):
    """
    Count the (overlapping) occurrences of every length-k substring in x.

    Parameters
    ----------
    x : str
        Sequence to index.
    k : int
        Substring length.

    Returns
    -------
    dict
        Maps each string returned by ``all_possible_substrings(k)`` to the
        number of times it occurs in x, overlaps included. Substrings that
        never occur map to 0; windows of x that are not in the alphabet are
        ignored.
    """
    counts = dict.fromkeys(all_possible_substrings(k), 0)
    # Every start position yields one window, so overlapping matches are
    # counted naturally (e.g. 'AAA' occurs twice in 'AAAA').
    for start in range(len(x) - k + 1):
        window = x[start:start + k]
        if window in counts:
            counts[window] += 1
    return counts

In [9]:
def pre_indexing(X, k):
    """
    Transform an array of sequences into a sparse matrix of substring counts.

    Parameters
    ----------
    X : numpy.ndarray
        1-D array of sequences (strings); ``X.shape[0]`` gives the row count.
    k : int
        Substring length.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape ``(len(X), len(all_possible_substrings(k)))``.
        Entry (i, j) is the number of (overlapping) occurrences of the j-th
        substring of the alphabet in the i-th sequence.
    """
    alphabet = all_possible_substrings(k)
    # Column index of each substring, for O(1) lookup while scanning.
    column_of = {kmer: col for col, kmer in enumerate(alphabet)}

    n = X.shape[0]
    D = np.zeros((n, len(alphabet)), dtype=int)
    for row, x in enumerate(X):
        # One sliding-window pass per sequence instead of one search per
        # alphabet entry; windows not in the alphabet are ignored.
        for start in range(len(x) - k + 1):
            col = column_of.get(x[start:start + k])
            if col is not None:
                D[row, col] += 1
    return csr_matrix(D, dtype=int)

In [10]:
## Example: substring-count matrix of dataset 0 for k = 3.
## The shape is read directly from the sparse matrix; no dense copy is needed.
print(pre_indexing(Xtr0, 3).shape)


(2000, 64)


In [26]:
#def spectrum_function(x,y,k):
#    phi_x = pre_indexing(x, k)
#    phi_y = pre_indexing(y, k)
#    
#    merge_dict = {k: phi_x.get(k, 0) * phi_y.get(k, 0) for k in set(phi_x)}
#    return sum(merge_dict.values())


## TODO ##
def spectrum_kernel(X_train, X_val, k, mode="train"):
    """
    Build k-spectrum Gram matrices from substring-count features.

    Both inputs are turned into dense count matrices with ``pre_indexing``;
    kernel values are inner products between rows of those matrices.

    Parameters
    ----------
    X_train : array of training sequences.
    X_val : array of validation (or test) sequences.
    k : substring length used for the spectrum features.
    mode : when equal to "test", only the cross kernel is returned; any
        other value also returns the training kernel.

    Returns
    -------
    The cross kernel of shape (n_validation, n_train) when mode == "test";
    otherwise the tuple (K_train, K_val), where K_train has shape
    (n_train, n_train). All kernels are float arrays.
    """
    counts_train = pre_indexing(X_train, k).toarray()
    counts_val = pre_indexing(X_val, k).toarray()

    # Cross kernel: every validation row against every training row.
    gram_val = np.inner(counts_val, counts_train).astype('float')
    if mode == "test":
        return gram_val

    # Training kernel: every training row against every training row.
    gram_train = np.inner(counts_train, counts_train).astype('float')
    return gram_train, gram_val

In [27]:
# Hold out 20% of dataset 0 for validation; random_state is fixed so the split is reproducible.
Xtr0_, Xval0_, ytr0, yval0 = train_test_split(Xtr0, Ytr0, test_size=0.2, random_state=42)

In [30]:
# 3-spectrum Gram matrices for dataset 0: K_tr0 is train x train, K_val0 is validation x train.
K_tr0, K_val0 = spectrum_kernel(Xtr0_, Xval0_, 3, mode="train")

In [32]:
# Regularization grid: 0 (no regularization) followed by 1e-10 ... 1e1.
# NOTE(review): in the recorded output the lambda = 0 run has a training loss
# of about 1.4e16, far above every other value — consider dropping 0 from the grid.
lambdas = [0] + [10**i for i in range(-10,2)]
print("************* KRR for dataset 0*************\n")
# Column 1 of the label arrays is used as the target — presumably column 0 is
# the sample Id; confirm against data/Ytr0.csv.
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KRR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)


************* KRR for dataset 0*************

***********lambda = 0***********
Training: loss = 13589139989960126.0000, accuracy = 0.489375
Validation: loss = 1308657.5175, accuracy = 0.492500
***********lambda = 1e-10***********
Training: loss = 1515.9968, accuracy = 0.622500
Validation: loss = 108.1055, accuracy = 0.585000
***********lambda = 1e-09***********
Training: loss = 424.2567, accuracy = 0.622500
Validation: loss = 108.1055, accuracy = 0.585000
***********lambda = 1e-08***********
Training: loss = 431.0139, accuracy = 0.622500
Validation: loss = 108.1056, accuracy = 0.585000
***********lambda = 1e-07***********
Training: loss = 430.9448, accuracy = 0.622500
Validation: loss = 108.1055, accuracy = 0.585000
***********lambda = 1e-06***********
Training: loss = 430.9434, accuracy = 0.622500
Validation: loss = 108.1055, accuracy = 0.585000
***********lambda = 1e-05***********
Training: loss = 430.9396, accuracy = 0.622500
Validation: loss = 108.1045, accuracy = 0.582500
********

In [33]:
# Regularization grid: 1e-4 ... 1.
lambdas = [10**i for i in range(-4,1)]

print("*************KLR for dataset 0*************\n")
# Results are stored under *_klr names so they do not overwrite the KRR results.
# tresh is forwarded to KLR — presumably a convergence threshold; confirm in classifiers.
alphas_tr0_klr, loss_tr0_klr, acc_0_klr, loss_val0_klr, acc_val0_klr = KLR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas, tresh=1e-8)


*************KLR for dataset 0*************

***********lambda = 0.0001***********
Training: loss = 0.6624, accuracy = 0.623125
Validation: loss = 0.6710, accuracy = 0.582500
***********lambda = 0.001***********
Training: loss = 0.6627, accuracy = 0.621875
Validation: loss = 0.6705, accuracy = 0.592500
***********lambda = 0.01***********
Training: loss = 0.6641, accuracy = 0.615000
Validation: loss = 0.6700, accuracy = 0.592500
***********lambda = 0.1***********
Training: loss = 0.6677, accuracy = 0.610000
Validation: loss = 0.6723, accuracy = 0.590000
***********lambda = 1***********
Training: loss = 0.6797, accuracy = 0.598750
Validation: loss = 0.6829, accuracy = 0.580000


In [37]:
# Regularization grid: 1e-10 ... 1e-1.
lambdas = [10**i for i in range(-10, 0)]
print("************* SVM for dataset 0*************\n")
# Use the Gram matrices computed by spectrum_kernel (K_tr0 / K_val0); the
# previously used names Ktr0_ / Kval0_ are not defined by any cell, so this
# cell failed on a fresh kernel.
# Note: these assignments overwrite the KRR results stored under the same names.
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)


************* SVM for dataset 0*************

---------------  lambda = 1e-10  ---------------
Training: loss = 0.826925, accuracy = 0.616875
Validation: loss = 0.873604, accuracy = 0.590000


KeyboardInterrupt: 

# Testing the accuracy on sequences

In [None]:
# Load the test sequences of the three datasets (read as strings, header row skipped).
Xte0_seq = features_into_array("data/Xte0.csv")
Xte1_seq = features_into_array("data/Xte1.csv")
Xte2_seq = features_into_array("data/Xte2.csv")

In [None]:
# NOTE(review): K_te0 / K_te1 / K_te2, alphas_tr1 and alphas_tr2 are not defined
# in any cell visible in this notebook, so this cell raises NameError on a fresh
# kernel. Presumably the test kernels should come from
# spectrum_kernel(..., mode="test") on the Xte*_seq arrays, and the alphas from
# training on datasets 1 and 2 — confirm and add those cells.
test_kernels = [K_te0, K_te1, K_te2]
#test_alphas = [alphas_tr0[-4], alphas_tr1[-4], alphas_tr2[-3]] # the alpha associated with a good lambda must be chosen!
# Currently the alphas for the first lambda of each grid are used.
test_alphas = [alphas_tr0[0], alphas_tr1[0], alphas_tr2[0]]
write_predictions_csv(test_kernels, test_alphas, path ="data/Ytest_sequences.csv", mode="SVM")