In [12]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product, chain
import functools 
import operator 
from csv import reader
import regex as re

from classifiers import *
from metrics import *
from kernels import *

from sklearn.model_selection import train_test_split # lui il va partir mais pour l'instant c'est pratique

# Load the data

In [2]:
def features_into_array(path):
    """
    Read the second column of a CSV file into a 1-D NumPy array of strings.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        1-D string array holding column 1 of every data row. The array is
        empty (shape ``(0,)``) when the file is empty or only has a header.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than two columns.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(path, 'r', newline='') as read_obj:
        csv_reader = reader(read_obj)
        # The default value avoids a StopIteration on a completely empty file.
        header = next(csv_reader, None)
        if header is None:
            values = []
        else:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[1].
            values = [row[1] for row in csv_reader if row]

    # An explicit string dtype keeps the result type stable even when no row
    # was read (np.array([]) would otherwise default to float64).
    return np.array(values, dtype=str)

In [3]:
# Load the three training sets: sequences through features_into_array,
# labels through np.genfromtxt (header line skipped).
# NOTE(review): later cells index the label arrays with [:, 1], so each Ytr
# file presumably holds an id column followed by the label column — confirm.
Xtr0 = features_into_array("data/Xtr0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1 = features_into_array("data/Xtr1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2 = features_into_array("data/Xtr2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

In [4]:
def accuracy(y_true,y_pred, mode='SVM'):
    """
    Fraction of samples whose thresholded prediction equals the true label.

    Parameters
    ----------
    y_true : numpy.ndarray
        Reference labels, compared element-wise against the 0/1 predictions.
    y_pred : numpy.ndarray
        Raw scores, one per sample.
    mode : str
        With 'SVM', a score below 0 is mapped to class 0 and anything else to
        class 1. With any other value, a score of at least 0.5 is mapped to
        class 1 and anything else to class 0.

    Returns
    -------
    Number of matching entries divided by ``y_true.shape[0]``.
    """
    n_samples = y_true.shape[0]

    # Each mode is described by: the label every sample starts with, the mask
    # of samples to flip, and the label those samples receive.
    if mode == 'SVM':
        start_label, flip_mask, flipped_label = 1.0, y_pred < 0, 0.0
    else:
        start_label, flip_mask, flipped_label = 0.0, y_pred >= 0.5, 1.0

    labels = np.full(n_samples, start_label)
    labels[flip_mask] = flipped_label

    n_correct = np.sum(y_true == labels)
    return n_correct / n_samples

In [5]:
# Peek at the first training sequence of dataset 0.
Xtr0[0]

'TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGCACCACCCAGTGGGAGACGACAGAGGTGGCAATGGGGTGTCGGCTCTGACGCCTCC'

## Mismatch Spectrum kernel

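For a fixed substring length $k$ (a hyperparameter that needs to be tuned), let $\phi_{u}(x)$ denote the number of occurrences of the $k$-mer $u$ in the sequence $x$. The $k$-spectrum kernel is then defined as: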


\begin{align*}
K(x,x^{\prime}) := \sum_{u \in \mathcal{A}^k} \phi_{u}(x) \phi_{u}(x^{\prime})
\end{align*}

We relax the exact-matching requirement by allowing up to $m$ mismatches: $\phi_{u}(x)$ now counts the occurrences in $x$ of every $k$-mer that differs from $u$ in at most $m$ positions.

In [6]:
def neighbors(word, m, alphabet=('A', 'C', 'G', 'T')):
    """
    Return every word that differs from ``word`` in exactly ``m`` positions.

    Parameters
    ----------
    word : str
        Reference word.
    m : int
        Exact number of positions that must differ; 0 <= m <= len(word).
    alphabet : iterable of str, optional
        Characters that may be substituted at a position. Defaults to the DNA
        alphabet A, C, G, T.

    Returns
    -------
    list of str
        The neighbours. Words whose first character was substituted come
        first, followed by the words that keep the first character.

    Raises
    ------
    ValueError
        If ``m`` is negative (the recursion would otherwise never terminate).
    AssertionError
        If ``m`` is larger than ``len(word)``.
    """
    if m < 0:
        raise ValueError("m must be non-negative, got %r" % (m,))
    assert m <= len(word), "m cannot exceed the length of the word"

    if m == 0:
        return [word]

    # From here on m >= 1 and m <= len(word), so the word is non-empty.
    head, tail = word[0], word[1:]

    # Case 1: the first character is substituted, which uses one mismatch;
    # the remaining m - 1 mismatches are placed in the tail.
    result = [
        char + rest
        for rest in neighbors(tail, m - 1, alphabet)
        for char in alphabet
        if char != head
    ]

    # Case 2: the first character is kept, so all m mismatches go in the
    # tail. Only possible when the tail is long enough to hold them.
    if m < len(word):
        result += [head + rest for rest in neighbors(tail, m, alphabet)]

    return result

def neighbors2(pattern, m):
    """
    Return every word that differs from ``pattern`` in at most ``m`` places.

    The result is ordered by increasing number of mismatches, so its first
    element is ``pattern`` itself (the zero-mismatch case) whenever m >= 0.
    A negative ``m`` yields an empty list.

    Parameters
    ----------
    pattern : str
        Reference word.
    m : int
        Maximum number of mismatching positions. Values larger than
        ``len(pattern)`` are treated as ``len(pattern)``, since a word cannot
        differ in more positions than it has.

    Returns
    -------
    list of str
    """
    # Clamp so that neighbors() is never asked for more mismatches than the
    # pattern has positions (it asserts against that).
    max_mismatches = min(m, len(pattern))
    # chain.from_iterable flattens in linear time; sum(list_of_lists, [])
    # re-copies the accumulated list at every step.
    return list(chain.from_iterable(
        neighbors(pattern, distance) for distance in range(max_mismatches + 1)
    ))


In [7]:
def all_possible_substrings_mismatch(k,m):
    """
    Enumerate every word of length k over the DNA alphabet together with its
    mismatch neighbourhood.

    Parameters
    ----------
    k : int
        Length of the words.
    m : int
        Maximum number of mismatches passed on to neighbors2.

    Returns
    -------
    list of list of str
        One inner list per word of length k, in the order produced by
        itertools.product over A, C, G, T. Each inner list starts with the
        word itself, followed by the other words returned by
        neighbors2(word, m).
    """
    char_list = ['A', 'C', 'G', 'T']
    alphabet = []
    for letters in product(char_list, repeat=k):
        word = ''.join(letters)
        # neighbors2 lists the zero-mismatch word first; drop that entry and
        # put the word explicitly at the head of its group.
        alphabet.append([word] + neighbors2(word, m)[1:])
    return alphabet

In [8]:
def pre_indexing_mismatch(X, alphabet):
    """
    Count, for every sequence in X, the occurrences of each group of words.

    Parameters
    ----------
    X : sequence of str
        Input sequences (for example a 1-D NumPy array of strings).
    alphabet : list of list of str
        Groups of words. All occurrences, overlapping ones included, of every
        word in a group are added up into one count. Words are matched
        literally, not as regular expressions.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape (len(X), len(alphabet)); entry (i, j) is the
        total number of occurrences in X[i] of the words of alphabet[j].
    """
    n = len(X)
    D = np.zeros((n, len(alphabet)), dtype=int)

    # Distinct word lengths present in the alphabet; one sliding window pass
    # per length is enough to count every substring that can match.
    word_lengths = {len(word) for group in alphabet for word in group}

    for row, x in enumerate(X):
        # Count every substring of the relevant lengths once per sequence,
        # instead of scanning the sequence again for each word.
        substring_counts = {}
        for size in word_lengths:
            for start in range(len(x) - size + 1):
                piece = x[start:start + size]
                substring_counts[piece] = substring_counts.get(piece, 0) + 1

        for col, group in enumerate(alphabet):
            D[row, col] = sum(substring_counts.get(word, 0) for word in group)

    return csr_matrix(D, dtype=int)

In [9]:
# Sanity check on a small sample: build the k=3, m=1 word groups and display
# the resulting count matrix for the first five sequences of dataset 0.
alphabet = all_possible_substrings_mismatch(3,1)
pre_indexing_mismatch(Xtr0[:5],alphabet).toarray()

array([[11, 17, 13,  7, 19, 18, 13, 17, 13, 16, 17, 12, 11, 10, 17,  8,
        16, 19, 19, 16, 20, 26, 21, 14, 14, 20, 19, 15, 12, 18, 19, 11,
        15, 18, 16, 13, 14, 24, 21, 16, 19, 19, 24, 18, 12, 20, 19, 11,
         6, 13, 10, 10, 18, 16, 14, 14, 16, 20, 20, 17,  3, 11, 17,  8],
       [20, 19, 10, 19, 17, 15,  8, 20, 11, 16,  9, 14, 22, 18, 17, 26,
        18, 12, 12, 19, 18, 19, 11, 15, 12,  6,  2, 13, 19, 18, 12, 25,
        10,  9, 11, 14,  9,  9,  3, 15,  5,  2,  2, 13, 18, 16, 10, 19,
        20, 20, 17, 31, 18, 21, 12, 27, 16, 12,  9, 23, 28, 25, 20, 34],
       [36, 24, 26, 26, 24, 15, 10, 19, 27, 14, 16, 17, 26, 15, 14, 18,
        25, 17, 17, 17, 18,  9,  6,  9, 10,  9,  9, 10, 18,  7, 12, 14,
        24, 13, 15, 13, 21,  7, 12, 11, 12, 14, 14, 13, 17, 13, 12, 14,
        28, 14, 16, 16, 14, 11, 10, 11, 16, 13, 13, 12, 17, 12, 17, 11],
       [32, 22, 22, 22, 20, 12, 12, 16, 23, 16, 18, 17, 19, 10, 18, 18,
        24, 14, 16, 13, 15, 13, 11,  8, 16,  7, 13,  9, 12, 1

In [10]:
def mismatch_spectrum_kernel(X_train, X_val, k, mode="train", m=1):
    """
    Build mismatch spectrum kernel matrices from word-count features.

    Both inputs are turned into count matrices over the word groups returned
    by all_possible_substrings_mismatch(k, m); the kernels are the inner
    products of those count vectors.

    Parameters
    ----------
    X_train, X_val : arrays of sequences
        Training and validation (or test) sequences.
    k : int
        Word length.
    mode : str
        "test" returns only the validation kernel; any other value returns
        the training kernel as well.
    m : int
        Maximum number of mismatches.

    Returns
    -------
    K_val (n_val x n_train, float) when mode == "test",
    otherwise the tuple (K_train, K_val) with K_train of shape
    n_train x n_train.
    """
    word_groups = all_possible_substrings_mismatch(k, m)

    train_counts = pre_indexing_mismatch(X_train, word_groups).toarray()
    val_counts = pre_indexing_mismatch(X_val, word_groups).toarray()

    # Rows index validation samples, columns index training samples.
    gram_val = np.inner(val_counts, train_counts).astype('float')
    if mode == "test":
        return gram_val

    gram_train = np.inner(train_counts, train_counts).astype('float')
    return (gram_train, gram_val)


In [11]:
## Example: kernel between four sequences (rows, passed as X_val) and five sequences (columns, passed as X_train)

mismatch_spectrum_kernel(Xtr0[:5], Xtr0[16:20], 3,mode='test',m=1)

array([[16158., 14442., 15070., 15164., 15408.],
       [15926., 13732., 15764., 15894., 15986.],
       [15814., 15190., 15012., 14900., 15256.],
       [14978., 16138., 15872., 15384., 15206.]])

## Application on data

In [22]:
# Hold out 20% of each dataset for validation; the fixed random_state makes
# the split the same on every run.
Xtr_0, Xval_0, ytr0, yval0 = train_test_split(Xtr0, Ytr0, test_size=0.2, random_state=42)
Xtr_1, Xval_1, ytr1, yval1 = train_test_split(Xtr1, Ytr1, test_size=0.2, random_state=42)
Xtr_2, Xval_2, ytr2, yval2 = train_test_split(Xtr2, Ytr2, test_size=0.2, random_state=42)

In [23]:
# Training and validation kernels for each dataset, with k=3 and m=1.
K_tr0, K_val0 = mismatch_spectrum_kernel(Xtr_0, Xval_0, 3, mode="train",m=1)
K_tr1, K_val1 = mismatch_spectrum_kernel(Xtr_1, Xval_1, 3, mode="train",m=1)
K_tr2, K_val2 = mismatch_spectrum_kernel(Xtr_2, Xval_2, 3, mode="train",m=1)

### Kernel Ridge Regression

In [26]:
# Regularisation grid: 0 followed by 1e-10 ... 1e1.
# NOTE(review): the lambda = 0 run prints a hugely negative training loss in
# the output below, which suggests the unregularised problem is ill-conditioned
# — consider dropping 0 from the grid.
lambdas = [0] + [10**i for i in range(-10,2)]
print(20 * "-"+ " KRR for dataset 0 " + 20 * "-")
print('')
# KRR comes from one of the star imports at the top of the notebook; column 1
# of the label arrays is passed as the target.
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KRR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print('')

print(20 * "-"+ " KRR for dataset 1 " + 20 * "-")
print('')
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KRR(K_tr1, ytr1[:,1], K_val1, yval1[:,1], lambdas)
print('')

print(20 * "-"+ " KRR for dataset 2 " + 20 * "-")
print('')
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KRR(K_tr2, ytr2[:,1], K_val2, yval2[:,1], lambdas)
print('')


-------------------- KRR for dataset 0 --------------------

***********lambda = 0***********
Training: loss = -238825242745066.0312, accuracy = 0.509375
Validation: loss = 2461925.3800, accuracy = 0.482500
***********lambda = 1e-10***********
Training: loss = -157179.9545, accuracy = 0.621250
Validation: loss = 108.1015, accuracy = 0.582500
***********lambda = 1e-09***********
Training: loss = -328.0022, accuracy = 0.622500
Validation: loss = 108.1069, accuracy = 0.585000
***********lambda = 1e-08***********
Training: loss = 431.6183, accuracy = 0.622500
Validation: loss = 108.1057, accuracy = 0.585000
***********lambda = 1e-07***********
Training: loss = 430.8024, accuracy = 0.622500
Validation: loss = 108.1056, accuracy = 0.585000
***********lambda = 1e-06***********
Training: loss = 430.8338, accuracy = 0.622500
Validation: loss = 108.1055, accuracy = 0.585000
***********lambda = 1e-05***********
Training: loss = 430.8331, accuracy = 0.622500
Validation: loss = 108.1054, accuracy =

## Support Vector Machine

In [27]:
# Regularisation grid for the SVM: 1e-10 ... 1e1 (no zero this time).
# NOTE(review): this cell reuses the names lambdas, alphas_tr*, loss_* and
# acc_* from the KRR cell, so the KRR results are overwritten once it runs.
lambdas = [10**i for i in range(-10, 2)]

print(20 * "-"+ " SVM for dataset 0 " + 20 * "-")
print('')
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = SVM(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print('')

print(20 * "-"+ " SVM for dataset 1 " + 20 * "-")
print('')
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = SVM(K_tr1, ytr1[:,1], K_val1, yval1[:,1], lambdas)
print('')

print(20 * "-"+ " SVM for dataset 2 " + 20 * "-")
print('')
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = SVM(K_tr2, ytr2[:,1], K_val2, yval2[:,1], lambdas)
print('')

-------------------- SVM for dataset 0 --------------------

---------------  lambda = 1e-10  ---------------
Training: loss = 0.826924, accuracy = 0.616875
Validation: loss = 0.873618, accuracy = 0.590000
---------------  lambda = 1e-09  ---------------
Training: loss = 0.826926, accuracy = 0.616875
Validation: loss = 0.873605, accuracy = 0.590000
---------------  lambda = 1e-08  ---------------
Training: loss = 0.826925, accuracy = 0.616875
Validation: loss = 0.873604, accuracy = 0.590000
---------------  lambda = 1e-07  ---------------
Training: loss = 0.826925, accuracy = 0.616875
Validation: loss = 0.873604, accuracy = 0.590000
---------------  lambda = 1e-06  ---------------
Training: loss = 0.826925, accuracy = 0.617500
Validation: loss = 0.873569, accuracy = 0.590000
---------------  lambda = 1e-05  ---------------
Training: loss = 0.826925, accuracy = 0.617500
Validation: loss = 0.873547, accuracy = 0.590000
---------------  lambda = 0.0001  ---------------
Training: loss = 0.

# Testing the accuracy on sequences

In [30]:
# Load the test sequences of the three datasets.
Xte0_seq = features_into_array("data/Xte0.csv")
Xte1_seq = features_into_array("data/Xte1.csv")
Xte2_seq = features_into_array("data/Xte2.csv")

# Test kernels are computed against the training split (Xtr_*), i.e. the same
# samples the K_tr* kernels were built from, using the same k=3, m=1 setting.
K_te0 = mismatch_spectrum_kernel(Xtr_0, Xte0_seq, 3, mode="test",m=1)
K_te1 = mismatch_spectrum_kernel(Xtr_1, Xte1_seq, 3, mode="test",m=1)
K_te2 = mismatch_spectrum_kernel(Xtr_2, Xte2_seq, 3, mode="test",m=1)

In [31]:
test_kernels = [K_te0, K_te1, K_te2]
#test_alphas = [alphas_tr0[-4], alphas_tr1[-4], alphas_tr2[-3]] # the alpha associated with a good lambda must be chosen!
# NOTE(review): the indices 9, 4 and 8 are hard-coded; presumably they are
# positions in the lambdas grid of the last classifier cell that was run —
# confirm, and derive them from the validation accuracies instead.
test_alphas = [alphas_tr0[9], alphas_tr1[4], alphas_tr2[8]]
write_predictions_csv(test_kernels, test_alphas, path ="data/mismatch_Ytest_sequences.csv", mode="SVM")

saving predictions
saved predictions
