In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 

# Load the data

In [58]:
from csv import reader

def features_into_array(path):
    """
    Read the second column of a CSV file into a 1-D numpy array of strings.

    The first line of the file is treated as a header and skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        One string per data row (the value at column index 1). The array is
        empty, with a string dtype, when the file holds no data rows.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than two columns.
    """
    # newline='' is the mode the csv module recommends for files given to reader().
    with open(path, 'r', newline='') as read_obj:
        csv_reader = reader(read_obj)
        # The None default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        # csv.reader yields an empty list for a blank line; skip those rows.
        X = [row[1] for row in csv_reader if row]

    # dtype=str keeps a string dtype even when no row was read.
    return np.array(X, dtype=str)

In [59]:
# Load the three training sets. Each XtrN holds the second CSV column of
# data/XtrN.csv as strings (see features_into_array); each YtrN is parsed by
# np.genfromtxt from data/YtrN.csv with the header row skipped.
# NOTE(review): genfromtxt keeps every comma-separated column, so YtrN is 2-D
# if the label files carry an id column next to the label -- confirm.
Xtr0 = features_into_array("data/Xtr0.csv")
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1 = features_into_array("data/Xtr1.csv")
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2 = features_into_array("data/Xtr2.csv")
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)

In [142]:
def accuracy(y_true, y_pred, mode='SVM'):
    """
    Fraction of samples whose thresholded prediction equals the true label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; they are compared against 0/1 predictions.
    y_pred : array-like
        Raw model outputs, one per sample.
    mode : str
        With 'SVM', outputs < 0 become 0 and every other output becomes 1.
        With any other value, outputs >= 0.5 become 1 and the rest become 0.

    Returns
    -------
    float
        Number of matching entries divided by the number of samples.

    Raises
    ------
    ValueError
        If y_true and y_pred do not hold the same number of entries.
    """
    # Flatten both inputs: comparing an (n, 1) column against an (n,) vector
    # would broadcast to an (n, n) matrix and silently inflate the score.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    n = y_true.shape[0]
    if y_pred.shape[0] != n:
        raise ValueError(
            "y_true and y_pred must have the same number of entries "
            "(got %d and %d)" % (n, y_pred.shape[0])
        )

    if mode == 'SVM':
        predictions = np.where(y_pred < 0, 0.0, 1.0)
    else:
        predictions = np.where(y_pred >= 0.5, 1.0, 0.0)

    return np.sum(y_true == predictions) / n

In [62]:
# Display the first entry of Xtr0 to check what a loaded sequence looks like.
Xtr0[0]

'TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGCACCACCCAGTGGGAGACGACAGAGGTGGCAATGGGGTGTCGGCTCTGACGCCTCC'

## Spectrum kernel

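For a fixed value of $k$ (a hyperparameter that needs to be tuned), let $\mathcal{A} = \{A, C, G, T\}$ be the alphabet and let $\phi_{u}(x)$ denote the number of (possibly overlapping) occurrences of the substring $u$ in the sequence $x$. The $k$-spectrum kernel is then defined as: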


\begin{align*}
K(x,x^{\prime}) := \sum_{u \in \mathcal{A}^k} \phi_{u}(x) \phi_{u}(x^{\prime})
\end{align*}

In [94]:
def all_possible_substrings(k):
    """
    Enumerate every string of length k over the alphabet {A, C, G, T}.

    The strings come out in the order produced by itertools.product over
    ['A', 'C', 'G', 'T'], so the position of a substring in the returned list
    can serve as its index when sequences are encoded.

    Parameters
    ----------
    k : int
        Length of the substrings. k = 0 yields the single empty string.

    Returns
    -------
    list of str
        The 4**k substrings of size k.
    """
    char_list = ['A', 'C', 'G', 'T']
    # str.join also copes with the empty tuple produced for k = 0, which
    # functools.reduce without an initial value rejects with a TypeError.
    return [''.join(letters) for letters in product(char_list, repeat=k)]

In [95]:
## Example: list every substring of size 3 over the alphabet A, C, G, T

all_possible_substrings(3)

['AAA',
 'AAC',
 'AAG',
 'AAT',
 'ACA',
 'ACC',
 'ACG',
 'ACT',
 'AGA',
 'AGC',
 'AGG',
 'AGT',
 'ATA',
 'ATC',
 'ATG',
 'ATT',
 'CAA',
 'CAC',
 'CAG',
 'CAT',
 'CCA',
 'CCC',
 'CCG',
 'CCT',
 'CGA',
 'CGC',
 'CGG',
 'CGT',
 'CTA',
 'CTC',
 'CTG',
 'CTT',
 'GAA',
 'GAC',
 'GAG',
 'GAT',
 'GCA',
 'GCC',
 'GCG',
 'GCT',
 'GGA',
 'GGC',
 'GGG',
 'GGT',
 'GTA',
 'GTC',
 'GTG',
 'GTT',
 'TAA',
 'TAC',
 'TAG',
 'TAT',
 'TCA',
 'TCC',
 'TCG',
 'TCT',
 'TGA',
 'TGC',
 'TGG',
 'TGT',
 'TTA',
 'TTC',
 'TTG',
 'TTT']

In [158]:
import regex as re

def pre_indexing_by_sequence(x, k):
    """
    Count the occurrences of every substring of size k in one sequence.

    Occurrences are counted with overlap: a single left-to-right pass looks at
    every window x[i:i+k], which gives the same counts as one overlapped
    pattern search per substring but needs no regular expressions and reads
    the sequence only once.

    Parameters
    ----------
    x : str
        The sequence to index.
    k : int
        Size of the substrings.

    Returns
    -------
    dict
        Maps each substring returned by all_possible_substrings(k), in that
        order, to its number of occurrences in x (0 when absent).
    """
    counts = dict.fromkeys(all_possible_substrings(k), 0)
    for start in range(len(x) - k + 1):
        window = x[start:start + k]
        # Windows holding a character outside the alphabet match no key.
        if window in counts:
            counts[window] += 1
    return counts

In [183]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix

def pre_indexing(X, k):
    """
    Transform an array of sequences into a sparse matrix of substring counts.

    Row i holds, for every substring of size k of the alphabet (columns follow
    the order of all_possible_substrings(k)), the number of possibly
    overlapping occurrences of that substring in X[i].

    Parameters
    ----------
    X : sequence of str
        The sequences to encode (a numpy array or a list).
    k : int
        Size of the substrings.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape (len(X), len(all_possible_substrings(k))).
    """
    alphabet = all_possible_substrings(k)
    # Column of each substring, so one pass over a sequence fills its whole row
    # instead of scanning the sequence once per substring.
    column_of = {kmer: col for col, kmer in enumerate(alphabet)}

    # Counts are accumulated as integers directly; no string round-trip.
    D = np.zeros((len(X), len(alphabet)), dtype=int)
    for row, x in enumerate(X):
        for start in range(len(x) - k + 1):
            col = column_of.get(x[start:start + k])
            # Windows holding a character outside the alphabet are skipped.
            if col is not None:
                D[row, col] += 1

    return csr_matrix(D, dtype=int)

In [188]:
## Example: substring counts (k = 2) for the first five entries of Xtr0

print(pre_indexing(Xtr0[:5],2).toarray())


16
[[ 4  8  4  4 10 11  4  6  6  7 10  7  0  6 12  1]
 [ 7  5  6  9  6  9  0  8  3  1  0  7 11  8  5 15]
 [18  6  8  6 10  3  0  5  3  6  6  6  8  3  6  6]
 [16  5  8  4  8  6  1  3  7  3  8  7  3  3  8 10]
 [11  4 11  4  6  5  1  9  8  6  9  4  4  6  7  5]]


In [143]:
#def spectrum_function(x,y,k):
#    phi_x = pre_indexing(x, k)
#    phi_y = pre_indexing(y, k)
#    
#    merge_dict = {k: phi_x.get(k, 0) * phi_y.get(k, 0) for k in set(phi_x)}
#    return sum(merge_dict.values())


def spectrum_kernel(X_train, X_val, k, mode="train"):
    """
    Compute the Gram matrix of the k-spectrum kernel.

    Every sequence is first encoded by pre_indexing as its vector of
    substring counts; the kernel value of two sequences is the dot product of
    their count vectors, so a whole Gram matrix is one matrix product.

    Parameters
    ----------
    X_train : sequence of str
        Training sequences.
    X_val : sequence of str
        Validation/test sequences. Only read when mode == "test".
    k : int
        Size of the substrings.
    mode : str
        "test" returns the kernel between X_val and X_train; any other value
        returns the kernel of X_train with itself.

    Returns
    -------
    numpy.ndarray
        Dense float matrix. In "test" mode its shape is
        (len(X_val), len(X_train)) with entry [i, j] = K(X_val[i], X_train[j]);
        otherwise it is the symmetric (len(X_train), len(X_train)) matrix with
        entry [i, j] = K(X_train[i], X_train[j]).
    """
    phi_train = pre_indexing(X_train, k)

    if mode == "test":
        phi_val = pre_indexing(X_val, k)
        # Rows index validation sequences, columns index training sequences.
        return phi_val.dot(phi_train.T).toarray().astype(float)

    return phi_train.dot(phi_train.T).toarray().astype(float)

In [130]:
## Example: substring counts (k = 3) of the first two entries of Xtr0

# Count dictionary of the first sequence.
x = pre_indexing_by_sequence(Xtr0[0], 3)
print("x ", x)
print("")
# Count dictionary of the second sequence.
y = pre_indexing_by_sequence(Xtr0[1], 3)
print('y ', y)

print('')
# Product of the two counts for every substring; summing these values gives
# the spectrum kernel K(x, y) defined above.
print(({k: x.get(k, 0) * y.get(k, 0) for k in x}.values()))


x  {'AAA': 2, 'AAC': 0, 'AAG': 0, 'AAT': 2, 'ACA': 3, 'ACC': 3, 'ACG': 2, 'ACT': 0, 'AGA': 2, 'AGC': 0, 'AGG': 1, 'AGT': 1, 'ATA': 0, 'ATC': 2, 'ATG': 2, 'ATT': 0, 'CAA': 2, 'CAC': 5, 'CAG': 2, 'CAT': 1, 'CCA': 3, 'CCC': 3, 'CCG': 1, 'CCT': 3, 'CGA': 1, 'CGC': 1, 'CGG': 2, 'CGT': 0, 'CTA': 0, 'CTC': 2, 'CTG': 4, 'CTT': 0, 'GAA': 0, 'GAC': 3, 'GAG': 2, 'GAT': 1, 'GCA': 4, 'GCC': 2, 'GCG': 0, 'GCT': 1, 'GGA': 1, 'GGC': 4, 'GGG': 3, 'GGT': 2, 'GTA': 0, 'GTC': 1, 'GTG': 5, 'GTT': 1, 'TAA': 0, 'TAC': 0, 'TAG': 0, 'TAT': 0, 'TCA': 0, 'TCC': 3, 'TCG': 1, 'TCT': 2, 'TGA': 2, 'TGC': 2, 'TGG': 4, 'TGT': 4, 'TTA': 0, 'TTC': 0, 'TTG': 1, 'TTT': 0}

y  {'AAA': 1, 'AAC': 1, 'AAG': 3, 'AAT': 2, 'ACA': 1, 'ACC': 4, 'ACG': 0, 'ACT': 0, 'AGA': 3, 'AGC': 0, 'AGG': 0, 'AGT': 3, 'ATA': 3, 'ATC': 4, 'ATG': 0, 'ATT': 2, 'CAA': 2, 'CAC': 2, 'CAG': 1, 'CAT': 1, 'CCA': 3, 'CCC': 2, 'CCG': 0, 'CCT': 4, 'CGA': 0, 'CGC': 0, 'CGG': 0, 'CGT': 0, 'CTA': 4, 'CTC': 1, 'CTG': 1, 'CTT': 1, 'GAA': 1, 'GAC': 0, 'GAG': 0, '