In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
import numba
from numba import jit

In [2]:
df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xtr0.csv')

In [3]:
df.head()

Unnamed: 0,Id,seq
0,0,GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1,1,ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2,2,TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3,3,GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4,4,GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


All sequences is this set have length 101.

We have two parameters $k$ and $\lambda$.

Let $u \in A^k$. If $i=(i_1, ..., i_k)$, let $x(i)=x_{i_1}...x_{i_k}$. We define :
$\Phi_u(x) = \sum_{i, x(i) = u} \lambda^{i_k - i_1 + 1}$

In [4]:
k = 3
lambd = 0.7
letters = ["A", "C", "G", "T"]
index = np.arange(85)
for i in range(4):
    index[ord(letters[i])] = i

The dot product is computed recursively using auxiliary quantities defined p. 353 and 356.

In [5]:
def one_hot(s):
    """
    Input : string
    Output : n*4 0/1 matrix
    """
    result = np.zeros((len(s), 4))
    array = np.array(list(s))
    for i in range(4):
        result[:, i] = (array == letters[i])
    return result

Pre-computes the one-hot encodings of the DNA sequences :

In [6]:
one_hots = np.zeros((df.shape[0], len(df["seq"][0]), 4))
for i in range(df.shape[0]):
    one_hots[i] = one_hot(df["seq"][i])

df_int = one_hots.argmax(axis=2) # letters replaced with integers

Auxiliary function $B$ :

In [7]:
@jit(nopython=True)
def compute_B(x, y, k, lambd):
    """
    Input : x, y are strings
    Returns a n*n*k tensor B such that B[l,i,j] = B_l(x[0:i], y[0:j])
    """
    n = len(x)
    B = np.zeros((k, n, n))
    B[0, :, :] = 1
    for l in range(1, k):
        if l > n:
            break
        for i in range(l, n):
            for j in range(l, n):
                a = x[i]
                b = y[j]
                B[l, i, j] = lambd*(B[l, i-1, j] + B[l, i, j-1]) - lambd**2 * B[l, i-1, j-1]
                if a == b:
                    B[l, i, j] += lambd**2 * B[l-1, i-1, j-1]
    return B

In [9]:
%%timeit
_ = compute_B(df["seq"][0], df["seq"][1], k, lambd)

4.71 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
@jit(nopython=True)
def compute_K(df_int, one_hots, i, j, k, lambd):
    """
    df_int is the list of integer sequences
    Returns a n*n*k tensor K such that K[l,i,j] = K_l(x[0:i], y[0:j])
    """
    x = df_int[i]
    y = df_int[j]
    xo = one_hots[i]
    yo = one_hots[j]
    
    n = df_int.shape[1]
    B = compute_B(x, y, k, lambd)
    K = np.zeros((k, n, n))
    K[0, :, :] = 1
    
    for l in range(1, k):
        if l > n:
            break
        for i in range(l, n):
            for j in range(l, n):
                a = x[i]
                K[l, i, j] = K[l, i-1, j]
                mask = yo[:j, a]
                K[l, i, j] += lambd**2 * mask.dot(B[l-1, i-1, :j])
    return K

In [13]:
%%timeit
_ = compute_K(df_int, one_hots, 0, 1, k, lambd)

3.76 ms ± 77.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
@jit(nopython=True)
def kernel(df_int, one_hots, i, j, k, lambd):
    K = compute_K(df_int, one_hots, i, j, k, lambd)
    return K[-1, -1, -1]

In [15]:
K = np.zeros((2000, 2000))

In [16]:
for i in tqdm(range(df.shape[0])):
    K[i, 0] = kernel(df_int, one_hots, i, 0, k, lambd)

100%|██████████| 2000/2000 [00:08<00:00, 237.27it/s]


In [92]:
@jit(nopython=True, parallel=True)
def loop():
    K = np.zeros((2000, 2000))
    for i in range(2000):
        K[i, 0] = kernel(df_int, one_hots, i, 0, k, lambd)

In [95]:
%%time
loop()

CPU times: user 6.94 s, sys: 7.88 ms, total: 6.95 s
Wall time: 6.92 s
