In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import tqdm

In [48]:
df = pd.read_csv('data/Xtr0.csv')

In [49]:
df.head()

Unnamed: 0,Id,seq
0,0,GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1,1,ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2,2,TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3,3,GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4,4,GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


All sequences is this set have length 101.

In [50]:
letters = 'ATCG'

In [51]:
length = 3

In [52]:
def build_voc(letters, length):
    vocl = [''.join(x) for x in itertools.product(letters, repeat=length)]
    voc = {}
    i = 0
    for v in vocl:
        voc[v] = i
        i+=1
    return voc
voc = build_voc(letters, length)
print(voc)

{'AAA': 0, 'AAT': 1, 'AAC': 2, 'AAG': 3, 'ATA': 4, 'ATT': 5, 'ATC': 6, 'ATG': 7, 'ACA': 8, 'ACT': 9, 'ACC': 10, 'ACG': 11, 'AGA': 12, 'AGT': 13, 'AGC': 14, 'AGG': 15, 'TAA': 16, 'TAT': 17, 'TAC': 18, 'TAG': 19, 'TTA': 20, 'TTT': 21, 'TTC': 22, 'TTG': 23, 'TCA': 24, 'TCT': 25, 'TCC': 26, 'TCG': 27, 'TGA': 28, 'TGT': 29, 'TGC': 30, 'TGG': 31, 'CAA': 32, 'CAT': 33, 'CAC': 34, 'CAG': 35, 'CTA': 36, 'CTT': 37, 'CTC': 38, 'CTG': 39, 'CCA': 40, 'CCT': 41, 'CCC': 42, 'CCG': 43, 'CGA': 44, 'CGT': 45, 'CGC': 46, 'CGG': 47, 'GAA': 48, 'GAT': 49, 'GAC': 50, 'GAG': 51, 'GTA': 52, 'GTT': 53, 'GTC': 54, 'GTG': 55, 'GCA': 56, 'GCT': 57, 'GCC': 58, 'GCG': 59, 'GGA': 60, 'GGT': 61, 'GGC': 62, 'GGG': 63}


$\Phi_u(x)$ is the number of occurrences of u in x (without gaps) : *spectrum kernel* (Leslie et al., 2002)

In [53]:
def substrings(x, length):
    n = len(x)
    sub = []
    assert n>=length, 'seq too small'
    for i in range(n-length+1):
        curr = x[i:i+length]
        sub.append(curr)
    return sub

In [54]:
x = df['seq'][0]
sub = substrings(x, 3)
print(sub)

['GGA', 'GAG', 'AGA', 'GAA', 'AAT', 'ATC', 'TCA', 'CAT', 'ATT', 'TTT', 'TTG', 'TGA', 'GAA', 'AAC', 'ACC', 'CCC', 'CCG', 'CGG', 'GGG', 'GGA', 'GAG', 'AGG', 'GGT', 'GTG', 'TGG', 'GGA', 'GAG', 'AGG', 'GGT', 'GTT', 'TTG', 'TGC', 'GCC', 'CCG', 'CGT', 'GTG', 'TGA', 'GAG', 'AGC', 'GCT', 'CTG', 'TGA', 'GAG', 'AGA', 'GAT', 'ATT', 'TTG', 'TGC', 'GCG', 'CGC', 'GCC', 'CCA', 'CAT', 'ATT', 'TTG', 'TGC', 'GCA', 'CAC', 'ACT', 'CTC', 'TCC', 'CCA', 'CAG', 'AGC', 'GCC', 'CCT', 'CTG', 'TGG', 'GGG', 'GGC', 'GCA', 'CAA', 'AAC', 'ACA', 'CAA', 'AAG', 'AGA', 'GAG', 'AGC', 'GCA', 'CAA', 'AAA', 'AAA', 'AAC', 'ACT', 'CTC', 'TCT', 'CTG', 'TGT', 'GTC', 'TCT', 'CTC', 'TCA', 'CAC', 'ACA', 'CAA', 'AAA', 'AAA', 'AAC']


In [55]:
x

'GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGATTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTCTGTCTCACAAAAC'

In [56]:
len(sub)

99

Now we reduce to get the feature vector. Let $\Phi_u(x)$ denote the number of occurrences of $u$ in $x$. The
$k$-spectrum kernel is $K(x, x'):= \sum_{u\in A^k} \Phi_u(x) \Phi_u(x')$.

It corresponds to a linear kernel over the feature space. So we may store all sequences in the feature space of all length 3 subsequences. The features will be sparse: at most $|x|-k+1$ non zero features. 

In [57]:
def encode(sub, voc):
    enc = np.zeros(len(voc))
    for s in sub:
        i = voc[s]
        enc[i] += 1
    return enc

In [58]:
encoding = encode(sub, voc)

In [59]:
print(encoding)

[4. 1. 4. 1. 0. 3. 1. 0. 2. 2. 1. 0. 3. 0. 3. 2. 0. 0. 0. 0. 0. 1. 0. 4.
 2. 2. 1. 0. 3. 1. 3. 2. 4. 2. 2. 1. 0. 0. 3. 3. 2. 1. 1. 2. 0. 1. 1. 1.
 2. 1. 0. 6. 0. 1. 1. 2. 3. 1. 3. 1. 3. 2. 1. 2.]


In [60]:
encoding.sum()

99.0

## Build the embedded data matrices (exact matching)

In [61]:
letters = 'ATCG'
length = 5
voc = build_voc(letters, length)
print('Vocabulary has size', len(voc))

Vocabulary has size 1024


In [62]:
for ind in range(3):
    df = pd.read_csv('data/Xtr'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = encode(substrings(seq, length), voc)
        df_emb.loc[i] = enc
    df_emb.to_csv('data/'
              + 'Xtr' +str(ind) + '_spectr'+str(length)+'.csv', header = False, index = False, sep=" ")

2000it [00:17, 115.29it/s]
2000it [00:19, 104.94it/s]
2000it [00:31, 39.89it/s]


In [63]:
for ind in range(3):
    df = pd.read_csv('data/Xte'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = encode(substrings(seq, length), voc)
        df_emb.loc[i] = enc
    df_emb.to_csv('data/'
              + 'Xte' +str(ind) + '_spectr'+str(length)+'.csv', header = False, index = False, sep=" ")

1000it [00:05, 181.18it/s]
1000it [00:05, 115.34it/s]
1000it [00:05, 185.48it/s]


## Use suffix tree for mismatches

Building suffix tree using Ukkonen's algorithm (external lib)

In [17]:
from suffix_trees import STree

In [18]:
string = 'BANANA' # has 6 suffixes

In [19]:
st = STree.STree("BANANA")

In [20]:
st.find_all('AN')

[1, 3]

In [21]:
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)

In [22]:
s0 = df['seq'][0]
w = 'AAG'

In [23]:
st0 = STree.STree(s0)

Build words at 1 Hamming distance

In [24]:
def build_1_neighborhood(w):
    nset = []
    for i in range(len(w)):
        for j in letters:
            nset.append(w[:i]+j+w[i+1:])

    nset = list(set(nset))
    return nset

In [25]:
nset = build_1_neighborhood(w)
print(nset)

['AAG', 'AAC', 'AAA', 'ACG', 'TAG', 'AGG', 'ATG', 'CAG', 'GAG', 'AAT']


In [26]:
len(nset) # 3+3+3 + 1 exact matching

10

In [27]:
sum([len(st0.find_all(n)) for n in nset]) # all 1-Hamming matches of w in s0

19

In [28]:
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)
print(voc_neigh)

{'AAA': ['AGA', 'AAC', 'ATA', 'AAG', 'AAA', 'TAA', 'CAA', 'GAA', 'AAT', 'ACA'], 'AAT': ['GAT', 'AAC', 'AAG', 'CAT', 'TAT', 'AAA', 'AGT', 'ATT', 'ACT', 'AAT'], 'AAC': ['ACC', 'AGC', 'AAC', 'AAG', 'GAC', 'TAC', 'AAA', 'CAC', 'ATC', 'AAT'], 'AAG': ['AAG', 'AAC', 'AAA', 'ACG', 'TAG', 'AGG', 'ATG', 'CAG', 'GAG', 'AAT'], 'ATA': ['AGA', 'GTA', 'CTA', 'ATA', 'AAA', 'TTA', 'ATT', 'ATG', 'ATC', 'ACA'], 'ATT': ['TTT', 'ATA', 'GTT', 'AGT', 'ATC', 'CTT', 'ATT', 'ATG', 'ACT', 'AAT'], 'ATC': ['ACC', 'AGC', 'AAC', 'ATA', 'CTC', 'TTC', 'GTC', 'ATT', 'ATG', 'ATC'], 'ATG': ['CTG', 'AAG', 'ATA', 'ACG', 'GTG', 'TTG', 'AGG', 'ATT', 'ATG', 'ATC'], 'ACA': ['AGA', 'ACC', 'ATA', 'AAA', 'CCA', 'GCA', 'TCA', 'ACG', 'ACT', 'ACA'], 'ACT': ['TCT', 'ACC', 'CCT', 'AGT', 'GCT', 'ATT', 'ACG', 'ACT', 'AAT', 'ACA'], 'ACC': ['ACC', 'GCC', 'AAC', 'AGC', 'TCC', 'ACT', 'CCC', 'ACG', 'ATC', 'ACA'], 'ACG': ['ACC', 'AAG', 'GCG', 'CCG', 'ACT', 'AGG', 'ACG', 'ACA', 'TCG', 'ATG'], 'AGA': ['CGA', 'AGA', 'ATA', 'AAA', 'TGA', 'AGT', '

In [29]:
def compute_1_hamming_embedding(x, length, voc_neigh):
    st = STree.STree(x)
    enc = np.zeros(len(voc_neigh))
    i = 0
    for w in voc_neigh.keys():
        nset = voc_neigh[w]
        enc[i] = sum([len(st.find_all(n)) for n in nset]) # all 1-Hamming matches of w in s0
        i+=1
    return enc

In [30]:
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)

compute_1_hamming_embedding(s0, length, voc_neigh)

array([21., 18., 17., 19., 13.,  9., 16., 16., 19., 13., 18., 11., 20.,
       18., 19., 14., 15.,  8., 10., 14., 10., 12., 14., 12., 15., 11.,
       13., 14., 17., 15., 15., 18., 17., 13., 18., 22., 12., 15., 12.,
       16., 17., 14., 17., 12., 18.,  9., 16., 15., 23., 16., 20., 16.,
       12., 12., 12., 20., 19., 17., 13., 20., 19., 13., 19., 22.])

In [31]:
letters = 'ATCG'
length = 4
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)

compute_1_hamming_embedding(s0, length, voc_neigh)

array([11.,  7.,  8.,  6.,  5.,  6.,  6.,  3.,  9.,  5.,  8.,  3.,  6.,
        4.,  8.,  3.,  6.,  2.,  2.,  3.,  4.,  3.,  6.,  5.,  4.,  6.,
        5.,  3.,  5.,  4.,  3.,  4.,  7.,  5.,  9.,  4.,  4.,  3.,  5.,
        7.,  8.,  4.,  5.,  3.,  3.,  3.,  4.,  1.,  9.,  6.,  6.,  8.,
        2.,  6.,  5.,  5.,  8.,  7.,  7.,  5.,  7.,  4.,  4.,  5.,  5.,
        2.,  5.,  3.,  0.,  3.,  3.,  2.,  4.,  2.,  2.,  1.,  5.,  0.,
        6.,  3.,  2.,  1.,  4.,  3.,  2.,  2.,  6.,  5.,  5.,  2.,  5.,
        2.,  7.,  5.,  4.,  8.,  8.,  3.,  4.,  6.,  3.,  3.,  6.,  5.,
        3.,  3.,  5.,  5.,  3.,  2.,  6.,  3.,  6.,  5.,  6.,  9.,  4.,
        2.,  3.,  9.,  8.,  4.,  6.,  6.,  6.,  4.,  8.,  6.,  7.,  9.,
        8.,  5.,  5.,  4.,  5.,  5.,  6.,  6.,  7.,  4.,  8.,  6.,  4.,
        7.,  4.,  3.,  2.,  3.,  2.,  5.,  1.,  7.,  6.,  6.,  4.,  5.,
        6.,  5.,  8.,  6.,  8.,  4.,  5.,  6.,  1.,  5.,  4.,  7.,  5.,
        6.,  4.,  5.,  3.,  4.,  4.,  7.,  4.,  2.,  2., 10.,  1

## Build the data matrices (Hamming 1)

In [67]:
letters = 'ATCG'
length = 6
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)
    
print('Vocabulary has size', len(voc))

Vocabulary has size 4096


In [68]:
for ind in range(3):
    df = pd.read_csv('data/Xtr'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = compute_1_hamming_embedding(seq, length, voc_neigh)
        df_emb.loc[i] = enc
    df_emb.to_csv('data/'
              + 'Xtr' +str(ind) + '_spectr'+str(length)+'_hamming1'+'.csv', header = False, index = False, sep=" ")

2000it [20:56,  1.80it/s]
2000it [18:01,  1.82it/s]
2000it [17:47,  1.73it/s]


In [69]:
for ind in range(3):
    df = pd.read_csv('data/Xte'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = compute_1_hamming_embedding(seq, length, voc_neigh)
        df_emb.loc[i] = enc
    df_emb.to_csv('data/'
              + 'Xte' +str(ind) + '_spectr'+str(length)+'_hamming1'+'.csv', header = False, index = False, sep=" ")

1000it [08:28,  1.98it/s]
1000it [08:46,  1.99it/s]
1000it [08:41,  2.00it/s]


## Concatenate some embeddings

In [64]:
first_suffix = 'spectr6'
second_suffix = 'spectr3_hamming1'

In [65]:
for ind in range(3):
    df1 = pd.read_csv('data/Xtr'+str(ind)+'_'+first_suffix+'.csv', header = None, sep=" ")
    print(df1.shape)
    df2 = pd.read_csv('data/Xtr'+str(ind)+'_'+second_suffix+'.csv', header = None, sep=" ")
    print(df2.shape)
    df_emb = pd.concat([df1, df2], axis=1, join='inner')
    print(df_emb.shape)
    df_emb.to_csv('data/'
              + 'Xtr' + str(ind) + '_cat' + first_suffix + '-' 
              + second_suffix +'.csv', header = False, index = False, sep=" ")

(2000, 4096)
(2000, 64)
(2000, 4160)
(2000, 4096)
(2000, 64)
(2000, 4160)
(2000, 4096)
(2000, 64)
(2000, 4160)


In [66]:
for ind in range(3):
    df1 = pd.read_csv('data/Xte'+str(ind)+'_'+first_suffix+'.csv', header = None, sep=" ")
    print(df1.shape)
    df2 = pd.read_csv('data/Xte'+str(ind)+'_'+second_suffix+'.csv', header = None, sep=" ")
    print(df2.shape)
    df_emb = pd.concat([df1, df2], axis=1, join='inner')
    print(df_emb.shape)
    df_emb.to_csv('data/'
              + 'Xte' + str(ind) + '_cat' + first_suffix + '-' 
              + second_suffix +'.csv', header = False, index = False, sep=" ")

(1000, 4096)
(1000, 64)
(1000, 4160)
(1000, 4096)
(1000, 64)
(1000, 4160)
(1000, 4096)
(1000, 64)
(1000, 4160)
