In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import tqdm

In [2]:
df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xtr0.csv')

In [3]:
df.head()

Unnamed: 0,Id,seq
0,0,GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1,1,ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2,2,TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3,3,GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4,4,GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


All sequences is this set have length 101.

In [4]:
letters = 'ATCG'

In [5]:
length = 3

In [6]:
def build_voc(letters, length):
    vocl = [''.join(x) for x in itertools.product(letters, repeat=length)]
    voc = {}
    i = 0
    for v in vocl:
        voc[v] = i
        i+=1
    return voc
voc = build_voc(letters, length)
print(voc)

{'AAT': 1, 'AGA': 12, 'TAG': 19, 'ACG': 11, 'CTC': 38, 'GGC': 62, 'GCC': 58, 'ATT': 5, 'CAG': 35, 'GGT': 61, 'ATA': 4, 'GTC': 54, 'GCT': 57, 'GTT': 53, 'GGA': 60, 'ATG': 7, 'CTG': 39, 'TTA': 20, 'AGG': 15, 'CAC': 34, 'CGC': 46, 'ACT': 9, 'TCA': 24, 'AGT': 13, 'CCG': 43, 'TGA': 28, 'CCC': 42, 'TAC': 18, 'CCA': 40, 'TGG': 31, 'TCT': 25, 'TCC': 26, 'GTA': 52, 'CTT': 37, 'TTC': 22, 'ACA': 8, 'CGT': 45, 'AGC': 14, 'TGT': 29, 'GAG': 51, 'TTG': 23, 'GAT': 49, 'GCA': 56, 'GAC': 50, 'TCG': 27, 'AAA': 0, 'TTT': 21, 'CGA': 44, 'ATC': 6, 'CCT': 41, 'TAT': 17, 'CAA': 32, 'CGG': 47, 'ACC': 10, 'GTG': 55, 'TAA': 16, 'TGC': 30, 'CTA': 36, 'AAG': 3, 'GGG': 63, 'CAT': 33, 'AAC': 2, 'GCG': 59, 'GAA': 48}


$\Phi_u(x)$ is the number of occurrences of u in x (without gaps) : *spectrum kernel* (Leslie et al., 2002)

In [7]:
def substrings(x, length):
    n = len(x)
    sub = []
    assert n>=length, 'seq too small'
    for i in range(n-length+1):
        curr = x[i:i+length]
        sub.append(curr)
    return sub

In [8]:
x = df['seq'][0]
sub = substrings(x, 3)
print(sub)

['GGA', 'GAG', 'AGA', 'GAA', 'AAT', 'ATC', 'TCA', 'CAT', 'ATT', 'TTT', 'TTG', 'TGA', 'GAA', 'AAC', 'ACC', 'CCC', 'CCG', 'CGG', 'GGG', 'GGA', 'GAG', 'AGG', 'GGT', 'GTG', 'TGG', 'GGA', 'GAG', 'AGG', 'GGT', 'GTT', 'TTG', 'TGC', 'GCC', 'CCG', 'CGT', 'GTG', 'TGA', 'GAG', 'AGC', 'GCT', 'CTG', 'TGA', 'GAG', 'AGA', 'GAT', 'ATT', 'TTG', 'TGC', 'GCG', 'CGC', 'GCC', 'CCA', 'CAT', 'ATT', 'TTG', 'TGC', 'GCA', 'CAC', 'ACT', 'CTC', 'TCC', 'CCA', 'CAG', 'AGC', 'GCC', 'CCT', 'CTG', 'TGG', 'GGG', 'GGC', 'GCA', 'CAA', 'AAC', 'ACA', 'CAA', 'AAG', 'AGA', 'GAG', 'AGC', 'GCA', 'CAA', 'AAA', 'AAA', 'AAC', 'ACT', 'CTC', 'TCT', 'CTG', 'TGT', 'GTC', 'TCT', 'CTC', 'TCA', 'CAC', 'ACA', 'CAA', 'AAA', 'AAA', 'AAC']


In [9]:
x

'GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGATTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTCTGTCTCACAAAAC'

In [10]:
len(sub)

99

Now we reduce to get the feature vector. Let $\Phi_u(x)$ denote the number of occurrences of $u$ in $x$. The
$k$-spectrum kernel is $K(x, x'):= \sum_{u\in A^k} \Phi_u(x) \Phi_u(x')$.

It corresponds to a linear kernel over the feature space. So we may store all sequences in the feature space of all length 3 subsequences. The features will be sparse: at most $|x|-k+1$ non zero features. 

In [11]:
def encode(sub, voc):
    enc = np.zeros(len(voc))
    for s in sub:
        i = voc[s]
        enc[i] += 1
    return enc

In [12]:
encoding = encode(sub, voc)

In [13]:
print(encoding)

[4. 1. 4. 1. 0. 3. 1. 0. 2. 2. 1. 0. 3. 0. 3. 2. 0. 0. 0. 0. 0. 1. 0. 4.
 2. 2. 1. 0. 3. 1. 3. 2. 4. 2. 2. 1. 0. 0. 3. 3. 2. 1. 1. 2. 0. 1. 1. 1.
 2. 1. 0. 6. 0. 1. 1. 2. 3. 1. 3. 1. 3. 2. 1. 2.]


In [14]:
encoding.sum()

99.0

## Build the embedded data matrices (exact matching)

In [15]:
letters = 'ATCG'
length = 4
voc = build_voc(letters, length)
print('Vocabulary has size', len(voc))

Vocabulary has size 256


In [16]:
for ind in range(3):
    df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xtr'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = encode(substrings(seq, length), voc)
        df_emb.loc[i] = enc
    df_emb.to_csv('kernel-methods-for-machine-learning-2018-2019/'
              + 'Xtr' +str(ind) + '_spectr'+str(length)+'.csv', header = False, index = False, sep=" ")

2000it [00:04, 446.81it/s]
2000it [00:03, 543.79it/s]
2000it [00:03, 546.16it/s]


In [17]:
for ind in range(3):
    df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xte'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = encode(substrings(seq, length), voc)
        df_emb.loc[i] = enc
    df_emb.to_csv('kernel-methods-for-machine-learning-2018-2019/'
              + 'Xte' +str(ind) + '_spectr'+str(length)+'.csv', header = False, index = False, sep=" ")

1000it [00:01, 665.74it/s]
1000it [00:01, 682.00it/s]
1000it [00:01, 665.64it/s]


## Use suffix tree for mismatches

Building suffix tree using Ukkonen's algorithm (external lib)

In [18]:
from suffix_trees import STree

ImportError: No module named 'suffix_trees'

In [19]:
string = 'BANANA' # has 6 suffixes

In [233]:
st = STree.STree("BANANA")

In [234]:
st.find_all('AN')

[1, 3]

In [235]:
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)

In [236]:
s0 = df['seq'][0]
w = 'AAG'

In [210]:
st0 = STree.STree(s0)

Build words at 1 Hamming distance

In [240]:
def build_1_neighborhood(w):
    nset = []
    for i in range(len(w)):
        for j in letters:
            nset.append(w[:i]+j+w[i+1:])

    nset = list(set(nset))
    return nset

In [242]:
nset = build_1_neighborhood(w)
print(nset)

['AAG', 'AAA', 'CAG', 'TAG', 'AAC', 'ACG', 'GAG', 'ATG', 'AAT', 'AGG']


In [243]:
len(nset) # 3+3+3 + 1 exact matching

10

In [244]:
sum([len(st0.find_all(n)) for n in nset]) # all 1-Hamming matches of w in s0

9

In [248]:
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)
print(voc_neigh)

{'AAA': ['AAA', 'TAA', 'AAG', 'AGA', 'CAA', 'ATA', 'GAA', 'AAC', 'ACA', 'AAT'], 'AAT': ['TAT', 'ACT', 'AAA', 'AAG', 'GAT', 'AAC', 'ATT', 'AGT', 'AAT', 'CAT'], 'AAC': ['TAC', 'AAA', 'AAG', 'CAC', 'GAC', 'ACC', 'AGC', 'AAC', 'AAT', 'ATC'], 'AAG': ['AAG', 'AAA', 'CAG', 'TAG', 'AAC', 'ACG', 'GAG', 'ATG', 'AAT', 'AGG'], 'ATA': ['TTA', 'AAA', 'ATT', 'AGA', 'ATG', 'ATA', 'CTA', 'ACA', 'GTA', 'ATC'], 'ATT': ['ACT', 'GTT', 'CTT', 'ATA', 'ATT', 'ATG', 'AGT', 'AAT', 'TTT', 'ATC'], 'ATC': ['CTC', 'TTC', 'ACC', 'AGC', 'ATA', 'AAC', 'ATT', 'ATG', 'GTC', 'ATC'], 'ATG': ['ATC', 'AAG', 'GTG', 'AGG', 'TTG', 'ATA', 'ACG', 'ATT', 'ATG', 'CTG'], 'ACA': ['CCA', 'AAA', 'AGA', 'GCA', 'ATA', 'ACC', 'TCA', 'ACA', 'ACG', 'ACT'], 'ACT': ['AAT', 'TCT', 'GCT', 'ACC', 'CCT', 'ACA', 'ATT', 'ACG', 'AGT', 'ACT'], 'ACC': ['CCC', 'GCC', 'TCC', 'ACC', 'AGC', 'AAC', 'ACA', 'ACG', 'ACT', 'ATC'], 'ACG': ['AAG', 'CCG', 'ATG', 'ACC', 'ACA', 'ACG', 'ACT', 'TCG', 'GCG', 'AGG'], 'AGA': ['AAA', 'AGA', 'CGA', 'ATA', 'AGC', 'GGA', '

In [253]:
def compute_1_hamming_embedding(x, length, voc_neigh):
    st = STree.STree(x)
    enc = np.zeros(len(voc_neigh))
    i = 0
    for w in voc_neigh.keys():
        nset = voc_neigh[w]
        enc[i] = sum([len(st.find_all(n)) for n in nset]) # all 1-Hamming matches of w in s0
        i+=1
    return enc

In [255]:
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)

compute_1_hamming_embedding(s0, length, voc_neigh)

array([ 7.,  5., 12.,  9.,  2.,  3., 10., 16., 11., 15., 20., 15.,  7.,
       10., 21., 14.,  6.,  3., 11.,  8.,  4.,  6., 10., 16., 10., 13.,
       27., 14., 12., 14., 24., 17., 13., 10., 18., 22., 16., 14., 24.,
       19., 23., 24., 36., 32., 13., 13., 27., 29.,  6., 12., 17., 12.,
        6., 13., 20., 16., 20., 22., 35., 22., 15., 18., 27., 24.])

In [256]:
letters = 'ATCG'
length = 4
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)

compute_1_hamming_embedding(s0, length, voc_neigh)

array([ 2.,  3.,  2.,  3.,  1.,  1.,  3.,  3.,  0.,  2.,  7.,  3.,  2.,
        4.,  4.,  5.,  0.,  0.,  1.,  1.,  1.,  2.,  2.,  1.,  1.,  2.,
        6.,  1.,  1.,  2.,  8.,  5.,  2.,  1.,  4.,  4.,  0.,  2.,  4.,
        9.,  8.,  6., 10.,  7.,  1.,  2.,  8.,  4.,  1.,  1.,  4.,  1.,
        0.,  3.,  5.,  2.,  2.,  8., 10.,  4.,  3.,  5.,  9.,  6.,  2.,
        1.,  1.,  2.,  0.,  1.,  1.,  1.,  1.,  2.,  5.,  3.,  0.,  2.,
        3.,  3.,  2.,  0.,  2.,  0.,  1.,  0.,  4.,  1.,  2.,  5.,  4.,
        5.,  2.,  3.,  7.,  4.,  2.,  2.,  4.,  5.,  2.,  2.,  3.,  9.,
        8.,  5., 11.,  8.,  2.,  3.,  5.,  4.,  3.,  4.,  5.,  3.,  1.,
        5.,  5.,  6.,  6., 11., 14.,  5.,  4.,  6., 11.,  8.,  4.,  2.,
        6.,  7.,  1.,  2.,  2.,  5.,  6.,  7.,  6.,  5.,  4.,  4., 10.,
       10.,  3.,  3.,  5.,  5.,  0.,  2.,  5.,  7.,  5.,  5., 11.,  6.,
       10.,  9., 11., 13., 11.,  6.,  8., 10.,  8.,  4., 10., 11., 10.,
       12., 19., 15.,  8.,  6., 12., 18.,  3.,  4.,  5.,  4.,  1

## Build the data matrices (Hamming 1)

In [260]:
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)
voc_neigh = voc.copy()
for w in voc_neigh:
    voc_neigh[w] = build_1_neighborhood(w)
    
print('Vocabulary has size', len(voc))

Vocabulary has size 64


In [261]:
for ind in range(3):
    df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xtr'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = compute_1_hamming_embedding(seq, length, voc_neigh)
        df_emb.loc[i] = enc
    df_emb.to_csv('kernel-methods-for-machine-learning-2018-2019/'
              + 'Xtr' +str(ind) + '_spectr'+str(length)+'_hamming1'+'.csv', header = False, index = False, sep=" ")

2000it [00:39, 50.46it/s]
2000it [01:18, 19.69it/s]
2000it [01:13, 21.24it/s]


In [262]:
for ind in range(3):
    df = pd.read_csv('kernel-methods-for-machine-learning-2018-2019/Xte'+str(ind)+'.csv')
    df_emb = pd.DataFrame(columns = [str(i) for i in range(len(voc))])
    for _, r in tqdm.tqdm(df.iterrows()):
        i = r['Id']
        seq = r['seq']
        enc = compute_1_hamming_embedding(seq, length, voc_neigh)
        df_emb.loc[i] = enc
    df_emb.to_csv('kernel-methods-for-machine-learning-2018-2019/'
              + 'Xte' +str(ind) + '_spectr'+str(length)+'_hamming1'+'.csv', header = False, index = False, sep=" ")

1000it [00:35, 28.00it/s]
1000it [00:14, 66.74it/s]
1000it [00:18, 54.37it/s]
