In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import tqdm

In [2]:
# Load the first training split; the cells below read its 'seq' column.
df = pd.read_csv('data/Xtr0.csv')

In [3]:
# Preview the first few rows of the training split.
df.head()

Unnamed: 0,Id,seq
0,0,GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1,1,ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2,2,TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3,3,GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4,4,GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


All sequences in this set have length 101.

In [4]:
# Alphabet from which the vocabulary of fixed-length words is built.
letters = 'ATCG'

In [5]:
# Word length used by the walkthrough cells below.
length = 6

In [6]:
def build_voc(letters, length):
    """Index every word of a given length over an alphabet.

    Words are enumerated in the order produced by ``itertools.product``
    and numbered consecutively from 0.

    Parameters
    ----------
    letters : iterable of str
        Alphabet symbols.
    length : int
        Number of symbols per word.

    Returns
    -------
    dict
        Maps each word (str) to its integer index.
    """
    words = map(''.join, itertools.product(letters, repeat=length))
    return {word: index for index, word in enumerate(words)}
# Index every word of `length` letters; with 4 letters and length 6 this
# gives 4 ** 6 = 4096 entries.
voc = build_voc(letters, length)

$\Phi_u(x)$ is the number of occurrences of $u$ in $x$ (without gaps); this feature map defines the *spectrum kernel* (Leslie et al., 2002).

In [7]:
def substrings(x, length):
    """Return every contiguous window of ``length`` characters of ``x``.

    Windows are listed from left to right and overlap, so a sequence of
    ``n`` characters yields ``n - length + 1`` of them.

    Raises
    ------
    AssertionError
        If ``x`` is shorter than ``length``.
    """
    total = len(x)
    assert total >= length, 'seq too small'
    window_count = total - length + 1
    return [x[start:start + length] for start in range(window_count)]

In [8]:
# Worked example: cut the first training sequence into overlapping windows.
x = df['seq'][0]
sub = substrings(x, length)
print(sub)

['GGAGAA', 'GAGAAT', 'AGAATC', 'GAATCA', 'AATCAT', 'ATCATT', 'TCATTT', 'CATTTG', 'ATTTGA', 'TTTGAA', 'TTGAAC', 'TGAACC', 'GAACCC', 'AACCCG', 'ACCCGG', 'CCCGGG', 'CCGGGA', 'CGGGAG', 'GGGAGG', 'GGAGGT', 'GAGGTG', 'AGGTGG', 'GGTGGA', 'GTGGAG', 'TGGAGG', 'GGAGGT', 'GAGGTT', 'AGGTTG', 'GGTTGC', 'GTTGCC', 'TTGCCG', 'TGCCGT', 'GCCGTG', 'CCGTGA', 'CGTGAG', 'GTGAGC', 'TGAGCT', 'GAGCTG', 'AGCTGA', 'GCTGAG', 'CTGAGA', 'TGAGAT', 'GAGATT', 'AGATTG', 'GATTGC', 'ATTGCG', 'TTGCGC', 'TGCGCC', 'GCGCCA', 'CGCCAT', 'GCCATT', 'CCATTG', 'CATTGC', 'ATTGCA', 'TTGCAC', 'TGCACT', 'GCACTC', 'CACTCC', 'ACTCCA', 'CTCCAG', 'TCCAGC', 'CCAGCC', 'CAGCCT', 'AGCCTG', 'GCCTGG', 'CCTGGG', 'CTGGGC', 'TGGGCA', 'GGGCAA', 'GGCAAC', 'GCAACA', 'CAACAA', 'AACAAG', 'ACAAGA', 'CAAGAG', 'AAGAGC', 'AGAGCA', 'GAGCAA', 'AGCAAA', 'GCAAAA', 'CAAAAC', 'AAAACT', 'AAACTC', 'AACTCT', 'ACTCTG', 'CTCTGT', 'TCTGTC', 'CTGTCT', 'TGTCTC', 'GTCTCA', 'TCTCAC', 'CTCACA', 'TCACAA', 'CACAAA', 'ACAAAA', 'CAAAAC']


In [9]:
# Show the full sequence the windows above were cut from.
x

'GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGATTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTCTGTCTCACAAAAC'

In [10]:
# Number of windows: len(x) - length + 1.
len(sub)

96

Now we reduce to get the feature vector. Let $\Phi_u(x)$ denote the number of occurrences of $u$ in $x$. The
$k$-spectrum kernel is $K(x, x'):= \sum_{u\in A^k} \Phi_u(x) \Phi_u(x')$.

It corresponds to a linear kernel over the feature space, so we may represent each sequence by its vector of counts over all length-$k$ substrings ($k$-mers). These features are sparse: at most $|x|-k+1$ of them are nonzero.

In [11]:
def encode(sub, voc):
    """Count how often each vocabulary word occurs in ``sub``.

    Parameters
    ----------
    sub : iterable of str
        Words to count; each one must be a key of ``voc``.
    voc : dict
        Maps a word to its position in the output vector.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(voc)`` holding the occurrence counts.

    Raises
    ------
    KeyError
        If a word of ``sub`` is missing from ``voc``.
    """
    counts = np.zeros(len(voc))
    # Look words up lazily so a failure surfaces at the offending word.
    for position in (voc[word] for word in sub):
        counts[position] += 1
    return counts

In [12]:
# Count vector of the example sequence over the vocabulary.
encoding = encode(sub, voc)

In [13]:
# Sanity check: the counts add up to the number of windows, len(sub).
encoding.sum()

96.0

## Compute TF-IDF index of the vocabulary

In [35]:
# Switch to shorter words for the TF-IDF features. This overwrites the
# `length` and `voc` defined for the walkthrough above.
letters = 'ATCG'
length = 3
voc = build_voc(letters, length)
print('Vocabulary has size', len(voc))

Vocabulary has size 64


In [36]:
# One TF-IDF matrix per dataset index, of shape (n_train + n_test, len(voc)).
# Rows follow the training file first, then the test file.
tf_idfs = []
for ind in range(3):
    df = pd.read_csv('data/Xtr'+str(ind)+'.csv')
    df2 = pd.read_csv('data/Xte'+str(ind)+'.csv')
    # Pool train and test so both share the same document statistics.
    # The sequence column is selected by name rather than by position.
    seqs = np.concatenate((df['seq'].values, df2['seq'].values))

    tf = np.zeros((len(seqs), len(voc)))
    for j in tqdm.tqdm(range(tf.shape[0])):
        tf[j, :] = encode(substrings(seqs[j], length), voc)

    # Term frequency: counts scaled by the count of the most frequent word
    # of each string.
    tf = tf / tf.max(axis=1, keepdims=True)

    # Document frequency: number of strings in which each word appears.
    doc_freq = (tf != 0).sum(axis=0)

    # Inverse document frequency. The previous code multiplied by the raw
    # document frequency, which up-weights the most common words. The
    # smoothed form log((1 + n) / (1 + df)) + 1 is always finite and
    # strictly positive, so no feature is zeroed out or divided by zero.
    n_docs = tf.shape[0]
    idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1

    tf_idf = tf*idf
    tf_idfs.append(tf_idf)

100%|██████████| 3000/3000 [00:00<00:00, 3187.31it/s]
100%|██████████| 3000/3000 [00:01<00:00, 1871.76it/s]
100%|██████████| 3000/3000 [00:01<00:00, 2343.89it/s]


## Build the embedded data matrices (exact matching)

In [37]:
def _write_embedding(df, tf_idf, offset, path):
    """Embed the sequences of ``df`` and write the result to ``path``.

    Row ``k`` of the output is the count vector of the ``k``-th sequence of
    ``df`` multiplied elementwise by ``tf_idf[offset + k]``. The file is
    space-separated, with no header and no index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'seq' column.
    tf_idf : numpy.ndarray
        Weight matrix whose rows ``offset, offset + 1, ...`` match the
        rows of ``df``.
    offset : int
        Index of the ``tf_idf`` row that matches the first row of ``df``.
    path : str
        Destination file.

    Returns
    -------
    int
        Index of the first ``tf_idf`` row that was not consumed.
    """
    # Fill a preallocated matrix instead of appending rows to a DataFrame
    # one at a time, which copies data on every insertion.
    emb = np.empty((len(df), len(voc)))
    for k, seq in enumerate(tqdm.tqdm(df['seq'])):
        enc = encode(substrings(seq, length), voc)
        # NOTE(review): tf_idf rows already contain the max-normalised
        # counts, so this product is quadratic in the counts. Confirm that
        # this is the intended reweighting.
        emb[k] = enc * tf_idf[offset + k]
    pd.DataFrame(emb).to_csv(path, header=False, index=False, sep=" ")
    return offset + len(df)


for ind in range(3):
    # TF-IDF reweighting: rows are ordered train first, then test.
    tf_idf = tf_idfs[ind]
    offset = 0
    for split in ('Xtr', 'Xte'):
        df = pd.read_csv('data/' + split + str(ind) + '.csv')
        out_path = 'data/' + split + str(ind) + '_tfidf' + str(length) + '.csv'
        offset = _write_embedding(df, tf_idf, offset, out_path)
    # Every weight row must have been paired with exactly one sequence.
    assert offset == tf_idf.shape[0], 'tf_idf rows do not match train + test rows'

2000it [00:11, 167.85it/s]
1000it [00:04, 216.43it/s]
2000it [00:08, 227.15it/s]
1000it [00:03, 310.96it/s]
2000it [00:08, 241.69it/s]
1000it [00:05, 199.04it/s]
