In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import tqdm
from pprint import pprint

In [10]:
# Load the first training split for exploration.
df = pd.read_csv('data/Xtr0.csv')

In [11]:
# Peek at the first rows to check the layout of the data.
df.head()

Unnamed: 0,Id,seq
0,0,GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1,1,ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2,2,TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3,3,GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4,4,GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


All sequences have length 101

In [12]:
# Codon table: each amino acid (three-letter abbreviation) maps to the list of
# DNA triplets that encode it; 'STOP' collects the three stop triplets.
# The 21 lists together cover all 64 triplets over the alphabet A/T/C/G.
# The key order matters: the integer ids in `aa_id` are assigned in this order.
aa_to_dna = {'Phe': ['TTT', 'TTC'],
            'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
            'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
            'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
            'Ile': ['ATT', 'ATC', 'ATA'],
            'Met': ['ATG'],
            'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
            'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
            'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
            'Tyr': ['TAT', 'TAC'],
            'His': ['CAT', 'CAC'],
            'Gln': ['CAA', 'CAG'],
            'Asn': ['AAT', 'AAC'],
            'Lys': ['AAA', 'AAG'],
            'Asp': ['GAT', 'GAC'],
            'Glu': ['GAA', 'GAG'],
            'Cys': ['TGT', 'TGC'],
            'Trp': ['TGG'],
            'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
            'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
            'STOP': ['TAA', 'TAG', 'TGA']
            }

In [13]:
# Give every amino acid (and the STOP signal) an integer id, numbered from 0
# in the order the keys appear in `aa_to_dna`.
# enumerate replaces the hand-maintained counter, so no loop variables
# (`i`, `v`) are left behind in the notebook namespace for later cells to trip on.
aa_id = {name: idx for idx, name in enumerate(aa_to_dna)}

In [16]:
# Show the amino acid -> id mapping (pprint lists the keys in sorted order, not in id order).
pprint(aa_id)

{'Ala': 8,
 'Arg': 18,
 'Asn': 12,
 'Asp': 14,
 'Cys': 16,
 'Gln': 11,
 'Glu': 15,
 'Gly': 19,
 'His': 10,
 'Ile': 4,
 'Leu': 1,
 'Lys': 13,
 'Met': 5,
 'Phe': 0,
 'Pro': 3,
 'STOP': 20,
 'Ser': 2,
 'Thr': 7,
 'Trp': 17,
 'Tyr': 9,
 'Val': 6}


In [17]:
letters = 'ATCG'
length = 3


def build_voc(letters, length):
    """Index every word of `length` symbols drawn from `letters`.

    Words are enumerated in the order produced by `itertools.product`
    (the last position varies fastest) and numbered from 0.

    Returns a dict mapping each word to its integer id.
    """
    words = map(''.join, itertools.product(letters, repeat=length))
    return {word: idx for idx, word in enumerate(words)}


voc_id = build_voc(letters, length)
print(voc_id)

{'AAA': 0, 'AAT': 1, 'AAC': 2, 'AAG': 3, 'ATA': 4, 'ATT': 5, 'ATC': 6, 'ATG': 7, 'ACA': 8, 'ACT': 9, 'ACC': 10, 'ACG': 11, 'AGA': 12, 'AGT': 13, 'AGC': 14, 'AGG': 15, 'TAA': 16, 'TAT': 17, 'TAC': 18, 'TAG': 19, 'TTA': 20, 'TTT': 21, 'TTC': 22, 'TTG': 23, 'TCA': 24, 'TCT': 25, 'TCC': 26, 'TCG': 27, 'TGA': 28, 'TGT': 29, 'TGC': 30, 'TGG': 31, 'CAA': 32, 'CAT': 33, 'CAC': 34, 'CAG': 35, 'CTA': 36, 'CTT': 37, 'CTC': 38, 'CTG': 39, 'CCA': 40, 'CCT': 41, 'CCC': 42, 'CCG': 43, 'CGA': 44, 'CGT': 45, 'CGC': 46, 'CGG': 47, 'GAA': 48, 'GAT': 49, 'GAC': 50, 'GAG': 51, 'GTA': 52, 'GTT': 53, 'GTC': 54, 'GTG': 55, 'GCA': 56, 'GCT': 57, 'GCC': 58, 'GCG': 59, 'GGA': 60, 'GGT': 61, 'GGC': 62, 'GGG': 63}


Make the inverse dictionary as a sanity check

In [18]:
# Invert the codon table: for every triplet of the vocabulary, find the
# amino acid whose codon list contains it.
dna_to_aa = {}
for trp in voc_id:
    encoders = [aa for aa, codons in aa_to_dna.items() if trp in codons]
    # Keep the first match; a triplet absent from the table raises IndexError here.
    dna_to_aa[trp] = encoders[0]

In [19]:
# Display the triplet -> amino acid table for visual inspection.
dna_to_aa

{'AAA': 'Lys',
 'AAT': 'Asn',
 'AAC': 'Asn',
 'AAG': 'Lys',
 'ATA': 'Ile',
 'ATT': 'Ile',
 'ATC': 'Ile',
 'ATG': 'Met',
 'ACA': 'Thr',
 'ACT': 'Thr',
 'ACC': 'Thr',
 'ACG': 'Thr',
 'AGA': 'Arg',
 'AGT': 'Ser',
 'AGC': 'Ser',
 'AGG': 'Arg',
 'TAA': 'STOP',
 'TAT': 'Tyr',
 'TAC': 'Tyr',
 'TAG': 'STOP',
 'TTA': 'Leu',
 'TTT': 'Phe',
 'TTC': 'Phe',
 'TTG': 'Leu',
 'TCA': 'Ser',
 'TCT': 'Ser',
 'TCC': 'Ser',
 'TCG': 'Ser',
 'TGA': 'STOP',
 'TGT': 'Cys',
 'TGC': 'Cys',
 'TGG': 'Trp',
 'CAA': 'Gln',
 'CAT': 'His',
 'CAC': 'His',
 'CAG': 'Gln',
 'CTA': 'Leu',
 'CTT': 'Leu',
 'CTC': 'Leu',
 'CTG': 'Leu',
 'CCA': 'Pro',
 'CCT': 'Pro',
 'CCC': 'Pro',
 'CCG': 'Pro',
 'CGA': 'Arg',
 'CGT': 'Arg',
 'CGC': 'Arg',
 'CGG': 'Arg',
 'GAA': 'Glu',
 'GAT': 'Asp',
 'GAC': 'Asp',
 'GAG': 'Glu',
 'GTA': 'Val',
 'GTT': 'Val',
 'GTC': 'Val',
 'GTG': 'Val',
 'GCA': 'Ala',
 'GCT': 'Ala',
 'GCC': 'Ala',
 'GCG': 'Ala',
 'GGA': 'Gly',
 'GGT': 'Gly',
 'GGC': 'Gly',
 'GGG': 'Gly'}

In [20]:
# Take the first sequence of the loaded split as a running example.
seq = df['seq'][0]
print(seq)

GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGATTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCAAAACTCTGTCTCACAAAAC


In [21]:
# Fraction of sequences that contain the triplet 'ATG' somewhere (at any offset).
sum('ATG' in sequence for sequence in df['seq']) / df.shape[0]

0.7805

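ATG codes for methionine and is the start codon that initiates translation. It is present in only 78% of the samples, so we cannot simply throw away the sequences that lack it. This means the sequences are incomplete: without a start codon we do not know the reading frame, so we cannot translate them directly...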

## Adaptation of the spectrum kernel using genetic coding

### Length 3

In [22]:
def substrings(x, length):
    """Return every contiguous window of `length` characters of `x`, left to right.

    Windows overlap and advance one position at a time, so a sequence of
    n characters yields n - length + 1 windows.

    Raises AssertionError ('seq too small') if `x` is shorter than `length`.
    """
    last_start = len(x) - length
    assert last_start >= 0, 'seq too small'
    return [x[start:start + length] for start in range(last_start + 1)]

In [23]:
# A length-101 sequence should give 101 - 3 + 1 = 99 overlapping triplets.
len(substrings(seq, 3))

99

In [24]:
def encode(sub, voc):
    """Count how often each amino acid is encoded by the triplets in `sub`.

    Each triplet is translated with the global `dna_to_aa` table, and the
    resulting amino acid is looked up in `voc` to find its slot.

    Returns a float numpy vector of length len(voc) holding the counts.
    Raises KeyError for a triplet missing from `dna_to_aa`.
    """
    counts = np.zeros(len(voc))
    for codon in sub:
        amino_acid = dna_to_aa[codon]
        counts[voc[amino_acid]] += 1
    return counts

In [25]:
# Settings for the length-3 embedding. `length` and `voc` are read as globals
# by the loops that call encode(substrings(seq, length), voc), so this cell
# must be the last settings cell executed before running them.
letters = 'ATCG'
length = 3
voc = aa_id
print('Vocabulary has size', len(voc))

Vocabulary has size 21


In [26]:
# Embed the three training splits as amino-acid count vectors (one row per
# sequence, one column per entry of `voc`) and write them to
# data/Xtr<k>_translated.csv, space-separated, without header or index.
# Relies on the globals `length` and `voc` holding the length-3 settings:
# `encode` translates single triplets only.
for ind in range(3):
    df = pd.read_csv('data/Xtr' + str(ind) + '.csv')
    # Collect all count vectors first and build the frame once. Enlarging a
    # DataFrame row by row with .loc re-copies the frame on every insertion.
    # Iterating the column directly also gives tqdm a total and leaves the
    # notebook-level `seq` / `i` variables untouched.
    rows = [encode(substrings(sequence, length), voc)
            for sequence in tqdm.tqdm(df['seq'])]
    df_emb = pd.DataFrame(rows, columns=[str(col) for col in range(len(voc))])
    df_emb.to_csv('data/Xtr' + str(ind) + '_translated.csv',
                  header=False, index=False, sep=" ")


2000it [00:02, 673.28it/s]
0it [00:00, ?it/s]


KeyboardInterrupt: 

In [79]:
# Embed the three test splits as amino-acid count vectors (one row per
# sequence, one column per entry of `voc`) and write them to
# data/Xte<k>_translated.csv, space-separated, without header or index.
# Relies on the globals `length` and `voc` holding the length-3 settings:
# `encode` translates single triplets only.
for ind in range(3):
    df = pd.read_csv('data/Xte' + str(ind) + '.csv')
    # Collect all count vectors first and build the frame once. Enlarging a
    # DataFrame row by row with .loc re-copies the frame on every insertion.
    # Iterating the column directly also gives tqdm a total and leaves the
    # notebook-level `seq` / `i` variables untouched.
    rows = [encode(substrings(sequence, length), voc)
            for sequence in tqdm.tqdm(df['seq'])]
    df_emb = pd.DataFrame(rows, columns=[str(col) for col in range(len(voc))])
    df_emb.to_csv('data/Xte' + str(ind) + '_translated.csv',
                  header=False, index=False, sep=" ")

1000it [00:01, 670.51it/s]
1000it [00:01, 672.96it/s]
1000it [00:01, 759.24it/s]


### Length 6: TODO

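Compare subsequences of length 6 exactly, up to the genetic code: two 6-mers match when they encode the same pair of amino acids. This should be better than allowing random substitutions. Each 6-mer is split into two consecutive triplets, each triplet is translated into an amino acid, and the resulting pairs are compared. The vocabulary has size $21\times 21=441$.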

In [37]:
# Vocabulary of ordered amino-acid pairs 'First-Second', numbered from 0 with
# the second element varying fastest (same order as two nested loops over aa_id).
# enumerate over itertools.product replaces the hand-maintained counter and
# leaves no loop variables (`i`, `k`, `j`) behind in the notebook namespace.
voc6 = {
    first + '-' + second: idx
    for idx, (first, second) in enumerate(itertools.product(aa_id, repeat=2))
}
voc6

{'Phe-Phe': 0,
 'Phe-Leu': 1,
 'Phe-Ser': 2,
 'Phe-Pro': 3,
 'Phe-Ile': 4,
 'Phe-Met': 5,
 'Phe-Val': 6,
 'Phe-Thr': 7,
 'Phe-Ala': 8,
 'Phe-Tyr': 9,
 'Phe-His': 10,
 'Phe-Gln': 11,
 'Phe-Asn': 12,
 'Phe-Lys': 13,
 'Phe-Asp': 14,
 'Phe-Glu': 15,
 'Phe-Cys': 16,
 'Phe-Trp': 17,
 'Phe-Arg': 18,
 'Phe-Gly': 19,
 'Phe-STOP': 20,
 'Leu-Phe': 21,
 'Leu-Leu': 22,
 'Leu-Ser': 23,
 'Leu-Pro': 24,
 'Leu-Ile': 25,
 'Leu-Met': 26,
 'Leu-Val': 27,
 'Leu-Thr': 28,
 'Leu-Ala': 29,
 'Leu-Tyr': 30,
 'Leu-His': 31,
 'Leu-Gln': 32,
 'Leu-Asn': 33,
 'Leu-Lys': 34,
 'Leu-Asp': 35,
 'Leu-Glu': 36,
 'Leu-Cys': 37,
 'Leu-Trp': 38,
 'Leu-Arg': 39,
 'Leu-Gly': 40,
 'Leu-STOP': 41,
 'Ser-Phe': 42,
 'Ser-Leu': 43,
 'Ser-Ser': 44,
 'Ser-Pro': 45,
 'Ser-Ile': 46,
 'Ser-Met': 47,
 'Ser-Val': 48,
 'Ser-Thr': 49,
 'Ser-Ala': 50,
 'Ser-Tyr': 51,
 'Ser-His': 52,
 'Ser-Gln': 53,
 'Ser-Asn': 54,
 'Ser-Lys': 55,
 'Ser-Asp': 56,
 'Ser-Glu': 57,
 'Ser-Cys': 58,
 'Ser-Trp': 59,
 'Ser-Arg': 60,
 'Ser-Gly': 61,
 'Ser-STOP': 62,

In [61]:
def encode_6(sub, voc):
    """Count amino-acid pairs encoded by the 6-letter words in `sub`.

    Each word is split into its first and last three letters, both halves are
    translated with the global `dna_to_aa` table, and the pair is looked up
    in `voc` under the key 'First-Second'.

    Returns a float numpy vector of length len(voc) holding the counts.
    Raises KeyError if a half is missing from `dna_to_aa` or the pair from `voc`.
    """
    counts = np.zeros(len(voc))
    for word in sub:
        head = dna_to_aa[word[:3]]
        tail = dna_to_aa[word[3:]]
        counts[voc[head + '-' + tail]] += 1
    return counts

In [65]:
# Settings for the length-6 embedding. `length` and `voc` are read as globals
# by the loops that call encode_6(substrings(seq, length), voc); this rebinds
# the names previously used for the length-3 embedding.
letters = 'ATCG'
length = 6
voc = voc6
print('Vocabulary has size', len(voc))

Vocabulary has size 441


In [66]:
# Embed the three training splits as amino-acid-pair count vectors (one row
# per sequence, one column per entry of `voc`) and write them to
# data/Xtr<k>_trans6.csv, space-separated, without header or index.
# Relies on the globals `length` and `voc` holding the length-6 settings:
# `encode_6` splits each word into two triplets.
for ind in range(3):
    df = pd.read_csv('data/Xtr' + str(ind) + '.csv')
    # Collect all count vectors first and build the frame once. Enlarging a
    # wide DataFrame row by row with .loc re-copies it on every insertion.
    # Iterating the column directly also gives tqdm a total and leaves the
    # notebook-level `seq` / `i` variables untouched.
    rows = [encode_6(substrings(sequence, length), voc)
            for sequence in tqdm.tqdm(df['seq'])]
    df_emb = pd.DataFrame(rows, columns=[str(col) for col in range(len(voc))])
    df_emb.to_csv('data/Xtr' + str(ind) + '_trans6.csv',
                  header=False, index=False, sep=" ")


2000it [00:08, 237.49it/s]
2000it [00:09, 222.05it/s]
2000it [00:08, 223.10it/s]


In [67]:
# Embed the three test splits as amino-acid-pair count vectors (one row per
# sequence, one column per entry of `voc`) and write them to
# data/Xte<k>_trans6.csv, space-separated, without header or index.
# Relies on the globals `length` and `voc` holding the length-6 settings:
# `encode_6` splits each word into two triplets.
for ind in range(3):
    df = pd.read_csv('data/Xte' + str(ind) + '.csv')
    # Collect all count vectors first and build the frame once. Enlarging a
    # wide DataFrame row by row with .loc re-copies it on every insertion.
    # Iterating the column directly also gives tqdm a total and leaves the
    # notebook-level `seq` / `i` variables untouched.
    rows = [encode_6(substrings(sequence, length), voc)
            for sequence in tqdm.tqdm(df['seq'])]
    df_emb = pd.DataFrame(rows, columns=[str(col) for col in range(len(voc))])
    df_emb.to_csv('data/Xte' + str(ind) + '_trans6.csv',
                  header=False, index=False, sep=" ")


1000it [00:03, 307.38it/s]
1000it [00:03, 242.75it/s]
1000it [00:02, 338.78it/s]
