# Aim

Given a large set of sequences (or graphs with ordered vertices), find small vertex-ordered subsequences that are most discriminative for the set.

Steps:
- devise a negative set
- learn a discriminative model
- annotate importance on vertices
- extract max subarrays 
- cluster them 
 - use fast EDeN string kernel 
 - clustering algorithm
 
Output: 
1. all sequence motives in each cluster
2. all initial sequences with motif location (begin,end) and cluster id (build regex from all seqs in cluster and run a find iterator)

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each execution so edits to library code are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    """Return a string of `length` symbols, each picked with random.choice from alphabet_list.

    A non-positive `length` yields the empty string.
    """
    symbols = []
    for _ in range(length):
        symbols.append(random.choice(alphabet_list))
    return ''.join(symbols)

def perturb(seed,alphabet_list,p=0.5):
    """Return a noisy copy of `seed`.

    Each symbol of `seed` is, with probability `p`, replaced by a symbol drawn
    with random.choice from alphabet_list (the draw may return the same
    symbol); otherwise it is kept as is.
    """
    # The condition is evaluated before the replacement is drawn, so the
    # sequence of RNG calls is: random.random() per symbol, then
    # random.choice() only for the symbols being replaced.
    return ''.join(
        random.choice(alphabet_list) if random.random() < p else symbol
        for symbol in seed
    )

def make_artificial_dataset(alphabet='ACGU', motives=None, motif_length=6, sequence_length=100, n_sequences=1000, n_motives=2, p=0.2):
    """Build an artificial dataset of random sequences with a noisy motif planted in the middle.

    Args:
        alphabet: string whose characters are the symbols to draw from.
        motives: optional list of motif strings. When None, `n_motives`
            random motives of length `motif_length` are generated. When
            given, `motif_length` and `n_motives` are ignored and derived
            from the list itself.
        motif_length: length of each generated motif (only if motives is None).
        sequence_length: total length of every produced sequence.
        n_sequences: requested number of sequences; the actual number is
            n_motives * (n_sequences // n_motives).
        n_motives: number of motives to generate (only if motives is None).
        p: per-symbol perturbation probability passed to perturb().

    Returns:
        (motives, seqs) where seqs is a list of (header, sequence) tuples,
        with headers of the form '>ID<counter>'. Sequences cycle through the
        motives in order.

    Raises:
        ValueError: if there is no motif to plant.
    """
    alphabet_list = list(alphabet)

    if motives is None:
        motives = [random_string(motif_length, alphabet_list) for _ in range(n_motives)]
    if not motives:
        raise ValueError('at least one motif is required')
    n_motives = len(motives)

    # Floor division: identical to the old `/` on Python 2 ints, and keeps
    # the value an int (usable by range) on Python 3.
    n_seq_per_motif = n_sequences // n_motives

    seqs = []
    for _ in range(n_seq_per_motif):
        for motif in motives:
            # Flanks are computed per motif, and the right flank absorbs the
            # odd leftover symbol, so every sequence is exactly
            # sequence_length long (when the motif fits).
            flank_total = sequence_length - len(motif)
            left_length = flank_total // 2
            right_length = flank_total - left_length
            left_flanking = random_string(left_length, alphabet_list)
            right_flanking = random_string(right_length, alphabet_list)
            noisy_motif = perturb(motif, alphabet_list, p)
            seq = left_flanking + noisy_motif + right_flanking
            # len(seqs) is the running record counter.
            seqs.append(('>ID%d' % len(seqs), seq))
    return motives, seqs

# Experimental Setup

In [3]:
#setup parameters
alphabet='ACGU'
motives=['AAAAAAAAAA','CCCCCCCCCC','GGGGGGGGGG','UUUUUUUUUU']
sequence_length=100
n_sequences=100
p=0.3

#make dataset
motives, seqs = make_artificial_dataset(alphabet=alphabet,motives=motives,sequence_length=sequence_length,n_sequences=n_sequences,p=p)

#display
print 'Motives and sample of their perturbed variants:'
alphabet_list=[c for c in alphabet]
for motif in motives: 
    print
    print motif,
    for i in range(9):
        print perturb(motif,alphabet_list,p=p),

Motives and sample of their perturbed variants:

AAAAAAAAAA AAAAACAGCA CGAAUAAAAA AAAAAAGAAA AAAAAUUCAA AAAAAUAUUA AAAACAAAAA ACAAAGAAAA AAAGAAAAUG AAAAAAAACU
CCCCCCCCCC CUCCUCCCCC CCCCCCCCCC CGGACCCCCC CCCCACCCCC CGCCCCCUUC CCUCCCCCCC CCCACCUCCU ACCGCCCCCC CCCCCCCCCC
GGGGGGGGGG GGGGGGAGAG GGGGGGGGGG UGUGGGGGAG GGGAGGCGGG GGGUGCGGGG GUGAGGAGGC GGGGGCGGUG UCGGGAGGGG AAGGGUGGAA
UUUUUUUUUU UUUUCUUGUU UUUCUUCUUU GUUUCUUUUU CUUGUUUUUU UAUGUGUUAU UUGUUUUUUG UUGUUUUUUU UUUCUUUUUU CUUUUAUUUU


In [4]:
#save to file
# Each record becomes two lines: the header, then the sequence.
fname = 'artificial_motif_search_dataset.fa'
with open(fname, 'w') as f:
    for header, seq in seqs:
        f.write('%s\n%s\n' % (header, seq))

In [6]:
# Set up logging for the notebook through EDeN's helper, applied to the root logger.
from eden.util import configure_logging
import logging
# NOTE(review): the meaning of verbosity=2 is defined inside eden.util.configure_logging -- confirm the resulting log level there.
configure_logging(logging.getLogger(),verbosity=2)

In [7]:
%%time
# Cell-local imports: the clusterer from scikit-learn and the motif finder from EDeN.
from sklearn.cluster import Birch
from eden.motif import SequenceMotif

# Clusterer that SequenceMotif will use to group the extracted motives.
clustering_algorithm = Birch(
    threshold=0.1,
    n_clusters=4,
    branching_factor=50,
)

seqmot = SequenceMotif(
    training_size=100,
    complexity=2,
    nbits=14,
    clustering_algorithm=clustering_algorithm,
)
seqmot.fit(seqs)
# Persist the fitted model under the name 'seqmot'; a later cell reloads it.
seqmot.save('seqmot')

model induction: 100 positive instances 7 secs
motives extraction: 50 motives 2 secs
motives clustering: 4 clusters 0 secs
after filtering: 40 motives 4 clusters 0 secs
motif model construction: 0 secs


CPU times: user 2.64 s, sys: 412 ms, total: 3.06 s
Wall time: 10.1 s


In [7]:
for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

0
UUUUUUUUUU 22
UUUUUUUUU 6
UUUUUUUUUC 3
UUUUUUCUUU 3
UUUUUUUCUU 2
UCUUUUUUUU 2
UCUUUUUUG 2
CUUUUUUUUU 2
CUUUUUUUUC 2
CUUUUUUUCU 2
AUUUUUUUUA 2
UUUUUUUUG 1
UUUUUUUGU 1
UUUUUUUCU 1
UUUUUUGUG 1
UUUUUUCGC 1
UUUUUUCCUA 1
UUUUUGCCCG 1
UUUUUCUUUU 1
UUUUUCUCUU 1
UUUUGUUUUU 1
UUUUGUUUUC 1
UUUUGUUUU 1
UUUUCUUUUU 1
UUUUCUUUUC 1
UUUUCUUGUU 1
UUUUCUGUGC 1
UUUUCUCUUU 1
UUUUCAGUGC 1
UUUCUUUUUC 1
UUUCUUUUU 1
UUUCUUUUC 1
UUUCUUCUUU 1
UUUCUGUGA 1
UUUCCUUUUU 1
UUUAUUUUU 1
UUGUUCUGCG 1
UUGCUCUUA 1
UUGCGUUUUU 1
UUGCGUGUGC 1
UUGCGUGCU 1
UUGCGUGCGU 1
UUGCGUAAGG 1
UUCUUUUUUC 1
UUCUUUUUU 1
UUCUUUUUCU 1
UUCUUUUCUU 1
UUCUUGCUUU 1
UUCUUCUCCG 1
UUCUUCCCC 1
UUCUUCCC 1
UGUUUUUUU 1
UGUUUUUUC 1
UGUUUUUGCG 1
UGUGUGCGAU 1
UGUGCUUUC 1
UGCGUUUUU 1
UGCGCUUUC 1
UCUUUUUUUG 1
UCUUUUUCUU 1
UCUUUUUCU 1
UCUUUUCCCC 1
UCUUGCUUCU 1
UCUUCGAGAA 1
UCUUAGUGC 1
UCGUGAUUG 1
UCGCUUUUC 1
UCGCGCUUUG 1
UAUUUUUUUC 1
UAGCGUUG 1
UAGCGUGCG 1
GUUUUUUUUU 1
GUUUUUUUU 1
GUUUUUUUCU 1
GUUUUUUUA 1
GUUUUUUGUG 1
GUUUUUUCUU 1
GUUUUUUAUA 1
GUUUUUGUUG 1
GU

In [8]:
from eden.motif import SequenceMotif
seqmot2 = SequenceMotif()
seqmot2.load('seqmot')

predictions=seqmot2.predict(seqs, return_list=True)
for p in predictions: print p

[1]
[]
[]
[0, 2]
[]
[2]
[]
[0]
[0, 1]
[0, 1]
[3]
[0, 2]
[3]
[]
[]
[0]
[]
[2]
[1, 3]
[0]
[0, 1]
[]
[3]
[0]
[1]
[2]
[0, 3]
[]
[1]
[2]
[]
[]
[1]
[0]
[0, 3]
[1]
[1]
[2]
[]
[0]
[1]
[2]
[0, 3]
[3]
[2]
[]
[3]
[0]
[1]
[2]
[]
[0]
[1]
[2]
[3]
[]
[1]
[2]
[2]
[]
[1]
[2]
[3]
[]
[]
[2]
[]
[2]
[3]
[0, 2]
[0]
[]
[0, 1]
[]
[0]
[0]
[]
[2]
[]
[0]
[1]
[2]
[3]
[0]
[]
[2]
[1]
[]
[]
[]
[]
[0]
[1]
[2]
[]
[]
[1, 3]
[2]
[3]
[0]
[]
[]
[3]
[0]
[1]
[]
[3]
[]
[]
[2]
[0]
[3]
[1]
[2]
[3]
[0]
[1]
[]
[]
[0]
[1]
[0, 2]
[0, 3]
[]
[0]
[2]
[2]
[0]
[0, 1]
[2]
[]
[]
[1]
[]
[]
[0]
[]
[2]
[0, 3]
[0]
[1]
[]
[3]
[]
[1]
[0]
[]
[0, 2]
[1]
[2]
[2, 3]
[0]
[2]
[2]
[1]
[0]
[]
[]
[]
[]
[1, 3]
[]
[0]
[2]
[1]
[]
[3]
[0]
[1]
[2]
[3]
[0]
[]
[2]
[3]
[0]
[1]
[2]
[]
[]
[1]
[2]
[3]
[0]
[]
[2]
[0]
[0]
[1]
[2]
[2, 3]
[0, 2]
[1]
[]
[]
[0, 1]
[]
[2]
[]
[0]
[]
[2]
[]
[3]
[1]
[2]
[]
[0]
[1]
[2]
[]
[3]
[0, 1]
[2]
[3]
[0]
[]
[3]
[]
[0]
[0]
[]
[3]
[0]
[3]
[]
[3]
[]
[1]
[2, 3]
[0, 1]
[0]
[1, 2]
[1, 2]
[3]
[0]
[2]
[1, 2]
[3]
[0]
[]
[2]
[3]
[0]
[]
[]
[0]


In [9]:
predictions=seqmot2.predict(seqs, return_list=False)
for p in predictions: print p

1
0
0
1
0
1
0
1
1
1
1
1
1
0
0
1
0
1
1
1
1
0
1
1
1
1
1
0
1
1
0
0
1
1
1
1
1
1
0
1
1
1
1
1
1
0
1
1
1
1
0
1
1
1
1
0
1
1
1
0
1
1
1
0
0
1
0
1
1
1
1
0
1
0
1
1
0
1
0
1
1
1
1
1
0
1
1
0
0
0
0
1
1
1
0
0
1
1
1
1
0
0
1
1
1
0
1
0
0
1
1
1
1
1
1
1
1
0
0
1
1
1
1
0
1
1
1
1
1
1
0
0
1
0
0
1
0
1
1
1
1
0
1
0
1
1
0
1
1
1
1
1
1
1
1
1
0
0
0
0
1
0
1
1
1
0
1
1
1
1
1
1
0
1
1
1
1
1
0
0
1
1
1
1
0
1
1
1
1
1
1
1
1
0
0
1
0
1
0
1
0
1
0
1
1
1
0
1
1
1
0
1
1
1
1
1
0
1
0
1
1
0
1
1
1
0
1
0
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
0
0
1
1
1
0
1
1
1
1
0
0
1
1
1
1
1
1
1
0
1
0
0
0
1
1
0
0
0
1
1
1
0
1
1
1
1
0
0
0
1
1
0
0
1
1
1
0
1
1
1
0
0
1
0
1
1
1
0
1
1
1
1
1
1
1
0
1
1
0
1
1
0
1
1
0
0
1
0
1
1
0
0
0
1
1
0
1
1
1
0
1
0
1
1
0
1
0
1
0
0
1
1
1
0
1
1
1
1
1
1
0
0
1
0
0
1
1
0
1
0
1
0
1
1
1
1
1
1
0
1
0
0
1
1
1
1
0
0
1
1
1
1
0
1
1
1
1
0
0
1
0
1
0
0
0
1
0
1
1
1
1
0
0
1
0
1
1
1
1
1
0
1
0
1
1
1
0
1
1
1
1
1
1
0
1
0
1
0
0
1
0
1
1
0
0
0
1
1
0
1
1
0
1
1
1
1
1
0
1
1
1
1
1
0
1
0
1
1
0
1
1
0
1
1
1
0
1
0
1
1
0
0
1
1
1
1
1
0
1
1
1
1
1
0
0
0
1
1
1
0
1
0
0
1
0


In [10]:
predictions=seqmot2.transform(seqs, return_match=True)
for p in predictions: print p

[[], [(46, 56), (47, 56), (48, 57)], [], []]
[[], [], [], []]
[[], [], [], []]
[[(46, 56)], [], [(17, 26)], []]
[[], [], [], []]
[[], [], [(47, 56)], []]
[[], [], [], []]
[[(46, 56)], [], [], []]
[[(1, 10)], [(43, 52), (44, 53), (44, 54)], [], []]
[[(11, 20)], [(0, 9)], [], []]
[[], [], [], [(44, 53)]]
[[(10, 19), (57, 67)], [], [(90, 100)], []]
[[], [], [], [(57, 67)]]
[[], [], [], []]
[[], [], [], []]
[[(43, 53)], [], [], []]
[[], [], [], []]
[[], [], [(46, 55), (46, 56), (47, 56), (48, 58)], []]
[[], [(29, 39), (33, 42)], [], [(44, 53), (44, 54), (45, 54), (45, 55), (46, 55), (47, 56), (48, 58)]]
[[(44, 54), (46, 55)], [], [], []]
[[(21, 31)], [(9, 19), (45, 54), (46, 55), (77, 87)], [], []]
[[], [], [], []]
[[], [], [], [(46, 55)]]
[[(50, 60)], [], [], []]
[[], [(44, 53), (44, 54), (45, 55)], [], []]
[[], [], [(44, 53), (44, 54), (45, 54), (45, 55), (46, 55), (46, 56)], []]
[[(36, 46)], [], [], [(3, 12)]]
[[], [], [], []]
[[], [(46, 56), (47, 56)], [], []]
[[], [], [(48, 57)], []]


In [11]:
predictions=seqmot2.transform(seqs, return_match=False)
for p in predictions: print p

[0, 1, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 1, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[1, 1, 0, 0]
[1, 1, 0, 0]
[0, 0, 0, 1]
[1, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 1, 0, 1]
[1, 0, 0, 0]
[1, 1, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 1]
[1, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[1, 0, 0, 1]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 1, 0, 0]
[1, 0, 0, 0]
[1, 0, 0, 1]
[0, 1, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[1, 0, 0, 1]
[0, 0, 0, 1]
[0, 0, 1, 0]
[0, 0, 0, 0]
[0, 0, 0, 1]
[1, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[1, 0, 1, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[1, 1, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]

In [12]:
%%time
from sklearn.cluster import MiniBatchKMeans
clustering_algorithm = MiniBatchKMeans(n_clusters=4)

from eden.motif import SequenceMotif
seqmot = SequenceMotif(training_size=100, clustering_algorithm=clustering_algorithm)
seqmot.fit(seqs)

for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

model induction: 100 positive instances 16 secs
motives extraction: 728 motives 49 secs
motives clustering: 4 clusters 1 secs
after filtering: 531 motives 4 clusters 0 secs
motif model construction: 0 secs


0
CCCCCCCCCC 14
CCCCCCCCC 8
ACCCCCCCCG 6
UCCCCCCCU 4
GCCCCCCCCG 4
GCCCCCCCCA 4
UCCCCCCCCC 3
CCCCCCCCG 3
CCCCCCCCCG 3
ACCCCCCCCU 3
ACCCCCCCA 3
UCUCCCCCG 2
UCCCCCCCCU 2
UCCCCCCCA 2
GCCUCCCCG 2
GCCCCCCCU 2
GCCCCCCCG 2
GCCCCCCCA 2
CCCUCCCCCC 2
CCCCUCCCCC 2
CCCCCCCCU 2
CCCCCCCCA 2
CCCCACCCCC 2
ACCCCCCCU 2
ACCCCCCCCC 2
UUCCCCCCU 1
UGCCCGAAU 1
UGCCCCACCC 1
UCUUCCCGUG 1
UCUCCCCCU 1
UCUAGUGCCC 1
UCCGCCCCCG 1
UCCCUUGUG 1
UCCCUUCUC 1
UCCCUCCCC 1
UCCCCUCUUU 1
UCCCCUCCU 1
UCCCCGCGUG 1
UCCCCGAACA 1
UCCCCCUUGA 1
UCCCCCUCG 1
UCCCCCGUG 1
UCCCCCGGGG 1
UCCCCCGCCG 1
UCCCCCGCAA 1
UCCCCCCCG 1
UCCCCCCCCG 1
UCCCCCCCC 1
UCCCCACCCG 1
UAGCCCCCCU 1
GUUUCCCCCU 1
GUGCCCCCU 1
GGAACCCCCU 1
GCGUUCCCC 1
GCCGCCCCC 1
GCCCUCCCU 1
GCCCUCCCCA 1
GCCCGUGGGG 1
GCCCGGGGA 1
GCCCCUGUGU 1
GCCCCUCCCA 1
GCCCCCUCU 1
GCCCCCUCCU 1
GCCCCCGAAG 1
GCCCCCCCCC 1
GCCCCCCAA 1
GCCCCAACCC 1
GCCCCAACA 1
GCCCACCCCA 1
GCCCAACCAA 1
GAAGCCCCCG 1
CUUCCCCCU 1
CUCUCCCCG 1
CGCCCGAUGA 1
CCUCCCCCCG 1
CCUAGUAGC 1
CCGUGCGUGA 1
CCGCCCCCG 1
CCCUUCUUUC 1
CCCUUC

In [13]:
%%time
from sklearn.cluster import DBSCAN
clustering_algorithm = DBSCAN(eps=0.2, min_samples=3)

from eden.motif import SequenceMotif
seqmot = SequenceMotif(training_size=100, clustering_algorithm=clustering_algorithm)
seqmot.fit(seqs)

for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

model induction: 100 positive instances 15 secs
motives extraction: 693 motives 48 secs
motives clustering: 11 clusters 1 secs
after filtering: 239 motives 10 clusters 0 secs
motif model construction: 0 secs


0
AAAAAAAAAA 10
CAAAAAAAAA 6
AAAAAAAAA 6
CAAAAAAAAC 4
UAAAAAAAU 3
GAAAAAAAG 3
GAAAAAAAAA 3
CAAAAACAAG 3
AAAAAAAAAC 3
UAAAAAAAC 2
UAAAAAAAAU 2
GAAAAAAAA 2
CAAAAAAAC 2
CAAAAAAAAG 2
CAAAAAAAA 2
AACAAAAAAC 2
AACAAAAAAA 2
AAAAAACAAA 2
AAAAAAAAAG 2
UAAAAAUAAA 1
UAAAAAGAAA 1
UAAAAACAAG 1
UAAAAAAAAG 1
UAAAAAAAAC 1
UAAAAAAAAA 1
GAAAAAUAG 1
GAAAAAUAAU 1
GAAAAAUAAA 1
GAAAAAGAAU 1
GAAAAAAGAG 1
GAAAAAACAA 1
GAAAAAAAU 1
GAAAAAAAAU 1
GAAAAAAAAG 1
CAUAAAAAAU 1
CAACAAAAAG 1
CAACAAAAAA 1
CAAAAACAA 1
CAAAAAAAU 1
CAAAAAAACC 1
CAAAAAAAAU 1
AUAAAAAAU 1
AACAAAAAAU 1
AAAUAAAAAU 1
AAAUAAAAAG 1
AAAUAAAAA 1
AAACAAAAAC 1
AAACAAAAAA 1
AAACAAAAA 1
AAAAUAAAAC 1
AAAAGAAAG 1
AAAAAUAAG 1
AAAAAUAAAG 1
AAAAAUAAA 1
AAAAAGAAGA 1
AAAAAGAAA 1
AAAAACAAAC 1
AAAAACAAAA 1
AAAAAAGGGG 1
AAAAAAGAAA 1
AAAAAACAAU 1
AAAAAAAGAA 1
AAAAAAAAG 1
AAAAAAAAC 1
1
UUUUUUUUUU 20
UUUUUUUUU 12
UUUUUUUUUC 3
UUUUUUCUUU 3
CUUUUUUUCU 3
AUUUUUUUG 3
UUUUUUUUG 2
UUUCUUUUUC 2
UUCUUUUUUU 2
GUUUUUUUUU 2
GUUUUUUUU 2
GUUUUUCUU 2
CUUUUUUUUC 2
CUUUUUUUU 2
CUUUU