# Aim

Given a large set of sequences (or graphs with ordered vertices), find small vertex-ordered subsequences that are most discriminative for the set.

Steps:
- devise a negative set
- learn a discriminative model
- annotate importance on vertices
- extract max subarrays 
- cluster them 
  - use the fast EDeN string kernel
  - clustering algorithm
 
Output: 
1. all sequence motives in each cluster
2. all initial sequences with motif location (begin,end) and cluster id (build regex from all seqs in cluster and run a find iterator)

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs (handy while editing the eden sources).
%load_ext autoreload
%autoreload 2

In [2]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    """Return a string of `length` symbols, each picked with random.choice from alphabet_list."""
    symbols = []
    for _ in range(length):
        symbols.append(random.choice(alphabet_list))
    return ''.join(symbols)

def perturb(seed,alphabet_list,p=0.5):
    """Return a copy of `seed` in which each symbol is, with probability `p`,
    replaced by a random.choice from alphabet_list (which may re-draw the same symbol)."""
    mutated = []
    for symbol in seed:
        # one uniform draw per position decides whether it gets re-sampled
        resample = random.random() < p
        mutated.append(random.choice(alphabet_list) if resample else symbol)
    return ''.join(mutated)

def make_artificial_dataset(alphabet='ACGU', motives=None, motif_length=6, sequence_length=100, n_sequences=1000, n_motives=2, p=0.2):
    """Build a toy dataset of random sequences with a noisy motif planted in the middle.

    Args:
        alphabet: string whose characters are the symbols to draw from.
        motives: optional list of motif strings; when None, `n_motives` random
            motifs of length `motif_length` are generated.  When given, it
            overrides both `motif_length` and `n_motives`.
        motif_length: length of the generated motifs (used only if motives is None).
        sequence_length: target length of every output sequence.
        n_sequences: requested number of sequences; n_sequences // n_motives are
            produced per motif, so the total is rounded down to a multiple of
            the number of motifs.
        n_motives: number of motifs to generate (used only if motives is None).
        p: per-symbol probability passed to perturb() when planting a motif.

    Returns:
        (motives, seqs): the clean motif strings and a list of
        (header, sequence) tuples, with headers '>ID0', '>ID1', ... and the
        motifs cycled in order.

    Raises:
        ValueError: if there is no motif to plant (empty `motives` or n_motives < 1).
    """
    alphabet_list = list(alphabet)

    if motives is None:
        motives = [random_string(motif_length, alphabet_list) for _ in range(n_motives)]
    else:
        n_motives = len(motives)
    if n_motives < 1:
        raise ValueError('at least one motif is required')

    # floor division keeps this an int on Python 3 as well as Python 2
    n_seq_per_motif = n_sequences // n_motives

    counter = 0
    seqs = []
    for i in range(n_seq_per_motif):
        for motif in motives:
            # Size the flanks per motif so every sequence reaches sequence_length,
            # even when the remaining space is odd or the motifs differ in length;
            # the extra symbol (if any) goes to the right flank.
            flank_total = sequence_length - len(motif)
            left_length = flank_total // 2
            right_length = flank_total - left_length
            left_flanking = random_string(left_length, alphabet_list)
            right_flanking = random_string(right_length, alphabet_list)
            noisy_motif = perturb(motif, alphabet_list, p)
            seqs.append(('>ID%d' % counter, left_flanking + noisy_motif + right_flanking))
            counter += 1
    return motives, seqs

In [3]:
# Print the SequenceMotif API (constructor defaults and public methods) for reference.
from eden.motif import SequenceMotif
help(SequenceMotif)

Help on class SequenceMotif in module eden.motif:

class SequenceMotif(__builtin__.object)
 |  Methods defined here:
 |  
 |  __init__(self, min_subarray_size=7, max_subarray_size=10, min_motif_count=1, min_cluster_size=1, training_size=None, negative_ratio=2, shuffle_order=2, n_iter_search=1, complexity=4, nbits=20, clustering_algorithm=None, n_jobs=4, n_blocks=8, block_size=None, pre_processor_n_jobs=4, pre_processor_n_blocks=8, pre_processor_block_size=None, random_state=1)
 |  
 |  fit(self, seqs, neg_seqs=None)
 |      Builds a discriminative estimator.
 |      Identifies the maximal subarrays in the data.
 |      Clusters them with the clustering algorithm provided in the initialization phase.
 |      For each cluster builds a fast sequence search model (Aho Corasick data structure).
 |  
 |  fit_predict(self, seqs, return_list=False)
 |  
 |  fit_transform(self, seqs, return_match=False)
 |  
 |  load(self, obj)
 |  
 |  predict(self, seqs, return_list=False)
 |      Returns for

# Experimental Setup

In [21]:
#setup parameters
alphabet='ACGU'
motives=['AAAAAAAAAA','CCCCCCCCCC','GGGGGGGGGG','UUUUUUUUUU']
sequence_length=100
n_sequences=1000
p=0.3

#make dataset
motives, seqs = make_artificial_dataset(alphabet=alphabet,motives=motives,sequence_length=sequence_length,n_sequences=n_sequences,p=p)

#display
print 'Motives and sample of their perturbed variants:'
alphabet_list=[c for c in alphabet]
for motif in motives: 
    print
    print motif,
    for i in range(9):
        print perturb(motif,alphabet_list,p=p),

Motives and sample of their perturbed variants:

AAAAAAAAAA AAAAAAAAAG CAAAGCAAAA AAAUGGAAGA AUAGAGAGAA CAAAGAAAAA AAAAAAUAAA CAAAAAACAA UAAAAAAAAA AAGAAAAUGA
CCCCCCCCCC CCCCACCCCC CCCCCCCCGC CCCCCCCCCC CCACCCUCCC CAGCCCCCCC CCCCCCCUCU CUCCCGCCCC GACUCCCCCC CCCGCUGCCC
GGGGGGGGGG UGGUUCGGGG UGGGAGGCUG GGAUGGGGGG GGGGAGAGGG GGGAGUAGGG CAUAGGGAGG GGGAAUGCGC GGCCUGUGGG GAGGGGGCCU
UUUUUUUUUU AUGUUUUAUU UUUUUAUUUU UUUUUUUUUU UUUUUAUUUU UUUUUUUUUU UUUUUCUUUA UUUUUUUUUU UUGAUUUUGU CUCGGUAUUU


In [22]:
#save to file
# Each record is written as two lines: the header (already carrying its leading
# '>' for the generated positives) and then the sequence.
fname='artificial_motif_search_dataset.fa'
with open(fname,'w') as f:
    for header,seq in seqs:
        f.write(header+"\n"+seq+"\n")

#save explicit negative sequences
# Negatives are derived from the positives via shuffle_modifier, two per input
# sequence (times=2); order=2 is presumably the k-mer order preserved by the
# shuffle -- confirm against eden.modifier.seq.
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = list(seq_to_seq(seqs, modifier=shuffle_modifier, times=2, order=2))
fname='artificial_motif_search_dataset_negatives.fa'
with open(fname,'w') as f:
    for header,seq in neg_seqs:
        f.write(header+"\n"+seq+"\n")

In [23]:
# Attach EDeN's logging configuration to the root logger so the fit() progress
# messages show up in the notebook.  verbosity=2 presumably selects a detailed
# level -- confirm against eden.util.configure_logging.
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)

In [24]:
%%time
from sklearn.cluster import Birch
ca = Birch(threshold=0.1, n_clusters=4, branching_factor=50)

from eden.motif import SequenceMotif
seqmot = SequenceMotif(complexity=2, nbits=14, clustering_algorithm=ca, min_motif_count=2)
seqmot.fit(seqs, neg_seqs)
seqmot.save('seqmot')

Positive data: Instances: 1000 ; Features: 16385 with an avg of 461 features per instance
Negative data: Instances: 2000 ; Features: 16385 with an avg of 463 features per instance
Elapsed time: 34.7 secs
model induction: 1000 positive instances 37 secs
motives extraction: 625 motives 5 secs
motives clustering: 4 clusters 2 secs
after filtering: 69 motives 4 clusters 0 secs
motif model construction: 0 secs
CPU times: user 29.4 s, sys: 5.65 s, total: 35.1 s
Wall time: 45.8 s


In [25]:
for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

0
CCCCCCCCCC 19
CCCCCCCCC 9
CCCCCCCCCU 4
CCCCCUCCCC 3
CCCCCCCCU 3
CCCCCCCCCG 3
CCCCCACCCC 3
UCCCCCCCC 2
CUCCCCCCCC 2
CCGCCCCCGC 2
CCCGCCCCCC 2
CCCCUCCCCC 2
CCCCGCCCCC 2
CCCCCCCCUC 2
ACCCCCCCUC 2
1
GGGGGGGGGG 17
GGGGGGGGG 7
UGGGGGGGGG 5
UGGGUGGGG 2
GGUGGGGGGG 2
GGGUGGGGG 2
GGGGUGGGGG 2
GGGGGGGUGG 2
GGGGCGGGGG 2
GGGCGGGGGG 2
GGGCGGGCGG 2
GGCGGGGGGG 2
CGGGGGGGG 2
CGAGGGGGGG 2
AGGGGGGGGG 2
2
UUUUUUUUUU 26
UUUUUUUUU 10
UUUUUUGUU 4
UUUUUUUUG 3
UUUUGUUUUU 3
UUUUCUUUUU 3
UUCUUUUUUU 3
UUCUUUUUU 3
GUUUUUUUUU 3
UUUUUUUUUC 2
UUUUUUUUA 2
UUUUUUUGUU 2
UUUUUUUCUU 2
UUUUUUCUA 2
UUUUUGUUUU 2
UUUUCUUCUU 2
UUUUAUUUUU 2
UUGUUUUUU 2
UUCUUUUUUG 2
UUCUUUUCUU 2
UCUUUUUUUU 2
UCUUUUUUUC 2
UAUUUUUUUU 2
CUUUUUUUUU 2
CUUUUUUUU 2
CUUUUUCUU 2
3
AAAAAAAAAA 21
AAAAAAAAA 16
CAAAAAAAAA 6
CAAAAAAAA 2
AGAAAAAAAA 2
AAGAAAAAAA 2
AAAAGAAAAA 2
AAAAAAUAA 2
AAAAAAGAA 2
AAAAAAAGA 2
AAAAAAACAA 2
AAAAAAACA 2
AAAAAAAAAC 2


In [26]:
from eden.motif import SequenceMotif
seqmot2 = SequenceMotif()
seqmot2.load('seqmot')

predictions=seqmot2.predict(seqs, return_list=True)
for p in predictions: print p

[]
[]
[]
[]
[3]
[]
[]
[]
[]
[0]
[]
[]
[]
[0]
[]
[]
[3]
[]
[]
[2]
[]
[]
[]
[]
[]
[0]
[]
[]
[]
[0]
[]
[2]
[3]
[]
[]
[2]
[]
[0]
[]
[]
[]
[]
[]
[]
[]
[0]
[]
[]
[]
[]
[1]
[1]
[]
[]
[]
[]
[3]
[0]
[]
[]
[]
[0]
[]
[]
[]
[]
[1]
[2]
[3]
[]
[]
[2]
[3]
[]
[]
[]
[]
[]
[]
[]
[3]
[]
[1]
[2]
[]
[]
[]
[2]
[]
[]
[]
[]
[]
[]
[]
[]
[3]
[]
[1]
[2]
[3]
[0]
[]
[]
[]
[]
[1]
[]
[]
[]
[1]
[]
[]
[0]
[]
[]
[3]
[0]
[1]
[]
[]
[]
[1]
[]
[]
[0]
[]
[]
[3]
[]
[1]
[2]
[]
[]
[1]
[2]
[]
[]
[1]
[]
[]
[]
[]
[]
[3]
[]
[]
[2]
[]
[]
[]
[]
[]
[]
[]
[2]
[]
[0]
[]
[2]
[3]
[]
[1]
[2]
[3]
[]
[1]
[]
[]
[]
[]
[]
[]
[]
[]
[2]
[3]
[]
[1]
[2]
[]
[]
[]
[2]
[3]
[]
[1]
[2]
[3]
[]
[]
[]
[]
[0]
[]
[]
[]
[0]
[]
[2]
[]
[]
[]
[]
[]
[]
[]
[2]
[]
[0]
[]
[]
[]
[0]
[1]
[]
[]
[0]
[]
[]
[]
[0]
[1]
[2]
[]
[0]
[]
[]
[]
[]
[]
[2]
[]
[]
[]
[]
[]
[0]
[1]
[2]
[3]
[0]
[]
[]
[]
[]
[]
[2]
[3]
[]
[]
[]
[]
[0]
[]
[]
[]
[]
[]
[]
[]
[]
[1]
[]
[]
[0]
[]
[]
[]
[2]
[]
[2]
[]
[]
[]
[]
[]
[0]
[]
[2]
[]
[0]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2]
[]
[]
[1]
[]
[3]
[]
[]
[2]
[3]


In [27]:
predictions=seqmot2.predict(seqs, return_list=False)
for p in predictions: print p

0
0
0
0
1
0
0
0
0
1
0
0
0
1
0
0
1
0
0
1
0
0
0
0
0
1
0
0
0
1
0
1
1
0
0
1
0
1
0
0
0
0
0
0
0
1
0
0
0
0
1
1
0
0
0
0
1
1
0
0
0
1
0
0
0
0
1
1
1
0
0
1
1
0
0
0
0
0
0
0
1
0
1
1
0
0
0
1
0
0
0
0
0
0
0
0
1
0
1
1
1
1
0
0
0
0
1
0
0
0
1
0
0
1
0
0
1
1
1
0
0
0
1
0
0
1
0
0
1
0
1
1
0
0
1
1
0
0
1
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
1
0
1
0
1
1
0
1
1
1
0
1
0
0
0
0
0
0
0
0
1
1
0
1
1
0
0
0
1
1
0
1
1
1
0
0
0
0
1
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
1
0
0
0
1
1
0
0
1
0
0
0
1
1
1
0
1
0
0
0
0
0
1
0
0
0
0
0
1
1
1
1
1
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
1
0
1
0
0
0
0
0
1
0
1
0
1
0
0
0
0
0
0
0
0
0
1
0
0
1
0
1
0
0
1
1
0
0
1
1
0
0
1
1
0
0
1
1
0
0
0
1
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
1
0
0
1
1
0
0
0
0
1
1
1
0
0
0
1
1
0
0
0
1
1
0
0
1
1
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
1
1
1
1
0
0
1
1
0
0
1
0
0
0
0
1
1
1
0
0
0
0
1
1
1
0
1
0
0
1
0
0
0
0
0
1
0
0
0
0
1
0
0
0
1
1
0
1
0
0
1
0
0
0
0
1
0
0
1
0
0
1
0
0
0
0
1
0
1
1
1
1
1
1
0
0
0
0
0
0
1
0
1
1
1
0
0
0
1
0
0
1
1
1
1
1
0
1
0
1
1
0
0
1
0
0
0
0
1
1
0
0
1


In [28]:
predictions=seqmot2.transform(seqs, return_match=True)
for p in predictions: print p

[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(46, 55), (46, 56), (47, 56)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(45, 55)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(44, 53), (45, 54), (45, 55), (46, 55), (46, 56), (47, 56), (47, 57)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(42, 52), (43, 53), (45, 54), (45, 55), (46, 55), (46, 56), (48, 57), (48, 58)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [(44, 53), (46, 56), (48, 57)], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(42, 52), (43, 52), (44, 53)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(43, 52), (44, 53), (44, 54), (45, 54), (45, 55), (46, 55), (46, 56), (47, 56), (47, 57), (48, 57), (48, 58)], [], [], []]
[[], [], [], []]
[[], [], [(45, 55), (46, 56)], []]
[[], [], [], [(45, 54)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [(47, 56)], []]
[[], [],

In [29]:
predictions=seqmot2.transform(seqs, return_match=False)
for p in predictions: print p

[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 1, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 1]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 1, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]

In [30]:
%%time
from sklearn.cluster import MiniBatchKMeans
ca = MiniBatchKMeans(n_clusters=4)

from eden.motif import SequenceMotif
seqmot = SequenceMotif(clustering_algorithm=ca, min_motif_count=2)
seqmot.fit(seqs)

for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

Positive data: Instances: 100 ; Features: 1048577 with an avg of 1835 features per instance
Negative data: Instances: 200 ; Features: 1048577 with an avg of 1838 features per instance
Elapsed time: 5.0 secs
model induction: 100 positive instances 5 secs
motives extraction: 624 motives 14 secs
motives clustering: 4 clusters 0 secs
after filtering: 432 motives 4 clusters 0 secs
motif model construction: 0 secs
0
GGGGGGGGGG 17
GGGGGGGGG 8
UGGGGGGGGG 3
GGGGGGGUG 3
GGGAGGGGG 3
GGGGUGGGGG 2
GGGGGGGGU 2
GGGGGGGGGA 2
GGGGGGGGC 2
GGGGCGGGGG 2
AGGGGGGUG 2
AGGGGGGGGG 2
AGGGGGGGG 2
AGGGGGGGA 2
UUUUGGGGUU 1
UUUGCGUGUA 1
UUUCGGGGGG 1
UUUCAGGGGG 1
UUGUAGUGCC 1
UUGGGGGGA 1
UUCCGGGGGU 1
UGUUUUGGGG 1
UGUGGGGGGG 1
UGGGGGUUCG 1
UGGGGGUGGG 1
UGGGGGGUG 1
UGGGGGGGC 1
UGGGGGGAGA 1
UCUUCGUGUG 1
UCGUGCUUC 1
UCGUCGUGGG 1
UCAGAGUGCU 1
UAAACGGGG 1
GUGUGGGGG 1
GUGGGUGGG 1
GUGGGGGGGG 1
GUGGGGGGGA 1
GUGGGGGGG 1
GUGCUGUGGG 1
GGGUUCAGA 1
GGGUGUUCUC 1
GGGUGGGGGG 1
GGGUGGGGG 1
GGGUGAGGGG 1
GGGGUUUCG 1
GGGGUGCUUA 1
GGGGGG

In [32]:
%%time
from sklearn.cluster import DBSCAN
ca = DBSCAN(eps=0.1, min_samples=3)

from eden.motif import SequenceMotif
seqmot = SequenceMotif(clustering_algorithm=ca, min_motif_count=2)
seqmot.fit(seqs)

for cluster_id in seqmot.motives_db:
    print cluster_id
    for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
        print motif, count

Positive data: Instances: 1000 ; Features: 1048577 with an avg of 1834 features per instance
Negative data: Instances: 2000 ; Features: 1048577 with an avg of 1842 features per instance
Elapsed time: 47.8 secs
model induction: 1000 positive instances 50 secs
motives extraction: 759 motives 13 secs
motives clustering: 29 clusters 0 secs
after filtering: 32 motives 28 clusters 0 secs
motif model construction: 0 secs
0
CAAAAAAAAA 11
1
CCCCCCCCCC 14
CCCCCCCCC 9
2
AAAAAAAAAA 18
AAAAAAAAA 7
3
UAAAAAAAAA 4
4
GGGGUGGGGG 3
5
UUUUUUUUUU 20
UUUUUUUUU 14
6
UAAAAAAAG 4
7
AAAAAAAAAC 3
8
GGGGGGGGGG 14
GGGGGGGGG 5
9
CAGAAAAAAU 3
10
GCCCCCCCCG 3
11
UGGGGGGGGG 9
12
GUUUUUUUUA 4
13
CUUUUUUUUU 4
14
UGGGGGGGU 5
15
GCCCCCCCU 3
16
AUUUUUUUUG 3
17
UCCCCCCCCG 4
18
UUUCUUUUUU 3
19
GCCCCCCCCU 3
20
CCCCCCCCCU 3
21
UUUUUUGUUU 3
22
AGGGGGGGA 3
23
UCCCCCCCU 3
24
CCCCCCCCCG 4
25
ACCCCCCCCC 3
26
GUUUUUUUUU 4
27
GGGGGGAGGG 3
CPU times: user 33 s, sys: 7.57 s, total: 40.5 s
Wall time: 1min 4s
