In [1]:
# Helpers for generating an artificial motif dataset
import random
def random_string(length,alphabet_list):
    """Return a string of `length` symbols drawn uniformly from `alphabet_list`.

    Symbols are picked one at a time with `random.choice`, so the result is
    governed by the state of the global `random` generator.
    """
    picks = []
    for _ in range(length):
        picks.append(random.choice(alphabet_list))
    return ''.join(picks)

def perturb(seed,alphabet_list,p=0.5):
    """Return a noisy copy of `seed`.

    Each symbol of `seed` is, with probability `p`, replaced by a symbol
    drawn at random from `alphabet_list` (the draw may return the same
    symbol again); otherwise it is kept as is.
    """
    mutated = []
    for symbol in seed:
        # One random.random() draw per position, followed by a
        # random.choice() draw only when the position is mutated.
        mutated.append(random.choice(alphabet_list) if random.random() < p else symbol)
    return ''.join(mutated)

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                            random_state=1):
    """Build sequences made of noisy motif copies embedded in random flanks.

    Every generated sequence is the concatenation of one segment per motif;
    each segment is ``left flank + perturbed motif + right flank``.

    Parameters
    ----------
    alphabet : str
        Symbols used for the flanks, the random motives and the perturbation.
    motives : list of str or None
        Motives to plant. When None, `n_motives` random motives of length
        `motif_length` are generated. When given, `motif_length` is taken
        from the first motif and `n_motives` from the list length.
    motif_length : int
        Length of the generated motives (ignored when `motives` is given).
    sequence_length : int
        Target total length of each sequence. It is split evenly between the
        motives with floor division, so the actual length can be smaller.
    n_sequences : int
        Number of sequences to generate.
    n_motives : int
        Number of motives to generate (ignored when `motives` is given).
    p : float
        Per-symbol mutation probability handed to `perturb`.
    random_state : int
        Seed for the global `random` generator.

    Returns
    -------
    motives : list of str
        The planted (unperturbed) motives.
    seqs : list of (str, str)
        ``('>ID<n>', sequence)`` pairs.
    binary_seq : str
        String of '0'/'1' with the same layout as every sequence, where '1'
        marks the motif positions.
    """
    random.seed(random_state)
    alphabet_list = list(alphabet)

    if motives is None:
        motives = [random_string(motif_length, alphabet_list)
                   for _ in range(n_motives)]
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)

    # Floor division keeps the lengths integral on Python 3 as well; plain
    # `/` would produce floats there and break range() and string repetition.
    segment_length = sequence_length // len(motives)
    flanking_length = (segment_length - motif_length) // 2

    seqs = []
    for seq_index in range(n_sequences):
        segments = []
        for motif in motives[:n_motives]:
            # Draw order (left flank, right flank, motif noise) is kept so a
            # given random_state keeps producing the same dataset.
            left_flanking = random_string(flanking_length, alphabet_list)
            right_flanking = random_string(flanking_length, alphabet_list)
            noisy_motif = perturb(motif, alphabet_list, p)
            segments.append(left_flanking + noisy_motif + right_flanking)
        seqs.append(('>ID%d' % seq_index, ''.join(segments)))

    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [2]:
from sklearn.cluster import KMeans
from utilities import Weblogo, MuscleAlignWrapper
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme

In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
def run_tool(motif_finder,seqs, n_motives, min_motif_len, max_motif_len):
    """Fit a motif finder on `seqs` and return the fitted tool.

    Parameters
    ----------
    motif_finder : str
        'meme' selects the `Meme` wrapper; any other value selects
        `EdenWrapper` with a `KMeans` clustering step.
    seqs : list of (header, sequence) pairs
    n_motives : int
        Number of motives to look for (MEME `nmotifs` / KMeans `n_clusters`).
    min_motif_len, max_motif_len : int
        Bounds on the motif width handed to the tool.

    Side effects: the 'meme' branch (re)writes the file 'seqs.fa' in the
    current working directory.
    """
    if motif_finder == 'meme':
        with open('seqs.fa', 'w') as f_train:
            for seq in seqs:
                header = seq[0]
                # Headers may already carry the FASTA '>' marker; adding it
                # unconditionally wrote '>>ID0' into the file.
                if not header.startswith('>'):
                    header = '>' + header
                f_train.write(header + ' \n')
                f_train.write(seq[1] + '\n')

        tool = Meme(alphabet='dna',
                    minw=min_motif_len,
                    maxw=max_motif_len,
                    nmotifs=n_motives)
        tool.fit('seqs.fa')
    else:
        km = KMeans(n_clusters=n_motives)
        tool = EdenWrapper(alphabet='dna',
                           complexity=5,
                           nbits=14,
                           negative_ratio=3,
                           min_subarray_size=min_motif_len,
                           max_subarray_size=max_motif_len,
                           clustering_algorithm=km)
        tool.fit(seqs)
    return tool

In [5]:
def score_seqs(seqs, n_motives, scoring_criteria, tool):
    """Score every sequence position, averaged over all motives.

    Parameters
    ----------
    seqs : list of (header, sequence) pairs
    n_motives : int
        Number of motives held by `tool`; motives are addressed 1-based.
    scoring_criteria : str
        'pwm' or 'hmm'. Currently has no effect, see the note below.
    tool : object exposing ``score_pwm(motif_num=, seq=, zero_padding=)``

    Returns
    -------
    list of lists
        One list per sequence holding the mean over motives of the score
        vectors returned by the tool.
    """
    scores = []
    for seq in seqs:
        # NOTE(review): the 'pwm' branch and the fallback ('hmm') branch both
        # called tool.score_pwm with identical arguments, so a single call is
        # made here. If the tool has a dedicated HMM scorer it should be
        # dispatched on `scoring_criteria` -- confirm against the wrappers.
        per_motif = [tool.score_pwm(motif_num=k + 1, seq=seq[1], zero_padding=True)
                     for k in range(n_motives)]

        # np.vstack always yields a 2-D array, so the mean over axis 0 stays a
        # vector even for a single motif (a 1-D array used to collapse to a
        # scalar and list() raised TypeError).
        stacked = np.vstack(per_motif)
        scores.append(list(np.mean(stacked, axis=0)))
    return scores
# TODO: investigate whether per-motif scores should be combined with max instead of mean

In [6]:
# size of context
# size of motif
# number of motifs
# number of sequences
# 

In [10]:
import numpy as np
from sklearn.metrics import roc_auc_score

def evaluate(motif_finder='meme', # ['meme','eden']
             scoring_criteria='pwm', # ['pwm','hmm']
             motif_length=6,
             n_motives=4,
             sequence_length=30,
             n_sequences=50,
             p=0.2,
             random_state=8):
    """Generate an artificial dataset, fit a motif finder and score it by ROC AUC.

    The tool is fitted on the generated sequences with a motif-width window
    of +/- 2 around the planted motif length (lower bound clamped to 2).
    Every sequence is then scored per position and compared against the
    binary motif mask of the dataset.

    Returns
    -------
    avg_roc : float
        Average of the per-sequence ROC AUC values.
    roc_scores : list
        ROC AUC of each sequence.
    """
    motives, seqs, binary_seq = make_artificial_dataset(
        alphabet='ACGT',
        motif_length=motif_length,
        n_motives=n_motives,
        sequence_length=sequence_length,
        n_sequences=n_sequences,
        p=p,
        random_state=random_state)

    planted_len = len(motives[0])
    tool = run_tool(
        motif_finder=motif_finder,
        seqs=seqs,
        n_motives=n_motives,
        min_motif_len=max(2, planted_len - 2),
        max_motif_len=planted_len + 2)

    scores = score_seqs(
        seqs=seqs,
        n_motives=n_motives,
        scoring_criteria=scoring_criteria,
        tool=tool)

    # The same motif mask is the ground truth for every sequence.
    true_score = [float(int(flag)) for flag in binary_seq]
    roc_scores = [roc_auc_score(true_score, score) for score in scores]
    avg_roc = np.average(roc_scores)
    return avg_roc, roc_scores

In [22]:
%%time
# Single MEME run on 100 sequences with 4 planted motives of length 10.
avg_roc, roc_scores = evaluate(
    motif_finder='meme',      # ['meme','eden']
    scoring_criteria='hmm',   # ['pwm','hmm']
    motif_length=10,
    n_motives=4,
    sequence_length=100,
    n_sequences=100,
)


INFO:meme_wrapper:The output directory 'meme_out' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 100 sites...
nsites = 2nsites = 3nsites = 4nsites = 5nsites = 6nsites = 7nsites = 8nsites = 9nsites = 10nsites = 11nsites = 12nsites = 13nsites = 14nsites = 15nsites = 16nsites = 17nsites = 18nsites = 19nsites = 20nsites = 21nsites = 22nsites = 23nsites = 24nsites = 25nsites = 26nsites = 27nsites = 28nsites = 29nsites = 30nsites = 31nsites = 32nsites = 33nsites = 34nsites = 35nsites = 36nsites = 37nsites = 38nsites = 39nsites = 40nsites = 41nsites = 42nsites = 43nsites = 44nsites = 45nsites = 46nsites = 47nsites = 48nsites = 49nsites = 50nsites = 51nsites = 52nsites = 53nsites = 54nsites = 55nsites = 56nsites = 57nsites = 58nsites = 59nsites = 60nsites = 61nsites = 62nsites = 63nsites = 64nsites = 65nsites = 66nsites = 67nsites = 68nsites = 69nsites = 70nsites = 71nsit

CPU times: user 1.41 s, sys: 372 ms, total: 1.78 s
Wall time: 15.7 s


In [23]:
# Single-argument print calls behave the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(avg_roc)
print(roc_scores)

0.552823660714
[0.52656249999999993, 0.56741071428571421, 0.56339285714285714, 0.54821428571428577, 0.58995535714285718, 0.58191964285714293, 0.5647321428571429, 0.49464285714285705, 0.53705357142857146, 0.57410714285714282, 0.58415178571428572, 0.57678571428571423, 0.54821428571428565, 0.55669642857142854, 0.51250000000000007, 0.53950892857142863, 0.5357142857142857, 0.56874999999999998, 0.5725446428571429, 0.5703125, 0.60892857142857149, 0.56629464285714293, 0.57165178571428577, 0.54843749999999991, 0.52544642857142854, 0.53772321428571423, 0.54397321428571421, 0.56562500000000004, 0.5357142857142857, 0.50290178571428568, 0.58660714285714288, 0.56607142857142867, 0.59843749999999996, 0.54598214285714286, 0.51406249999999998, 0.5658482142857143, 0.57834821428571437, 0.55647321428571439, 0.64330357142857142, 0.58013392857142854, 0.56986607142857137, 0.53236607142857151, 0.58125000000000004, 0.55803571428571419, 0.56607142857142867, 0.53839285714285723, 0.50915178571428577, 0.5508928571

In [17]:
%%time
# Sweep the planted motif length from 10 to 20 with the EDeN-based finder,
# collecting the average and the per-sequence ROC AUC of every run.
motif_len = range(10, 21)
roc_list, roc_scores_list = [], []

for i in motif_len:
    avg_roc, roc_scores = evaluate(
        motif_finder='eden',      # ['meme','eden']
        scoring_criteria='hmm',   # ['pwm','hmm']
        motif_length=i,
        n_motives=4,
        sequence_length=100,
        n_sequences=100,
    )
    roc_list.append(avg_roc)
    roc_scores_list.append(roc_scores)

DEBUG:eden.util:Positive data: Instances: 100 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Negative data: Instances: 300 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Elapsed time: 8.8 secs
INFO:eden.motif:model induction: 100 positive instances 9 s
INFO:eden.motif:motives extraction: 122 motives in 2s
INFO:eden.motif:motives clustering: 4 clusters in 0s
INFO:eden.motif:after filtering: 101 motives 4 clusters in 0s
INFO:eden.motif:motif model construction in 0s
INFO:eden.motif:updated motif counts in 0s
DEBUG:eden.util:Positive data: Instances: 100 ; Features: 16385 with an avg of 2527 features per instance
DEBUG:eden.util:Negative data: Instances: 300 ; Features: 16385 with an avg of 2527 features per instance
DEBUG:eden.util:Elapsed time: 10.6 secs
INFO:eden.motif:model induction: 100 positive instances 11 s
INFO:eden.motif:motives extraction: 78 motives in 3s
INFO:eden.motif:motives clustering: 4 clusters in 0s
INFO:eden.m

CPU times: user 2min 19s, sys: 23 s, total: 2min 42s
Wall time: 3min 14s


In [20]:
# Tabulate the average ROC AUC obtained for each motif length of the sweep.
# Single-argument print calls run unchanged on Python 2 and Python 3.
print("Motif Length \t ROC")
for length, roc in zip(motif_len, roc_list):
    print("%d \t\t %f" % (length, roc))

Motif Length 	 ROC
10 		 0.499732
11 		 0.499643
12 		 0.499688
13 		 0.500000
14 		 0.498875
15 		 0.500000
16 		 0.499219
17 		 0.500074
18 		 0.500000
19 		 0.499583
20 		 0.499813
