In [1]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
        seqs.append(('>ID%d'%counter,total_seq))
        counter += 1
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [2]:
from sklearn.cluster import KMeans
from utilities import Weblogo, MuscleAlignWrapper
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme

In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
def run_tool(motif_finder,seqs, n_motives, min_motif_len, max_motif_len):
    if motif_finder=='meme':
        with open('seqs.fa','w') as f_train:
            for seq in seqs:
                f_train.write('>' + seq[0] + ' \n')
                f_train.write(seq[1] + '\n')

        tool =  Meme(alphabet='dna',
                     minw=min_motif_len,
                     maxw=max_motif_len,
                     nmotifs=n_motives,
                     maxsize=100000)
        tool.fit('seqs.fa')
    else:
        km = KMeans(n_clusters=n_motives)
        tool = EdenWrapper(alphabet='dna', 
                           complexity=5, 
                           nbits=14, 
                           negative_ratio=3,
                           min_subarray_size=min_motif_len, 
                           max_subarray_size=max_motif_len,
                           clustering_algorithm=km)
        tool.fit(seqs)
    return tool

In [5]:
def score_seqs(seqs, n_motives, scoring_criteria, tool):
    scores = []
    for j in range(len(seqs)):
        seq_scr = []
        for k in range(n_motives):
            if scoring_criteria=='pwm':
                scr=tool.score(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            else: # scoring_criteria=='hmm'
                scr=tool.score(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            seq_scr.append(scr)

        # taking average over all motives for a sequence
        x = np.array(seq_scr[0])
        for k in range(1, n_motives):
            x = np.vstack((x, seq_scr[k]))
        seq_scr = list(np.mean(x, axis=0))
        scores.append(seq_scr)
    return scores
#investigate max or mean

In [6]:
import numpy as np
from sklearn.metrics import roc_auc_score

def evaluate(motif_finder='meme', # ['meme','eden']
             scoring_criteria='pwm', # ['meme','hmm']
             motives=None,
             motif_length=6,
             n_motives=4,
             sequence_length=100,
             n_sequences=50,
             p=0.2,
             random_state=8):

    motives, seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                        motives=motives,
                                                        sequence_length=sequence_length,
                                                        n_sequences=n_sequences,
                                                        motif_length=motif_length,
                                                        n_motives=n_motives,
                                                        p=p,
                                                        random_state=random_state)
    tool = run_tool(motif_finder=motif_finder,
                    seqs=seqs,
                    n_motives=n_motives,
                    min_motif_len=max(2, len(motives[0])-2),
                    max_motif_len=len(motives[0])+2)
    
    scores = score_seqs(seqs=seqs,
                        n_motives=n_motives,
                        scoring_criteria=scoring_criteria,
                        tool=tool)
    roc_scores = []
    #true_score = [float(int(x)) for x in binary_seq]
    true_score = np.zeros(len(binary_seq))
    for i in range(1,len(binary_seq)):
        if binary_seq[i-1]=='0' and binary_seq[i]=='1':
            true_score[i]=1

    for score in scores:
        roc_scores.append(roc_auc_score(true_score, score))
    avg_roc = np.average(roc_scores)
    max_roc = max(roc_scores)
    return avg_roc, max_roc, roc_scores

<h3>Experiment 1: Varying number of motives</h3>

In [72]:
%%time
avg_roc_list=[]
max_roc_list=[]
roc_scores_list=[]

parameter = range(4,9)

for i in parameter:
    avg_roc, max_roc, roc_scores = evaluate(motif_finder='meme', # ['meme','eden']
                                            scoring_criteria='pwm', # ['pwm','hmm']
                                            motif_length=15,
                                            n_motives=i,
                                            sequence_length=300,
                                            n_sequences=200,
                                            p=0.01)
    avg_roc_list.append(avg_roc)
    max_roc_list.append(max_roc)
    roc_scores_list.append(roc_scores)

DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
CPU times: user 1min 11s, sys: 544 ms, total: 1min 11s
Wall time: 15min 47s


In [73]:
print "Number of Motives \t Avg-ROC \t Max-ROC"
for i,r in enumerate(avg_roc_list):
    print "%d \t\t %f \t %f"%(parameter[i],r, max_roc_list[i])

Number of Motives 	 Avg-ROC 	 Max-ROC
4 		 1.000000 	 1.000000
5 		 1.000000 	 1.000000
6 		 1.000000 	 1.000000
7 		 1.000000 	 1.000000
8 		 1.000000 	 1.000000


<h3>Experiment 2: Varying Motif Length</h3>

In [74]:
%%time
avg_roc_list=[]
max_roc_list=[]
roc_scores_list=[]

parameter = range(5,11)

for i in parameter:
    avg_roc, max_roc, roc_scores = evaluate(motif_finder='meme', # ['meme','eden']
                          scoring_criteria='hmm', # ['pwm','hmm']
                          motif_length=i,
                          n_motives=4,
                          sequence_length=150,
                          n_sequences=100,
                          p=0.2)
    avg_roc_list.append(avg_roc)
    max_roc_list.append(max_roc)
    roc_scores_list.append(roc_scores)

DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
CPU times: user 7.66 s, sys: 172 ms, total: 7.84 s
Wall time: 2min 58s


In [75]:
print "Motif Length \t Avg-ROC \t Max-ROC"
for i,r in enumerate(avg_roc_list):
    print "%d \t\t %f \t %f"%(parameter[i],r,max_roc_list[i])

Motif Length 	 Avg-ROC 	 Max-ROC
5 		 0.741806 	 1.000000
6 		 0.712509 	 1.000000
7 		 0.852135 	 1.000000
8 		 0.930786 	 1.000000
9 		 0.945955 	 1.000000
10 		 0.976714 	 1.000000


<h3>Experiment 3: Fixed Motives with Varying Sequence Length</h3>

In [77]:
%%time

semi_len=5
motives=['A'*semi_len+'C'*semi_len,
         'C'*semi_len+'A'*semi_len,
         'A'*semi_len+'T'*semi_len,
         'T'*semi_len+'A'*semi_len,
         'A'*semi_len+'G'*semi_len,
         'G'*semi_len+'A'*semi_len,
         'G'*semi_len+'C'*semi_len,
         'C'*semi_len+'G'*semi_len,]

avg_roc_list=[]
max_roc_list=[]
roc_scores_list=[]

parameter = range(250,410,50)

for i in parameter:
    avg_roc, max_roc, roc_scores = evaluate(motif_finder='meme', # ['meme','eden']
                                            scoring_criteria='pwm', # ['pwm','hmm']
                                            motives=motives,
                                            sequence_length=i,
                                            n_sequences=200,
                                            p=0.1)
    avg_roc_list.append(avg_roc)
    max_roc_list.append(max_roc)
    roc_scores_list.append(roc_scores)

DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
CPU times: user 28.7 s, sys: 276 ms, total: 29 s
Wall time: 18min 26s


In [78]:
print "Sequence Length \t Avg-ROC \t Max-ROC"
for i,r in enumerate(avg_roc_list):
    print "%d \t\t %f \t %f"%(parameter[i],r,max_roc_list[i])

Sequence Length 	 Avg-ROC 	 Max-ROC
250 		 0.732997 	 0.855334
300 		 0.742863 	 0.905357
350 		 0.739217 	 0.904535
400 		 0.731756 	 0.801180


<h3>Experiment 4: Varying Motif Length (Fixed Motives)</h3>

In [79]:
%%time

parameter = range(10,31,5)

avg_roc_list=[]
max_roc_list=[]
roc_scores_list=[]

for i in parameter:
    motives=['A'*i,
             'C'*i,
             'A'*i,
             'C'*i]
    
    avg_roc, max_roc, roc_scores = evaluate(motif_finder='eden', # ['meme','eden']
                                            scoring_criteria='hmm', # ['pwm','hmm']
                                            motives=motives,
                                            sequence_length=300,
                                            n_sequences=200,
                                            p=0.1)
    avg_roc_list.append(avg_roc)
    max_roc_list.append(max_roc)
    roc_scores_list.append(roc_scores)

CPU times: user 6min 12s, sys: 48.3 s, total: 7min
Wall time: 11min 24s


In [80]:
print "Motif Length \t Avg-ROC \t Max-ROC"
for i,r in enumerate(avg_roc_list):
    print "%d \t\t %f \t %f"%(parameter[i],r,max_roc_list[i])

Motif Length 	 Avg-ROC 	 Max-ROC
10 		 0.802907 	 0.986301
15 		 0.683670 	 0.983953
20 		 0.724465 	 0.983733
25 		 0.699172 	 0.982264
30 		 0.588268 	 0.865154


<h3>Experiment 5: Varying Motif Length (Fixed Motives) with MEME</h3>

In [7]:
%%time

parameter = range(5,11)

avg_roc_list=[]
max_roc_list=[]
roc_scores_list=[]

for i in parameter:
    motives=['A'*i,
             'C'*i,
             'A'*i,
             'C'*i]
    
    avg_roc, max_roc, roc_scores = evaluate(motif_finder='meme', # ['meme','eden']
                                            scoring_criteria='hmm', # ['pwm','hmm']
                                            motives=motives,
                                            sequence_length=100,
                                            n_sequences=100,
                                            p=0.10)
    avg_roc_list.append(avg_roc)
    max_roc_list.append(max_roc)
    roc_scores_list.append(roc_scores)

DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
DNA alphabet detected
CPU times: user 6.09 s, sys: 308 ms, total: 6.4 s
Wall time: 1min 35s


In [8]:
print "Motif Length \t Avg-ROC \t Max-ROC"
for i,r in enumerate(avg_roc_list):
    print "%d \t\t %f \t %f"%(parameter[i],r,max_roc_list[i])

Motif Length 	 Avg-ROC 	 Max-ROC
5 		 0.848880 	 0.997396
6 		 0.664851 	 0.861413
7 		 0.734648 	 0.954427
8 		 0.633859 	 0.923913
9 		 0.701966 	 0.924479
10 		 0.802568 	 1.000000
