In [1]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
        seqs.append(('>ID%d'%counter,total_seq))
        counter += 1
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [2]:
from sklearn.cluster import KMeans
from utilities import Weblogo, MuscleAlignWrapper
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme

In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
def run_tool(motif_finder,seqs, n_motives, min_motif_len, max_motif_len):
    if motif_finder=='meme':
        with open('seqs.fa','w') as f_train:
            for seq in seqs:
                f_train.write('>' + seq[0] + ' \n')
                f_train.write(seq[1] + '\n')

        tool =  Meme(alphabet='dna',
                     minw=min_motif_len,
                     maxw=max_motif_len,
                     nmotifs=n_motives)
        tool.fit('seqs.fa')
    else:
        km = KMeans(n_clusters=n_motives)
        tool = EdenWrapper(alphabet='dna', 
                           complexity=5, 
                           nbits=14, 
                           negative_ratio=3,
                           min_subarray_size=min_motif_len, 
                           max_subarray_size=max_motif_len,
                           clustering_algorithm=km)
        tool.fit(seqs)
    return tool

In [5]:
def score_seqs(seqs, n_motives, scoring_criteria, tool):
    scores = []
    for j in range(len(seqs)):
        seq_scr = []
        for k in range(n_motives):
            if scoring_criteria=='pwm':
                scr=tool.score_pwm(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            else: # scoring_criteria=='hmm'
                scr=tool.score_pwm(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            seq_scr.append(scr)

        # taking average over all motives for a sequence
        x = np.array(seq_scr[0])
        for k in range(1, n_motives):
            x = np.vstack((x, seq_scr[k]))
        seq_scr = list(np.mean(x, axis=0))
        scores.append(seq_scr)
    return scores

In [6]:
# size of context
# size of motif
# number of motifs
# number of sequences
# 

In [7]:
import numpy as np
from sklearn.metrics import roc_auc_score

def evaluate(motif_finder='meme', # ['meme','eden']
             scoring_criteria='pwm', # ['meme','hmm']
             motif_length=6,
             n_motives=4,
             sequence_length=30,
             n_sequences=50,
             p=0.2,
             random_state=8):

    motives, seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                         motif_length=motif_length,
                                                         n_motives=n_motives,
                                                         sequence_length=sequence_length,
                                                         n_sequences=n_sequences,
                                                         p=p,
                                                         random_state=random_state)

    tool = run_tool(motif_finder=motif_finder,
                    seqs=seqs,
                    n_motives=n_motives,
                    min_motif_len=max(2, len(motives[0])-2),
                    max_motif_len=len(motives[0])+2)
    
    scores = score_seqs(seqs=seqs,
                        n_motives=n_motives,
                        scoring_criteria=scoring_criteria,
                        tool=tool)
    roc_scores = []
    true_score = [float(int(x)) for x in binary_seq]
    for score in scores:
        roc_scores.append(roc_auc_score(true_score, score))
    avg_rocr = np.average(roc_scores)
    return avg_roc, roc_scores

In [8]:
%%time
avg_roc, roc_scores = evaluate(motif_finder='eden', # ['meme','eden']
                  scoring_criteria='hmm', # ['pwm','hmm']
                  motif_length=10,
                  n_motives=4,
                  sequence_length=100,
                  n_sequences=100)


DEBUG:eden.util:Positive data: Instances: 100 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Negative data: Instances: 300 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Elapsed time: 14.1 secs
INFO:eden.motif:model induction: 100 positive instances 14 s
INFO:eden.motif:motives extraction: 128 motives in 6s
INFO:eden.motif:motives clustering: 4 clusters in 0s
INFO:eden.motif:after filtering: 110 motives 4 clusters in 0s
INFO:eden.motif:motif model construction in 0s
INFO:eden.motif:updated motif counts in 0s


NameError: global name 'avg_roc' is not defined

In [None]:
print avg_roc
print roc_scores

In [19]:
%%time
roc_list=[]
roc_scores_list=[]

motif_len = range(10,21)

for i in motif_len:
    avg_roc, roc_scores = evaluate(motif_finder='eden', # ['meme','eden']
                          scoring_criteria='hmm', # ['pwm','hmm']
                          motif_length=i,
                          n_motives=4,
                          sequence_length=100,
                          n_sequences=100)
    roc_list.append(avg_roc)
    roc_scores_list.append(roc_scores)

DEBUG:eden.util:Positive data: Instances: 100 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Negative data: Instances: 300 ; Features: 16385 with an avg of 2434 features per instance
DEBUG:eden.util:Elapsed time: 14.4 secs
INFO:eden.motif:model induction: 100 positive instances 15 s
INFO:eden.motif:motives extraction: 114 motives in 4s
INFO:eden.motif:motives clustering: 4 clusters in 0s
INFO:eden.motif:after filtering: 102 motives 4 clusters in 0s
INFO:eden.motif:motif model construction in 0s
INFO:eden.motif:updated motif counts in 0s
DEBUG:eden.util:Positive data: Instances: 100 ; Features: 16385 with an avg of 2527 features per instance
DEBUG:eden.util:Negative data: Instances: 300 ; Features: 16385 with an avg of 2527 features per instance
DEBUG:eden.util:Elapsed time: 18.2 secs
INFO:eden.motif:model induction: 100 positive instances 19 s
INFO:eden.motif:motives extraction: 83 motives in 4s
INFO:eden.motif:motives clustering: 4 clusters in 1s
INFO:eden

CPU times: user 3min 26s, sys: 41.5 s, total: 4min 7s
Wall time: 5min 30s


In [20]:
for i,r in enumerate(roc_list):
    print "Motif Length: %d, ROC: %f"%(motif_len[i],r)

Motif Length: 10, ROC: 0.499375
Motif Length: 11, ROC: 0.499018
Motif Length: 12, ROC: 0.499688
Motif Length: 13, ROC: 0.498013
Motif Length: 14, ROC: 0.499750
Motif Length: 15, ROC: 0.500000
Motif Length: 16, ROC: 0.499844
Motif Length: 17, ROC: 0.499439
Motif Length: 18, ROC: 0.500000
Motif Length: 19, ROC: 0.500000
Motif Length: 20, ROC: 0.498812
