In [78]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    binary_seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
            binary_seq = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
            total_binary_seq += binary_seq
        seqs.append(('>ID%d'%counter,total_seq))
        binary_seqs.append(('>ID%d'%counter,total_binary_seq))
        counter += 1
    return motives, seqs, binary_seqs

In [79]:
from sklearn.cluster import KMeans
from utilities import Weblogo, MuscleAlignWrapper
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme

In [83]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [84]:
def run_tool(motif_finder, n_motives, min_motif_len, max_motif_len):
    if motif_finder=='meme':
        with open('seqs.fa','w') as f_train:
            for seq in seqs:
                f_train.write('>' + seq[0] + ' \n')
                f_train.write(seq[1] + '\n')

        tool =  Meme(alphabet='dna',
                     minw=min_motif_len,
                     maxw=max_motif_len,
                     nmotifs=n_motives)
        tool.fit('seqs.fa')
    else:
        km = KMeans(n_clusters=n_motives)
        tool = EdenWrapper(alphabet='dna', 
                           complexity=5, 
                           nbits=14, 
                           negative_ratio=3,
                           min_subarray_size=min_motif_len, 
                           max_subarray_size=max_motif_len,
                           clustering_algorithm=km)
        tool.fit(seqs)
    return tool

In [85]:
# size of context
# size of motif
# number of motifs
# number of sequences
# 

In [87]:
import numpy as np

def evaluate(motif_finder='meme', # or 'eden'
             scoring_criteria='pwm', # or 'hmm'
             motif_length=6,
             n_motives=4,
             sequence_length=30,
             n_sequences=50,
             p=0.2,
             random_state=8):

    motives, seqs, binary_seqs = make_artificial_dataset(alphabet='ACGT',
                                                         motif_length=motif_length,
                                                         n_motives=n_motives,
                                                         sequence_length=sequence_length,
                                                         n_sequences=n_sequences,
                                                         p=p,
                                                         random_state=random_state)

    tool = run_tool(motif_finder=motif_finder,
                    n_motives=n_motives,
                    min_motif_len=max(1, len(motives[0])-2),
                    max_motif_len=len(motives[0])+2)
    
    scores = []
    for j in range(len(seqs)):
        seq_scr = []
        for k in range(n_motives):
            if scoring_criteria=='pwm':
                scr=tool.score_pwm(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            else: # scoring_criteria=='hmm'
                scr=tool.score_pwm(motif_num=k+1, seq=seqs[j][1], zero_padding=True)
            seq_scr.append(scr)

        # taking average over all motives for a sequence
        x = np.array(seq_scr[0])
        for k in range(1, n_motives):
            x = np.vstack((x, seq_scr[k]))
        seq_scr = list(np.mean(x, axis=0))
        scores.append(seq_scr)
    
    return scores

In [None]:
%%time
scores = evaluate(motif_finder='eden', # or 'eden'
                  scoring_criteria='hmm', # or 'hmm'
                  motif_length=3,
                  n_motives=2,
                  sequence_length=10,
                  n_sequences=20)


DEBUG:eden.util:Positive data: Instances: 50 ; Features: 16385 with an avg of 1522 features per instance
DEBUG:eden.util:Negative data: Instances: 150 ; Features: 16385 with an avg of 1764 features per instance
DEBUG:eden.util:Elapsed time: 3.6 secs
INFO:eden.motif:model induction: 50 positive instances 4 s


In [None]:
for i in scores:
    print i
    print

In [64]:
a = np.array([1, 2, 3])
b = np.array([2, 3, 4])
c=np.vstack((a,[8,9,0]))
c

array([[1, 2, 3],
       [8, 9, 0]])

In [65]:
list(np.mean(c, axis=0))

[4.5, 5.5, 1.5]

In [17]:
c=np.vstack((c,[7,8,9]))

In [18]:
c

array([[1, 2, 3],
       [2, 3, 4],
       [1, 2, 3],
       [2, 3, 4],
       [7, 8, 9]])

In [77]:
max(1, len([1,2]))

2