# Greedy Algorithm for Motif Finding

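This is a randomized greedy algorithm: it starts from a random alignment and repeatedly replaces each start index with the profile-most-probable position until the score stops improving.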

In [1]:
include("utils.jl")
# utils.jl provides the align, getprofile, score, and probscore utility functions.

# Each trailing comment gives the 1-based index at which the motif "GGATA"
# starts in that sequence.
seqs = ["AGTGGATACC", # motif inserted at 4
        "AGGATATAGT", # 2
        "AGGAGGATAT", # 5
        "GCAGCGGATA", # 6
        "CATCAGGATA"] # 6
# presumably an initial alignment with one start index per sequence; it is not
# used by the cells shown here -- TODO confirm whether later cells need it
s = [1, 1, 1, 1, 1]
# motif length passed to motifsearch_random; the trailing `;` suppresses the
# cell's output
l = 5;

In [2]:
"""
    motifsearch_random(sequences, l)

Randomized greedy search for a motif of length `l` in `sequences`.

Start from a random alignment, then repeatedly sweep over the sequences,
replacing each start index with the position of the l-mer that `probscore`
rates highest under the current profile. Sweeping stops as soon as a full
sweep fails to strictly improve `score` of the profile.

Return a vector of start indices (one per sequence) for the best-scoring
alignment seen during the search.

The length of `sequences[1]` is used as the window bound for every sequence,
so all sequences are assumed to be at least that long.

# Throws
- `ArgumentError` if `sequences` is empty or `l` is not in
  `1:length(sequences[1])`.
"""
function motifsearch_random(sequences, l)
    isempty(sequences) && throw(ArgumentError("sequences must not be empty"))

    n = length(sequences[1])
    1 <= l <= n || throw(ArgumentError("motif length l must be in 1:$n, got $l"))

    # every valid start position of an l-mer inside a sequence of length n
    starts = 1:n-l+1

    # generate an initial random alignment
    s = rand(starts, length(sequences))

    # form profile P from s
    P = getprofile(align(sequences, s, l))

    # Keep a copy of the best alignment seen so far: the final sweep is by
    # construction not an improvement, so `s` itself may end up worse.
    best = copy(s)
    bestscore = score(P)
    while true
        for i in eachindex(s)
            # probability of every l-mer of the ith sequence under P
            a = [probscore(sequences[i][j:j+l-1], P) for j in starts]
            # move sᵢ to the P-most probable l-mer (first one on ties)
            s[i] = findmax(a)[2]
            # recalculate the profile so later sequences see the update
            P = getprofile(align(sequences, s, l))
        end

        # only the score after a complete sweep decides whether to continue
        newscore = score(P)
        newscore > bestscore || break
        bestscore = newscore
        best = copy(s)
    end

    return best
end

motifsearch_random (generic function with 1 method)

In [3]:
# Run the search, then rebuild the alignment and its profile from the
# start indices it returned.
motif = motifsearch_random(seqs, l)
alignment = align(seqs, motif, l)
profile = getprofile(alignment)

# Show the start indices, the aligned l-mers, and the profile, in that order.
foreach(display, (motif, alignment, profile))

# Last expression of the cell: the score of the final profile.
score(profile)

5-element Array{Int64,1}:
 4
 2
 5
 6
 6

5-element Array{Any,1}:
 "GGATA"
 "GGATA"
 "GGATA"
 "GGATA"
 "GGATA"

4×5 Array{Int32,2}:
 0  0  5  0  5
 0  0  0  5  0
 0  0  0  0  0
 5  5  0  0  0

25