# Greedy Algorithm for Motif Finding

This is an approximate greedy algorithm. Although the approximation ratio is unknown, it often performs well in practice, and is quite simple.

In [9]:
# Load the helper functions align, getprofile, and score.
include("utils.jl")

# Test sequences; "GGATA" occurs in each at the noted 1-based start index.
seqs = [
    "AGTGGATACC",  # starts at 4
    "AGGATATAGT",  # starts at 2
    "AGGAGGATAT",  # starts at 5
    "GCAGCGGATA",  # starts at 6
    "CATCAGGATA",  # starts at 6
]
s = fill(1, 5)  # five start positions, all 1 (presumably one per sequence)
l = 5;  # motif length

In [10]:
"""
    motifsearch_greedy(sequences, l)

Greedily search `sequences` (an indexable collection of strings) for a motif of
length `l`.

The first two sequences are searched exhaustively for the best-scoring pair of
start positions. Each remaining sequence is then added in order, keeping the
start position that gives the highest score together with the positions already
chosen. Ties keep the earliest position tried.

Return a `Vector{Int}` with one 1-based start index per sequence.

Scoring is delegated to `score(sequences, positions, l)` (from `utils.jl`), which
is called here with `positions` vectors shorter than `sequences` while the
alignment is still being built.
NOTE(review): this assumes `score` is deterministic and side-effect free, since
the score of the current best is cached rather than recomputed — confirm.

# Throws
- `ArgumentError` if fewer than two sequences are given, if `l < 1`, or if any
  sequence is shorter than `l`.
"""
function motifsearch_greedy(sequences, l)
    length(sequences) >= 2 ||
        throw(ArgumentError("need at least 2 sequences, got $(length(sequences))"))
    l >= 1 || throw(ArgumentError("motif length l must be positive, got $l"))
    for (i, seq) in enumerate(sequences)
        length(seq) >= l ||
            throw(ArgumentError("sequence $i has length $(length(seq)) < l = $l"))
    end

    # Exhaustive search over the first two sequences; [1, 1] is the default.
    bestmotif = [1, 1]
    bestscore = score(sequences, bestmotif, l)
    for s₁ in 1:(length(sequences[1]) - l + 1), s₂ in 1:(length(sequences[2]) - l + 1)
        candidate = score(sequences, [s₁, s₂], l)
        # Strict comparison: on ties the earliest pair found is kept.
        if candidate > bestscore
            bestmotif = [s₁, s₂]
            bestscore = candidate
        end
    end

    # Greedily extend the partial alignment by one sequence at a time.
    for i in 3:length(sequences)
        prefix = copy(bestmotif)  # positions already fixed for sequences 1:i-1
        push!(bestmotif, 1)       # default start position for sequence i
        bestscore = score(sequences, bestmotif, l)
        # Position 1 is the default just scored, so start comparing at 2.
        for sᵢ in 2:(length(sequences[i]) - l + 1)
            candidate = score(sequences, [prefix; sᵢ], l)
            if candidate > bestscore
                bestmotif[i] = sᵢ
                bestscore = candidate
            end
        end
    end
    return bestmotif
end

motifsearch_greedy (generic function with 1 method)

In [11]:
# Run the greedy search, then build the alignment and profile from its result.
motif = motifsearch_greedy(seqs, l)
alignment = align(seqs, motif, l)
profile = getprofile(alignment)

# Show each intermediate result; the cell's value is the score of the profile.
foreach(display, (motif, alignment, profile))
score(profile)

5-element Array{Int64,1}:
 4
 2
 5
 6
 6

5-element Array{Any,1}:
 "GGATA"
 "GGATA"
 "GGATA"
 "GGATA"
 "GGATA"

4×5 Array{Int32,2}:
 0  0  5  0  5
 0  0  0  5  0
 0  0  0  0  0
 5  5  0  0  0

25