# Greedy Algorithm for Motif Finding

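This is a greedy approximation algorithm: it first finds the best alignment of the first two sequences, then adds the remaining sequences one at a time, choosing the best start position for each. Although its approximation ratio is unknown, it often performs well in practice and is quite simple.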

In [15]:
# DNA maps nucleotide letters (as one-character strings) to the row of the
# profile matrix in which that nucleotide is counted.
DNA = Dict("A" => 1, "T" => 2, "C" => 3, "G" => 4)

"""
    score(sequences, s, l)

Return the integer score of the alignment of the length-`l` substrings of
`sequences` that start at the positions given in `s`.

`s[i]` is the 1-based start position inside `sequences[i]`. The score is the
sum, over the `l` alignment columns, of the count of the most frequent
nucleotide in that column.

If `length(s) < length(sequences)`, only the first `length(s)` sequences are
used, i.e. the result is the score of a partial alignment.

# Arguments
- `sequences`: indexable collection of strings over the alphabet `A`, `T`, `C`, `G`.
- `s`: start positions; any iterable of integers (vector or tuple).
- `l`: motif length (number of alignment columns).

# Throws
- `KeyError` if a character that is not a key of `DNA` is encountered.
- `BoundsError` if a window runs past the end of its sequence, or if `s` has
  more entries than `sequences`.
"""
function score(sequences, s, l)
    # profile[r, c] counts how often nucleotide r occurs in alignment column c
    profile = zeros(Int, 4, l)
    for (i, start) in enumerate(s)
        seq = sequences[i]
        for offset in 0:l-1
            # DNA is keyed by one-character strings, so convert the Char
            nucl = string(seq[start + offset])
            profile[DNA[nucl], offset + 1] += 1
        end
    end
    # take the highest count in every column and add them up
    return sum(maximum(profile, dims=1))
end

score (generic function with 1 method)

In [18]:
# Sanity check: align all four sequences from position 1 over their full
# length (3). The per-column majority counts are 3, 2 and 2, giving 7.
seqs = ["AGT", "ATA", "AGG", "GCA"]
score(seqs, (1, 1, 1, 1), 3)

7

In [None]:
"""
    motifsearch_greedy(sequences, l)

Greedily search `sequences` (a vector of strings) for a motif of length `l`.

The first two sequences are aligned exhaustively: every pair of start
positions is scored and the best pair is kept. Each remaining sequence is then
added in order, and its start position is chosen to maximise the score of the
partial alignment built so far. Earlier choices are never revisited. Ties keep
the earliest (smallest) start position.

# Arguments
- `sequences`: vector of strings, as accepted by `score`.
- `l`: motif length; must be non-negative and no longer than any sequence.

# Returns
A `Vector{Int}` with one 1-based start position per sequence. For fewer than
two sequences there is nothing to compare, so every position is `1`.

# Throws
- `ArgumentError` if `l` is negative or exceeds the length of a sequence.
"""
function motifsearch_greedy(sequences, l)
    l >= 0 || throw(ArgumentError("motif length must be non-negative, got $l"))
    for (k, seq) in enumerate(sequences)
        length(seq) >= l || throw(ArgumentError(
            "sequence $k has length $(length(seq)), shorter than motif length $l"))
    end

    n = length(sequences)
    n < 2 && return ones(Int, n)

    # Exhaustive search over the first two sequences. The incumbent score is
    # cached instead of being recomputed on every comparison.
    bestmotif = [1, 1]
    bestscore = score(sequences, bestmotif, l)
    for s₁ in 1:length(sequences[1])-l+1, s₂ in 1:length(sequences[2])-l+1
        trialscore = score(sequences, [s₁, s₂], l)
        if trialscore > bestscore
            bestmotif = [s₁, s₂]
            bestscore = trialscore
        end
    end

    # Greedily extend the alignment by one sequence at a time.
    for i in 3:n
        # default start position for the new sequence
        push!(bestmotif, 1)
        bestscore = score(sequences, bestmotif, l)

        # Work on a copy: mutating the incumbent while probing candidates
        # would corrupt the alignment that is being compared against.
        trial = copy(bestmotif)
        # the last valid window starts at length - l + 1
        for sᵢ in 2:length(sequences[i])-l+1
            trial[i] = sᵢ
            trialscore = score(sequences, trial, l)
            if trialscore > bestscore
                bestmotif[i] = sᵢ
                bestscore = trialscore
            end
        end
    end
    return bestmotif
end