In [10]:
import math

In [51]:
def gen_count_mat(dna_list, pseudo = True):
    ln = len(dna_list[0])
    count_mat = list()
    if pseudo:
        for i in range(ln):
            count_mat.append({"A":1, "C":1, "G":1, "T":1})
    else:
        for i in range(ln):
            count_mat.append({"A":0, "C":0, "G":0, "T":0})
    
    for i in range(len(dna_list[0])):
        for dna in dna_list:
            char = dna[i]
            count_mat[i][char] += 1
    
    return count_mat

def score_kmer(kmer, prof_mat):
    score = 1
    for i in range(len(kmer)):
        score *= prof_mat[i][kmer[i]]
    return score


def gen_prof_mat(dna_list, pseudo = True):
    count_mat = gen_count_mat(dna_list, pseudo)
    prof_mat = list()
    for _, slc_count in enumerate(count_mat):
        tot_sum = sum(slc_count.values())
        slc_prof = dict()
        for key, value in slc_count.items():
            slc_prof[key] = value / tot_sum
        prof_mat.append(slc_prof)
    return prof_mat


def gen_all_gapped_kmers(dna, k1, k2, gmin, gmax):
    ln = len(dna)
    for g in range(gmin, gmax + 1):
        for i in range(ln - k1 - k2 - g + 1):
            yield dna[i : i + k1] + dna[i + g + 1: i + g + k2 + 1], g


def entropy(prof_mat):
    entrpy = 0
    for column in prof_mat:
        for nuc, val in column.items():
            if val != 0:
                entrpy += (val * math.log2(val))
    return -entrpy


def gen_prof_most_prob_gapped_kmer(dna, k1, k2, gmin, gmax, prof_mat):
    most_prob_kmer = dna[:k1 + k2]
    best_score = score_kmer(most_prob_kmer, prof_mat)
    best_g = gmin
    for kmer, g in gen_all_gapped_kmers(dna, k1, k2, gmin, gmax):
        curr_score = score_kmer(kmer, prof_mat)
        if curr_score > best_score:
            most_prob_kmer = kmer
            best_score = curr_score
            best_g = g
    return most_prob_kmer, best_g



# has a bug in storing the best g value, still need to track down
def greedy_gapped_motif_search(dna_list, k1, k2, t, gmin, gmax):
    
    best_motifs = [dna[:k1] + dna[gmin : k2 + gmin] for dna in dna_list]
    best_score = entropy(gen_prof_mat(best_motifs))
    # the g values are going awry, something is wrong
    best_gaps = [gmin for _ in range(len(dna_list))]
    
    for kmer, g in gen_all_gapped_kmers(dna_list[0], k1, k2, gmin, gmax):
        
        gapped_motifs = [kmer[:k1] + kmer[-k2:]]
        
        # could be in the assignment or initialization of the current gaps matrix
        gaps = [gmin for _ in range(len(dna_list))]
        gaps[0] = g
        
        for i in range(1, t):
            gapped_motif_prof = gen_prof_mat(gapped_motifs)
            
            # maybe it's not returning the correct values here?
            most_prob_kmer, most_prob_g = gen_prof_most_prob_gapped_kmer(dna_list[i], k1, k2,
                                                                         gmin, gmax,
                                                                         gapped_motif_prof)
            
            gapped_motifs.append(most_prob_kmer)
            # could be here
            gaps[i] = most_prob_g
            
        curr_score = entropy(gen_prof_mat(gapped_motifs))
        if curr_score < best_score:
            best_score = curr_score
            best_motifs = gapped_motifs
            # probably not here, though could be shallow vs deep copy???
            best_gaps = gaps[:]
    
    return best_motifs, best_gaps


In [52]:
# Expect AGAG with gap lengths of 2, 3, 3, 2 in that order
test_strs = ["ATCGAGCTAGATTTA", "AGTTCAGACACACAC",
             "CAGATAGACGAGTTT", "ATAGACAGATAGTTT"]

k1 = 2
k2 = 2
t = len(test_strs)
gmin = 2
gmax = 5

greedy_gapped_motif_search(test_strs, k1, k2, t, gmin, gmax)

(['AGAG', 'AGAG', 'AGAG', 'AGAG'], [3, 4, 3, 3])

## Heck yeah!
* We're getting the correct results!  That was super straightforward
* Still has a problem with returning the gap lengths, I'll get to that later
* Wait for Chien-Ju to make the test cases and get the length bug working