In [24]:
def parse_genome(file_path):
    """Read the sequences stored in a FASTA-style text file.

    Every line is stripped of surrounding whitespace. A line that starts
    with '>' ends the record being assembled and is itself discarded; any
    other line is appended to the current record.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty sequence strings, in file order.
    """
    records = []
    chunks = []  # non-empty pieces of the record currently being assembled

    def flush():
        # Emit the pending record (if it has any content) and start afresh.
        if chunks:
            records.append("".join(chunks))
            chunks.clear()

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                flush()
            elif text:
                chunks.append(text)

    # The last record is not followed by a header, so emit it explicitly.
    flush()
    return records

def hamming_distance(s1, s2):
    """Count the positions at which s1 and s2 hold different items.

    The inputs are paired with zip, so the comparison stops at the end of
    the shorter one; any surplus tail of the longer input is ignored.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

def get_k_mers(K):
    """Return every string of length K over the alphabet A, C, G, T.

    Args:
        K: Length of the strings to generate. Values of zero or less
            produce the single empty string.

    Returns:
        A lexicographically sorted list of all 4**K strings.
    """
    from itertools import product

    # product() walks its input in order, so feeding it the alphabet in
    # sorted order yields the k-mers already sorted; no extra sort needed.
    # max(K, 0) keeps the [''] result for non-positive K instead of letting
    # product() raise on a negative repeat count.
    return ["".join(letters) for letters in product("ACGT", repeat=max(K, 0))]

def calculate_score(motifs, k):
    """Score a set of motifs against their column-wise consensus.

    For each of the first k positions, the most frequent of 'A', 'T', 'C',
    'G' in that column becomes the consensus letter (ties go to the letter
    that comes first in that order), and every motif whose letter differs
    from it adds one to the score.

    Args:
        motifs: Sequence of indexable motifs, each at least k long.
        k: Number of leading positions to score.

    Returns:
        A (score, consensus) tuple: the total mismatch count and the
        consensus string of length k.
    """
    total = 0
    consensus_letters = []
    for pos in range(k):
        column = [motif[pos] for motif in motifs]
        top_base, top_count = None, -1
        # Strict '>' keeps the earliest base on ties, in 'ATCG' order.
        for base in 'ATCG':
            occurrences = column.count(base)
            if occurrences > top_count:
                top_base, top_count = base, occurrences
        consensus_letters.append(top_base)
        total += len(column) - top_count
    return total, "".join(consensus_letters)

def find_best_motifs(dna, k):
    """Search all k-mers for the one whose closest windows score best.

    For every candidate k-mer from get_k_mers(k), the window of length k
    with the smallest Hamming distance to it is taken from each sequence
    (the leftmost window wins ties). The resulting motif set is scored with
    calculate_score, and the set with the lowest score is kept (the first
    candidate wins ties).

    Args:
        dna: Sequence of strings to search.
        k: Motif length.

    Returns:
        A (best_score, best_consensus, best_motifs) tuple for the
        lowest-scoring motif set found.

    Raises:
        ValueError: If any sequence is shorter than k, since it contains no
            window of length k to choose a motif from.
    """
    # Fail early with a clear message: a too-short sequence has no window,
    # which would otherwise surface later as an opaque TypeError on None.
    for index, sequence in enumerate(dna):
        if len(sequence) < k:
            raise ValueError(
                f"sequence {index} has length {len(sequence)}, "
                f"which is shorter than k={k}"
            )

    # The windows depend only on the sequence, so slice them once rather
    # than again for every candidate k-mer.
    windows_per_sequence = [
        [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
        for sequence in dna
    ]

    best_score = float('inf')
    best_consensus = None
    best_motifs = None

    for kmer in get_k_mers(k):
        motifs = [
            _closest_window(windows, kmer) for windows in windows_per_sequence
        ]
        score, consensus = calculate_score(motifs, k)
        if score < best_score:
            best_score = score
            best_consensus = consensus
            best_motifs = motifs

    return best_score, best_consensus, best_motifs


def _closest_window(windows, kmer):
    """Return the first window with the smallest Hamming distance to kmer."""
    closest = None
    smallest = float('inf')
    for window in windows:
        distance = hamming_distance(window, kmer)
        if distance < smallest:
            smallest = distance
            closest = window
    return closest

# Execution
# Driver for the functions above: read the sequences, run the exhaustive
# k-mer search, and print the winning score, consensus and motif set.
# Input file to read and the motif length to search for.
file_path = "/content/homework05_data.txt"
k = 6

# Parse genome sequences
# dna is a list of sequence strings, one per record in the input file.
dna = parse_genome(file_path)

# Find best motifs
# Tries every k-mer returned by get_k_mers(k) as a candidate pattern.
best_score, best_consensus, best_motifs = find_best_motifs(dna, k)

# Print results
print(f"Best score: {best_score}")
print(f"Consensus motif: {best_consensus}")
print(f"Best motifs: {best_motifs}")


Best score: 28
Consensus motif: AAAAAT
Best motifs: ['AAAAAT', 'AAAAAG', 'AAAACT', 'ATAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'ACAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AATAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAA', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAACT', 'CAAAAT', 'GAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'TAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'CAAAAT', 'AAAAAT', 'AAAAAT', 'AAGAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'ACAAAT', 'AAAAAT', 'CAAAAT', 'AAAAAC', 'GAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'ACAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AATAAT', 'AAAAAT', 'GAAAAT', 'AAAAAT', 'AAAAAT', 'GAAAAT', 'AAAAAA', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAGAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'CAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'CAAAAT', 'AAAAAT', 'CAAAAT', 'AACAAT', 'TAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT', 'AAAAAT