#### Problem BA2A: Motif Enumeration
http://rosalind.info/problems/ba2a/

In [40]:
# from ba1g.py import HammingDistance
# from ba1j.py import CreateKmers

def HammingDistance(string1, string2):
    """Return the Hamming distance between two equal-length strings.

    The Hamming distance is the number of positions at which the
    corresponding characters of the two strings differ.

    Args:
        string1: First string.
        string2: Second string; must have the same length as ``string1``.

    Returns:
        The number of mismatching positions, as an int.

    Raises:
        ValueError: If the two strings differ in length.
    """
    # Printing a warning and carrying on would either raise IndexError or
    # silently compare only a prefix, so reject mismatched lengths up front.
    if len(string1) != len(string2):
        raise ValueError("Strings must be the same length!")

    return sum(char1 != char2 for char1, char2 in zip(string1, string2))

def CreateKmers(kmer, d):
    """Return the d-neighborhood of ``kmer``.

    The d-neighborhood is every string over the alphabet A/C/G/T that has
    the same length as ``kmer`` and differs from it in at most ``d``
    positions.

    Args:
        kmer: The DNA string to mutate.
        d: Maximum number of mismatches allowed.

    Returns:
        A list of neighbor strings (including ``kmer`` itself).
    """
    # Base cases: no mismatches left to spend, or nothing left to mutate.
    # The empty-string check stops the recursion from looping forever on "".
    if d == 0 or not kmer:
        return [kmer]
    if len(kmer) == 1:
        return ['A', 'G', 'C', 'T']

    suffix = kmer[1:]
    new_kmers = []
    for neighbor in CreateKmers(suffix, d):
        mismatches = sum(a != b for a, b in zip(suffix, neighbor))
        if mismatches < d:
            # The suffix neighbor has a mismatch to spare, so the first
            # position may be any base.
            new_kmers += ['A' + neighbor, 'C' + neighbor, 'G' + neighbor, 'T' + neighbor]
        else:
            # All d mismatches are used up; the first base must be kept.
            new_kmers.append(kmer[0] + neighbor)
    return new_kmers

def MotifEnumeration(k, d, Dna):
    """Find every (k, d)-motif shared by all strings in ``Dna``.

    Candidates are the neighbors (as produced by ``CreateKmers``) of every
    k-length window of the first string. A candidate is kept only if each
    remaining string contains a k-length window within Hamming distance
    ``d`` of it.

    Args:
        k: Motif length.
        d: Maximum number of mismatches allowed.
        Dna: Sequence of DNA strings.

    Returns:
        A set of the surviving candidate k-mers.
    """
    head, rest = Dna[0], Dna[1:]

    # Seed the candidate pool from the first string only.
    candidates = set()
    for start in range(len(head) - k + 1):
        candidates.update(CreateKmers(head[start:start + k], d))

    # Narrow the pool one string at a time.
    for strand in rest:
        matched = {
            candidate
            for start in range(len(strand) - k + 1)
            for candidate in candidates
            if HammingDistance(strand[start:start + k], candidate) <= d
        }
        candidates = candidates & matched
    return candidates

In [41]:
# Run MotifEnumeration with k=3 and d=1 on four DNA strings; the set shown
# below is this cell's recorded output.
MotifEnumeration(3, 1, ["ATTTGGC", "TGCCTTA", "CGGTATC", "GAAAATT"])

{'ATA', 'ATT', 'GTT', 'TTT'}

#### Problem BA2B: Find a Median String
http://rosalind.info/problems/ba2b/

In [63]:
from itertools import product
import sys

# from ba1g.py import HammingDistance

def HammingDistance(string1, string2):
    """Return the Hamming distance between two equal-length strings.

    The Hamming distance is the number of positions at which the
    corresponding characters of the two strings differ.

    Args:
        string1: First string.
        string2: Second string; must have the same length as ``string1``.

    Returns:
        The number of mismatching positions, as an int.

    Raises:
        ValueError: If the two strings differ in length.
    """
    # Printing a warning and carrying on would either raise IndexError or
    # silently compare only a prefix, so reject mismatched lengths up front.
    if len(string1) != len(string2):
        raise ValueError("Strings must be the same length!")

    return sum(char1 != char2 for char1, char2 in zip(string1, string2))

def Distance(pattern, Dna):
    """Sum, over every string in ``Dna``, the best Hamming distance to ``pattern``.

    For each DNA string the pattern is compared against every window of
    ``len(pattern)`` characters and the smallest distance is kept. A string
    with no such window contributes ``sys.maxsize``.

    Args:
        pattern: The k-mer being scored.
        Dna: Iterable of DNA strings.

    Returns:
        The sum of the per-string minimum distances.
    """
    width = len(pattern)
    total = 0
    for strand in Dna:
        window_scores = (
            HammingDistance(pattern, strand[offset:offset + width])
            for offset in range(len(strand) - width + 1)
        )
        # sys.maxsize stands in for "no window available" on short strings.
        total += min(window_scores, default=sys.maxsize)
    return total

def FindMedianString(k, Dna):
    """Return a k-mer that minimizes ``Distance(pattern, Dna)``.

    Every string of length ``k`` over the alphabet A/C/G/T is scored with
    ``Distance``; the first pattern (in ``itertools.product`` order) with
    the smallest score is returned.

    Args:
        k: Length of the median string to search for.
        Dna: Collection of DNA strings passed through to ``Distance``.

    Returns:
        The lowest-scoring k-mer as a string.
    """
    candidates = ("".join(letters) for letters in product("ACGT", repeat=k))
    # min() returns the first minimal element, matching a strict "<" scan,
    # and always yields a result even when every score ties at a huge value
    # (a manual scan seeded with sys.maxsize would leave its answer unbound).
    return min(candidates, key=lambda pattern: Distance(pattern, Dna))

In [64]:
FindMedianString(3, ["AAATTGACGCAT", "GACGACCACGTT", "CGTCAGCGCCTG", "GCTGAGCACCGG", "AGTACGGGACAG"])

'ACG'

#### Problem BA2C: Find Profile-most Probable kmer in a String
http://rosalind.info/problems/ba2c/

In [72]:
def ProfileMostProbable(Dna, k, Profile):
    """Return the Profile-most probable k-mer in ``Dna``.

    The probability of a k-mer is the product, over its positions, of the
    profile entry for the base found at that position.

    Args:
        Dna: The DNA string to scan.
        k: Length of the k-mers to consider.
        Profile: Mapping from base ('A', 'C', 'G', 'T') to a sequence of
            per-position probabilities, indexed by position in the k-mer.

    Returns:
        The first k-mer with the highest probability. An empty string is
        returned only when ``Dna`` contains no window of length ``k``.
    """
    # Start below any achievable probability so that the first k-mer wins
    # when every k-mer has probability 0, instead of returning "".
    best_probability = -1.0
    best_kmer = ""

    for start in range(len(Dna) - k + 1):
        kmer = Dna[start:start + k]
        # Begin at 1 because the per-position probabilities are multiplied in.
        probability = 1
        for column, base in enumerate(kmer):
            # Read from the Profile argument, not a module-level global.
            probability *= Profile[base][column]

        # Strict ">" keeps the earliest k-mer on ties.
        if probability > best_probability:
            best_probability = probability
            best_kmer = kmer

    return best_kmer

In [73]:
# Sample input for the ProfileMostProbable call in the next cell.
DNA = "ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT"
k = 5
# One row of per-position probabilities for each base.
profile = dict(
    A=[0.2, 0.2, 0.3, 0.2, 0.3],
    C=[0.4, 0.3, 0.1, 0.5, 0.1],
    G=[0.3, 0.3, 0.5, 0.2, 0.4],
    T=[0.1, 0.2, 0.1, 0.1, 0.2],
)

In [74]:
# Run ProfileMostProbable on the sample DNA string, k and profile defined in
# the previous cell; the string shown below is this cell's recorded output.
ProfileMostProbable(DNA, k, profile)

'CCGAG'

#### Problem BA2D: Implement GreedyMotifSearch
http://rosalind.info/problems/ba2d/

In [None]:
def GreedyMotifSeartch(k, t, Dna):
    
    # Step 1: Initialize best_motifs with first kmer from each string
    # best_motifs is a list of kmers length k from each Dna string
    
    # Step 2: (Outer Loop) Iterate through first Dna string for each kmer length k
    
        # Step 3a: (Inner Loop) For each kmer from outer loop, create a profile matrix of kmers in subsequent strings
        # Step 3b: Find the Profile-most probable motif in the Dna string being iterated on
        # Step 3c. Add resulting motif to motif list
        # Continue this process for subsequent Dna_string lines
        
    # Step 4: Compare current motif with best_motif