## Randomized Algorithms
While the previous definition of a profile matrix was that constructred by a collection of k-Motifs in DNA. For RAs, we need to define the k-Motifs as a function of an arbitrary profile matrix and DNA strings. 

We find the k-Motifs by using the profile matrix for each possible k-mer and selectnig the one that maximizes the score/most conserved motif. 

This can also repeat for the next motif selection: *Motifs(Profile(Motifs), DNA)* After this, we should compute the profile of these motifs hoping that this improved motif could better help select a motif. *Profile(Motifs(Profile(Motifs), DNA))*

This process repeats -> this is what RandomizedMotifSearch does. The randomized element comes from selecting the initial collection of k-mers that form the profile matrix. 


In [1241]:
import random
import numpy as np

def newneighbours(text, k):
    return [text[i:i+k] for i in range(len(text) - (k-1))]

def entropy_calc(counts):
    base = np.nan_to_num(counts*np.log2((counts)))
    sums = [-1 * sum(base[i]) for i in range(len(base))]
    return sums
    
def text_counts(text, k, laplace=True): #after rewriting the count and profile method, I noticed the main flaw in the profile_dict calculations
    if laplace == True:
        init = 1
    else:
        init = 0

    count_dict = {key: [init] * k for key in ['A', 'C', 'G', 'T']}

    for row in text:
        row = str(row)
        for idx, nuc in enumerate(row):
            count_dict[nuc][idx] += 1

    profile_dict = {key: [i/(k-1+(init*len(text))) for i in value] for key, value in count_dict.items()} #here, we need to divide by the len(text) or t + k as it matches the laplace-init
    di = list(count_dict.values())
    
    return count_dict, profile_dict, entropy_calc(np.asanyarray(di).transpose())

def RandomInit(text, k, t):
    all_patterns = [newneighbours(line, k) for line in text]
    rand_idx = [random.randint(0, len(all_patterns[0])-1) for _ in range(t)]
    
    init_patterns = []
    for idx, line in enumerate(all_patterns):
       init_patterns.append(line[rand_idx[idx]])
    
    return init_patterns

def ScorePattern(profile, pattern):
    prod = 1
    for idx, nuc in enumerate(pattern):
        if nuc == '[' or nuc == ']':
            pass
        else:
            prod *= profile[nuc][idx]
    return [pattern, prod]
    

def ProfileProbable(profile, text, k):
    most_probables = []
    for line in text:
        line_neighbours = newneighbours(line, k)
        all_prods = dict(list(map(ScorePattern, [profile for _ in range(len(line_neighbours))], line_neighbours)))
        vals = list(all_prods.values())
        most_probables.append([[key for key in all_prods.keys() if all_prods[key] == max(vals)], max(vals)])
    
    return most_probables


def RandomizedMotifSearch(DNA, k, t):
    rand_init = RandomInit(DNA, k, t)
    best_motifs = rand_init


    while True:
        _, profile, _ = text_counts(best_motifs, k, laplace=True)
        most_prob = ProfileProbable(profile, DNA, k)
        init_prob = [ScorePattern(profile, pattern) for pattern in best_motifs]
        
        if sum([i[1] for i in most_prob]) > sum([i[1] for i in init_prob]):
            best_motifs = [i[0][0] for i in most_prob]
        else:
            return [best_motifs, sum([i[1] for i in most_prob])]
            break



In [1242]:
betters = []
DNA = ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA']
k = 8
t = len(DNA)

for _ in range(1000):
    betters.append(RandomizedMotifSearch(DNA, k, t))

scores = [i[1] for i in betters]
main_idx = scores.index(max(scores))
print(betters[main_idx])

[['AACGGCCA', 'AAGTGCCA', 'TAGTACCG', 'AAGTTTCA', 'ACGTGCAA'], 0.0011628401968068894]


In [1244]:
with open('dataset_161_5 (8).txt', 'r') as sqinput:
    sqinput = sqinput.read().splitlines()
    params = [int(i) for i in sqinput[0].split(' ')]


betters = []
DNA = sqinput[1:]
k = params[0]
t = params[1]


for i in range(1000):
    betters.append(RandomizedMotifSearch(DNA, k, t))

scores = [i[1] for i in betters]
main_idx = scores.index(max(scores))
print(betters[main_idx])

In [1031]:
better = betters[main_idx][0]
for i in better:
    print(i)

CGTTTTTACTATCAT
CGTCGACGTCAATAT
CGTTCCCGTCATATT
GCTTCCCGTCAATAG
CGTTCCGAACAATAT
CGTTCCCGCAGATAT
CGTTCGGATCAATAT
CAACCCCGTCAATAT
GGTTCCCGTCAATCA
CGTTCCCGTGTTTAT
CGTTCCCCATAATAT
CGTTCCGTCCAATAT
CGTTTGGGTCAATAT
CGTTCGTCTCAATAT
CGTGTTCGTCAATAT
TTGTCCCGTCAATAT
CGCCGCCGTCAATAT
CGTTGTGGTCAATAT
CGTTCCCGTCAACCC
CGTTCCCGTCGTAAT


In [1245]:


# rand_init = RandomInit(DNA, k, t)
# best_motifs = rand_init
# _, profile, _ = text_counts(best_motifs, k, laplace=True)
# most_prob = ProfileProbable(profile, DNA, k)
# init_prob = [ScorePattern(profile, pattern) for pattern in best_motifs]

# # print(sum([i[1] for i in most_prob]), sum([i[1] for i in init_prob]))

# if sum([i[1] for i in most_prob]) > sum([i[1] for i in init_prob]):
#     best_motifs = [i[0][0] for i in most_prob]
# else:
#     print(best_motifs, sum([i[1] for i in most_prob]))

# # print(profile)
# print(best_motifs)

# _, profile, _ = text_counts(best_motifs, k, laplace=True)
# # print(profile)
# most_prob = ProfileProbable(profile, DNA, k)
# init_prob = [ScorePattern(profile, pattern) for pattern in best_motifs]

# # print(sum([i[1] for i in most_prob]), sum([i[1] for i in init_prob]))

# print(most_prob)

In [1358]:
import random
randomer = lambda t: [random.random() for _ in range(t)]
gibbs = lambda x: np.true_divide(np.array(randomer(x)), sum(np.array(randomer(x))))

gibbs(2)

def WeightedProbabilities(all_probs):
    summed = sum(list(all_probs.values()))
    
    for idx, (key, val) in enumerate(all_probs.items()):
        all_probs[key] = float(val/summed)
    
    return all_probs

def NewScore(motifs):
    score = 0
    consensus = ConcensusString(motifs)
    for i in range(len(motifs)):
        score += NewHamming(consensus,motifs[i])
    return score

def ConcensusString(motifs):
    k = len(motifs[0])
    count = NewCount(motifs)
    consensus = ""
    for j in range(k):
        M = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > M:
                M = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus

def NewHamming(p, q):
    count = 0
    L = len(p)
    for i in range(L):
        if p[i] != q[i]:
            count += 1
    return count

def NewCount(motifs):
    count = {}
    k = len(motifs[0])
    for symbol in "ACGT":
        count[symbol] = [] #Genero una lista para cada nucleotido en el set count
        for j in range(k):
            count[symbol].append(1) #a cada uno le pongo un 0
    t = len(motifs)
    for i in range(t):
        for j in range(k):
            symbol = motifs[i][j] #para el simbolo de esa posicion del motivo
            count[symbol][j] += 1 #sumarle un 1 al set count en ese lugar
    return count

def BetterProbable(profile, line, k):
    most_probables = []

    line_neighbours = newneighbours(line, k)
    all_prods = dict(list(map(ScorePattern, [profile for _ in range(len(line_neighbours))], line_neighbours)))
    all_probs = WeightedProbabilities(all_prods)
    vals = list(all_probs.values())

    most_probables.append([[key for key in all_probs.keys() if all_probs[key] == max(vals)], max(vals)])

    
    return most_probables, all_probs

def GibbsSampler(DNA, k, t, N):
    rand_init = RandomInit(DNA, k, t)
    best_motifs = rand_init
    
    for _ in range(N):
        remove_line_idx = random.randint(0, t-1)

        counts, profile_removed, _ = text_counts(best_motifs[:remove_line_idx] + best_motifs[remove_line_idx+1:], k, laplace=True)
        most_probable, all_prods = BetterProbable(profile_removed, DNA[remove_line_idx], k)
        final_appender = best_motifs[:]
        # print(most_probable)
        final_appender[remove_line_idx] = most_probable[0][0][0]
        
        final_score = NewScore(final_appender)
        current_best_score = NewScore(best_motifs)



        if final_score < current_best_score:
            best_motifs = final_appender
        else:
            return best_motifs



In [1363]:
with open('dataset_163_4 (13).txt', 'r') as sqinput:
    sqinput = sqinput.read().splitlines()
    params = [int(i) for i in sqinput[0].split(' ')]

DNA = sqinput[1:]
k = params[0]
t = params[1]
N = params[2]

N = 500

# DNA = ['TTACCTTAAC', 'GATGTCTGTC', 'CCGGCGTTAG', 'CACTAACGAG', 'CGTCAGAGGT']
# k = 4
# t = len(DNA)
# N = 100

# GibbsSampler(DNA, k, t, N)


# DNA = ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC','AATCCACCAGCTCCACGTGCAATGTTGGCCTA']
# k = 8
# t = len(DNA)
# N = 1000

# GibbsSampler(DNA, k, t, N)



In [1371]:

bestMotifs = []
bestScore = 300


for start in range(30):
    bMotifs = GibbsSampler(DNA,k,t,N)
    if NewScore(bMotifs) < bestScore:
        bestScore = NewScore(bMotifs)
        bestMotifs = bMotifs

for i in bestMotifs:
    print(i)
print(bestScore)

AGCGCTGTGAGTGAG
AGATCAGTGTGCAAG
AATGCTACAGGTTTC
TGCCCGGTCGCTAGG
TAGGCTACCATTATG
AAGACTGTGAATTTG
AGATCTGCATAAAGG
AACCCGAAATCTATG
GGGCCTGTGATTAAG
AGTAGGTTAACCCAG
AAACCTGCATGAATG
TTGGATGAAAGTATG
ACCGCGGCGAAAATC
CACGCTATGTGAAAC
AAATCGGCCACACAT
AACCATGTTTGTTTC
CCGCCGGTATCTCAC
AAATCAGAGAATTAG
TTGGCGAACTTTTCG
ATCGCGGAAAGTCCG
142


In [1105]:
0.0005119336433572914

0.0005119336433572914