In [1]:
import numpy as np
import os
import time
from Bio import SeqIO #pip install biopython
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from copy import deepcopy


def count_acgt(candidates, ML):
    """Tally nucleotide occurrences per column across candidate sites.

    Parameters
    ----------
    candidates : sequence of str
        Candidate motif instances; each one is indexed at positions 0..ML-1,
        so every candidate must hold at least ``ML`` characters.
    ML : int
        Motif length, i.e. the number of columns to tally.

    Returns
    -------
    dict
        Maps 'A', 'C', 'G' and 'T' (in that key order) to a list of length
        ``ML`` whose j-th entry is the number of candidates carrying that base
        at position j. Characters other than A/C/G/T are not counted.
    """
    bases = ('A', 'C', 'G', 'T')
    count = {base: [] for base in bases}
    for col in range(ML):
        # Fresh per-column tally; only the four known bases are tracked.
        tally = dict.fromkeys(bases, 0)
        for site in candidates:
            letter = site[col]
            if letter in tally:
                tally[letter] += 1
        for base in bases:
            count[base].append(tally[base])
    return count

def PWM(count):
    """Convert per-position base counts into a position weight matrix.

    Parameters
    ----------
    count : dict
        Maps 'A', 'C', 'G', 'T' to equally long lists of per-position counts
        (the structure returned by ``count_acgt``).

    Returns
    -------
    numpy.ndarray
        Array of shape (motif length, 4); columns are A, C, G, T in that
        order and each count is divided by the number of sequences.
    """
    per_base = np.array([count[key] for key in count.keys()])
    # The counts of all bases at position 0 add up to the number of sequences.
    numOfSeqs = sum(per_base[:, 0])
    columns = [np.array(count[base]) / numOfSeqs for base in ('A', 'C', 'G', 'T')]
    return np.column_stack(columns)


def calcIC(PWM):
    """Return the information content of a position weight matrix.

    Sums ``p * log2(4 * p)`` over the first four columns of every row, i.e.
    the information in bits relative to a uniform 0.25 background.

    Parameters
    ----------
    PWM : numpy.ndarray
        2-D array indexed as ``PWM[row, col]`` with at least four columns.

    Returns
    -------
    number
        The accumulated information content; integer 0 if no entry is
        positive.
    """
    IC = 0
    # Walk the cells row by row, left to right.
    cells = (PWM[row, col] for row in range(len(PWM)) for col in range(4))
    for prob in cells:
        # Non-positive entries are skipped, which also keeps log2 away from 0.
        if prob > 0:
            IC += prob * np.log2(4 * prob)
    return IC


def seqSearch(seq, ML, count):
    """Find the window of ``seq`` that best extends an existing count profile.

    Every length-``ML`` window of ``seq`` is added in turn to a copy of
    ``count``; the resulting PWM is scored with ``calcIC`` and the
    highest-scoring window wins (the earliest one on ties).

    Parameters
    ----------
    seq : str
        Sequence to scan; every character must be a key of ``count``.
    ML : int
        Motif length.
    count : dict
        Per-position base counts as produced by ``count_acgt``. It is not
        modified: each window works on its own deep copy.

    Returns
    -------
    (number, int)
        Best information content and the start index of its window; (0, 0)
        when no window scores above 0.
    """
    bestIC, bestPos = 0, 0
    last_start = len(seq) - ML
    for start in range(last_start + 1):
        window = seq[start:start + ML]
        trial = deepcopy(count)
        for offset, base in enumerate(window):
            trial[base][offset] += 1
        score = calcIC(PWM(trial))
        if score > bestIC:
            bestIC, bestPos = score, start
    return bestIC, bestPos

In [2]:
def GreedyMotifSearch(seqs, ML, n_seeds=50):
    """Greedy motif search seeded from the best site pairs of two sequences.

    The sequences are visited in a random order (``np.random.shuffle``).
    Every pair of length-``ML`` windows taken from the first two sequences of
    that order is scored by the information content of its two-site PWM, and
    the ``n_seeds`` best pairs are kept as seeds. Each seed is then extended
    one sequence at a time with the window ``seqSearch`` selects for the
    profile built so far. The extension with the highest final information
    content is returned.

    Parameters
    ----------
    seqs : sequence of str
        Input sequences; at least two are required.
    ML : int
        Motif length.
    n_seeds : int, optional
        Number of top-scoring seed pairs to extend (default 50).

    Returns
    -------
    (numpy.ndarray, list of int)
        The best PWM and the chosen start position for every sequence, listed
        in the original order of ``seqs``.

    Raises
    ------
    ValueError
        If fewer than two sequences are given, ``n_seeds`` is below 1, or one
        of the two seed sequences is shorter than ``ML``.
    """
    if len(seqs) < 2:
        raise ValueError("GreedyMotifSearch needs at least two sequences")
    if n_seeds < 1:
        raise ValueError("n_seeds must be at least 1")

    order = np.arange(len(seqs))
    np.random.shuffle(order)
    first, second = seqs[order[0]], seqs[order[1]]

    # Score every window pair of the two seed sequences. The ranges follow the
    # lengths of the sequences actually being sliced, not that of seqs[0].
    init_ICs = []  # (IC, [i, j])
    for i in range(len(first) - ML + 1):
        str1 = first[i:i + ML]
        for j in range(len(second) - ML + 1):
            str2 = second[j:j + ML]
            IC = calcIC(PWM(count_acgt([str1, str2], ML)))
            init_ICs.append((IC, [i, j]))
    if not init_ICs:
        raise ValueError("seed sequences are shorter than the motif length")

    # Highest IC first; ties are broken by the smaller position pair.
    init_ICs.sort(key=lambda kv: (-kv[0], kv[1]))
    init_pos = [pos for _, pos in init_ICs[:n_seeds]]

    # Start below any reachable IC so the first extension is always recorded,
    # even when its information content is exactly 0.
    bestIC = -np.inf
    bestpwm = None
    temp_pos = None
    for i, j in init_pos:
        pos_list = [i, j]
        candidate = [first[i:i + ML], second[j:j + ML]]
        count = count_acgt(candidate, ML)
        for a in range(2, len(seqs)):
            seq = seqs[order[a]]
            _, pos = seqSearch(seq, ML, count)
            pos_list.append(pos)
            candidate.append(seq[pos:pos + ML])
            count = count_acgt(candidate, ML)
        pwm = PWM(count)
        IC = calcIC(pwm)
        if IC > bestIC:
            bestIC = IC
            bestpwm = pwm
            temp_pos = pos_list

    # temp_pos follows the shuffled order; argsort(order) maps each original
    # sequence index back to its slot in that order.
    bestPos = [temp_pos[k] for k in np.argsort(order)]
    return bestpwm, bestPos

def multipleGreedy(seqs, ML, N, index):
    """Run ``GreedyMotifSearch`` ``N`` times and save the best result.

    Each run is scored with ``calcIC``; the PWM and site positions of the
    highest-scoring run are passed to ``write`` for data set ``index``.

    Parameters
    ----------
    seqs : sequence of str
        Input sequences.
    ML : int
        Motif length.
    N : int
        Number of independent greedy runs.
    index : int
        Data set number forwarded to ``write``.

    Raises
    ------
    ValueError
        If no run produced a motif (for example ``N`` < 1), so nothing is
        written.
    """
    # Start below any reachable IC so a run scoring exactly 0 is still kept.
    bestIC = -np.inf
    bestpwm = None
    bestPos = None
    for _ in range(N):
        pwm, pos = GreedyMotifSearch(seqs, ML)
        IC = calcIC(pwm)
        if IC > bestIC:
            bestIC, bestpwm, bestPos = IC, pwm, pos
    if bestpwm is None:
        # Fail before write() opens (and truncates) the output files.
        raise ValueError("multipleGreedy found no motif; N must be at least 1")
    write(bestpwm, bestPos, index)

def write(pwm, pos, index):
    """Save a predicted motif and its sites into ``data_set_<index>``.

    Two files are written inside the existing directory
    ``data_set_<index>``:

    * ``predicted_motif.txt`` - a header line ``MOTIF<index> <motif length>``
      followed by one line per PWM row, every value followed by a comma.
    * ``predicted_sites.txt`` - all positions on one line, each followed by
      a comma.

    Parameters
    ----------
    pwm : sequence of rows (e.g. numpy.ndarray)
        Position weight matrix, one row per motif position.
    pos : iterable
        Predicted site start positions.
    index : int
        Data set number used in the directory name and the header.
    """
    location = 'data_set_' + str(index)
    # The motif length is the number of PWM rows; taking it from the argument
    # avoids relying on a module-level ``ML`` being defined.
    motif_length = len(pwm)

    with open(os.path.join(location, 'predicted_motif' + ".txt"), 'w') as f:
        f.write('MOTIF' + str(index) + ' ' + str(motif_length))
        f.write('\n')
        for row in pwm:
            f.write(''.join(str(value) + ',' for value in row))
            f.write('\n')

    with open(os.path.join(location, 'predicted_sites' + ".txt"), 'w') as f:
        f.write(''.join(str(site) + ',' for site in pos))

In [3]:
def evaluation(index, start, end):
    """Score the prediction stored in ``data_set_<index>`` against the truth.

    Reads ``motiflength.txt``, ``motif.txt``, ``predicted_motif.txt``,
    ``sites.txt`` and ``predicted_sites.txt`` from the data set directory.
    Matrix files have a header line followed by comma-terminated rows; site
    files hold one line of comma-terminated integers.

    Parameters
    ----------
    index : int
        Data set number.
    start, end : float
        Timestamps taken before and after the search.

    Returns
    -------
    tuple
        ``(kl, overlap_positions, overlap_sites, elapsed)`` where

        1. ``kl`` is the sum over motif rows of the KL divergence between the
           true row and the (smoothed) predicted row,
        2. ``overlap_positions`` is the total number of positions shared by
           the true and predicted length-ML sites,
        3. ``overlap_sites`` is the number of sites whose predicted start is
           less than half the motif length away from the true start,
        4. ``elapsed`` is ``end - start``.
    """
    location = 'data_set_' + str(index) + '/'

    def read_matrix(name):
        # Skip the header line; drop the empty field after each trailing comma.
        with open(os.path.join(location, name), 'r') as f:
            rows = f.read().strip().split('\n')[1:]
        return [[float(v) for v in row.strip().split(',')[:-1]] for row in rows]

    def read_positions(name):
        with open(os.path.join(location, name), 'r') as f:
            return [int(v) for v in f.read().strip().split(',')[:-1]]

    with open(os.path.join(location, 'motiflength.txt'), 'r') as f:
        ML = int(f.read())
    motif = read_matrix('motif.txt')
    predicted_motif = read_matrix('predicted_motif.txt')
    sites = read_positions('sites.txt')
    predicted_sites = read_positions('predicted_sites.txt')

    #1) KL divergence between the true and the predicted motif
    def KL(a, b):
        epsilon = 1e-3
        # Builtin float replaces the ``np.float`` alias removed from NumPy.
        a = np.asarray(a, dtype=float)
        b = np.array(b, dtype=float)
        is_zero = b == 0
        count_zero = np.count_nonzero(is_zero)
        count_nonzero = np.count_nonzero(b > 0)
        # Zeros are raised to epsilon and the added mass is taken evenly from
        # the positive entries; with no positive entry there is nothing to
        # take it from, so skip the correction instead of dividing by zero.
        ratio = count_zero / count_nonzero if count_nonzero else 0.0
        b = np.where(is_zero, epsilon, b - ratio * epsilon)
        # Only entries with a != 0 contribute; masking first keeps log away
        # from zero and avoids runtime warnings.
        keep = a != 0
        return np.sum(a[keep] * np.log(a[keep] / b[keep]))

    kl = sum([KL(motif[i], predicted_motif[i]) for i in range(len(motif))])

    #2) number of overlapping positions between true and predicted sites
    overlap_positions = 0
    for true_start, predicted_start in zip(sites, [predicted_sites[i] for i in range(len(sites))]):
        true_span = set(range(true_start, true_start + ML))
        predicted_span = set(range(predicted_start, predicted_start + ML))
        overlap_positions += len(true_span & predicted_span)

    #3) number of sites predicted within half a motif length of the truth
    overlap_sites = sum([1 for i in range(len(sites))
                         if abs(sites[i] - predicted_sites[i]) < len(motif)/2])

    #4) running time
    elapsed = end - start

    return kl, overlap_positions, overlap_sites, elapsed

In [4]:
# Run the greedy motif search on every data set and score each prediction.
results = []
for i in range(70):
    location = 'data_set_' + str(i) + '/'
    seqs = [str(record.seq)
            for record in SeqIO.parse(os.path.join(location, 'sequences' + ".fasta"), "fasta")]
    # ML stays a module-level name: functions defined in earlier cells may
    # read it as a global. The with-block closes the file, so no explicit
    # close() is needed afterwards.
    with open(os.path.join(location, 'motiflength.txt'), 'r') as f:
        ML = int(f.read())
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during the (minutes-long) search.
    start = time.perf_counter()
    multipleGreedy(seqs, ML, 3, i)
    end = time.perf_counter()
    results.append(evaluation(i, start, end))
    # Report the entry that was just appended.
    kl, overlap_pos, overlap_site, elapsed = results[-1]
    print("index: %s, kl: %s, overlap pos: %s, overlap site: %s, time: %s sec"
          % (str(i), str(np.round(kl, 3)), str(np.round(overlap_pos, 3)),
             str(np.round(overlap_site, 3)), str(np.round(elapsed, 3))))

index: 0, kl: 27.767, overlap pos: 3, overlap site: 0, time: 255.592 sec
index: 1, kl: 27.147, overlap pos: 1, overlap site: 0, time: 194.208 sec
index: 2, kl: 5.36, overlap pos: 63, overlap site: 9, time: 206.495 sec
index: 3, kl: 29.135, overlap pos: 0, overlap site: 0, time: 233.172 sec
index: 4, kl: 25.716, overlap pos: 0, overlap site: 0, time: 182.539 sec
index: 5, kl: 28.826, overlap pos: 4, overlap site: 1, time: 186.921 sec
index: 6, kl: 21.418, overlap pos: 12, overlap site: 2, time: 162.497 sec
index: 7, kl: 23.388, overlap pos: 0, overlap site: 0, time: 194.009 sec
index: 8, kl: 9.021, overlap pos: 35, overlap site: 5, time: 210.353 sec
index: 9, kl: 26.494, overlap pos: 0, overlap site: 0, time: 178.893 sec
index: 10, kl: 43.615, overlap pos: 0, overlap site: 0, time: 154.5 sec
index: 11, kl: 1.836, overlap pos: 49, overlap site: 7, time: 166.989 sec
index: 12, kl: 25.959, overlap pos: 0, overlap site: 0, time: 208.504 sec
index: 13, kl: 2.341, overlap pos: 63, overlap sit

In [5]:
# Re-print the stored metrics of every data set, rounded to three decimals.
for i, r in enumerate(results):
    print("index: %s, kl: %s, overlap pos: %s, overlap site: %s, time: %s sec"
          % ((str(i),) + tuple(str(np.round(r[k], 3)) for k in range(4))))

index: 0, kl: 27.767, overlap pos: 3, overlap site: 0, time: 255.592 sec
index: 1, kl: 27.147, overlap pos: 1, overlap site: 0, time: 194.208 sec
index: 2, kl: 5.36, overlap pos: 63, overlap site: 9, time: 206.495 sec
index: 3, kl: 29.135, overlap pos: 0, overlap site: 0, time: 233.172 sec
index: 4, kl: 25.716, overlap pos: 0, overlap site: 0, time: 182.539 sec
index: 5, kl: 28.826, overlap pos: 4, overlap site: 1, time: 186.921 sec
index: 6, kl: 21.418, overlap pos: 12, overlap site: 2, time: 162.497 sec
index: 7, kl: 23.388, overlap pos: 0, overlap site: 0, time: 194.009 sec
index: 8, kl: 9.021, overlap pos: 35, overlap site: 5, time: 210.353 sec
index: 9, kl: 26.494, overlap pos: 0, overlap site: 0, time: 178.893 sec
index: 10, kl: 43.615, overlap pos: 0, overlap site: 0, time: 154.5 sec
index: 11, kl: 1.836, overlap pos: 49, overlap site: 7, time: 166.989 sec
index: 12, kl: 25.959, overlap pos: 0, overlap site: 0, time: 208.504 sec
index: 13, kl: 2.341, overlap pos: 63, overlap sit