In [9]:
import sys
sys.path.append("..")
from rosalind_tools.config import *
from rosalind_tools.utils import parse_fasta, Record
from typing import Type, List

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.

Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

In [22]:
def consensus(records: "List[Record]") -> str:
    """Print and return the consensus string and profile matrix of DNA records.

    Every character of each record's ``seq`` is upper-cased, and the
    occurrences of A, C, G and T are counted per column.  The consensus takes
    the most frequent base of every column; ties are resolved in the order
    A, C, G, T.  Characters other than A/C/G/T are not counted, so a column
    containing none of them yields ``'A'`` with an all-zero profile column.

    The consensus string is printed first, followed by one line per base in
    the form ``A: n n n ...`` (then C, G, T).

    Args:
        records: Non-empty list of objects exposing an iterable ``seq`` of
            single-character strings, all of the same length.

    Returns:
        The consensus string (the same string that is printed first).

    Raises:
        ValueError: If ``records`` is empty or the sequences differ in length.
    """
    alphabet = 'ACGT'
    rows = [[base.upper() for base in record.seq] for record in records]
    if not rows:
        raise ValueError("consensus() needs at least one record")
    if len({len(row) for row in rows}) != 1:
        raise ValueError("all sequences must have the same length")

    # dtype=str keeps the array two-dimensional and comparable to a base
    # letter even when every sequence is empty.
    seq = np.array(rows, dtype=str)

    # One row of per-column counts for each base, in A, C, G, T order.
    profile = np.array([(seq == base).sum(axis=0) for base in alphabet])

    # argmax reports the first maximal row, which reproduces the
    # A-before-C-before-G-before-T tie-break.
    consensus_string = ''.join(alphabet[i] for i in profile.argmax(axis=0))

    print(consensus_string)
    for base, counts in zip(alphabet, profile):
        print('%s: %s' % (base, ' '.join(map(str, counts))))
    return consensus_string
    

In [23]:
# Run the consensus/profile computation on the sample dataset.
with open(data_dir / "test_fasta2.txt", mode='r') as fasta_file:
    # Keep the call inside the `with` so the file is still open while the
    # parsed records are consumed.
    records = parse_fasta(fasta_file)
    consensus(records)

ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6


In [25]:
# Run the consensus/profile computation on the downloaded Rosalind dataset.
with open(data_dir / "rosalind_cons.txt", mode='r') as fasta_file:
    # Keep the call inside the `with` so the file is still open while the
    # parsed records are consumed.
    records = parse_fasta(fasta_file)
    consensus(records)

GTCAGAGGAACTTTAAGCCCACATTAAAACACAGGCCGGCCACAGTAAATGGTAGAGCCCGAATTGCGCCCAATTATCCAAGAGCGAGATCAAATTATGTACCTACCTAGATAACAAAAAACTAAATCCTCCCGATCTGGGTGCTCAAACAAGCGACGGGAAACCCGCCCTACAGAAAAACCCTTTCACCACCAGGTCCAAACACCGGAAACAAACGGTCATTCCTTCACCGCCGGCACATAGCCCGCGAAGCCAGTTCGAGCCTTGAACCAAAAACACAGGCTAGGGCCAGCCGCGTGTAAGGTGGTCTCTTCATATCTCAATGCACATTAGTGCGTTGATAAAGGCCGCTCCCATGGGAACAAAACCGCCGATAAACCGATTCCACAAACAGCATTTTAGCCGCCGTTCGTCCGACGACAGTAACGGTGAGGAAATCATAGCAACAAGCAACGGCCAAATGAGGACCACTTCTCCGCGGGACGCCTGACTACGCATTCTCGCTGAATGGAGACCCGCAAACCGGACGCCGGTGGCCATTAACCTGCAGCTAGAGCAAAAGCCTCGACTACATTGGTCGGACGGGAGCTAAGAACAAGGTAGAGTGGGGTCAATGTGCAATTCAGATACTGTGAACCCGCAACCACCCAGAAGGAGATTCACAGGGGTCGATCGAATATATAAGTGTCATGCTAGCTTCAGTGATACTGAACTTACGGCGCGGATTAATGGCCGCAGGAGCAATCCTCCTCACTCACCCTCTGAGGAATGATTATCTAAGCATCACAGACAAACGGCGTTAGGACTCCAATCCTTCGGGTGTACTCTTTGTTAACGCAAAACTCCAATAACATAAAAGGTCCCCAAGCGACATACCTGGCACGAATAATTACACGCTGGACCGACGAATGCGGTATCCTGCGTCAATCCATCACGTTGCGCTCAGCAAGCCCGCGAACCGCAAATTCAGTTCAAATAGTCAATTCC
A: 2 3 2 3 2