In [1]:
# Test that input works
input_text = open("CONS_sampleinput.txt","r")
raw_list = input_text.readlines()
print(raw_list)

In [2]:
# Drop the FASTA IDs, we don't need them. And strip the newline character off too
strands = []
for i in range(1, len(raw_list), 2): # 1 is the starting index, 2 is the increment. so we're only using odd indices here
    strands.append(raw_list[i].rstrip("\n"))
print(strands)

In [3]:
# Turn every sequence into a list. 
sequences = []
for i in range(len(strands)):
    sequences.append(list((strands[i])))
print(sequences)

[['A', 'T', 'C', 'C', 'A', 'G', 'C', 'T'], ['G', 'G', 'G', 'C', 'A', 'A', 'C', 'T'], ['A', 'T', 'G', 'G', 'A', 'T', 'C', 'T'], ['A', 'A', 'G', 'C', 'A', 'A', 'C', 'C'], ['T', 'T', 'G', 'G', 'A', 'A', 'C', 'T'], ['A', 'T', 'G', 'C', 'C', 'A', 'T', 'T'], ['A', 'T', 'G', 'G', 'C', 'A', 'C', 'T']]


In [4]:
# Create a matrix of nucleotides using NumPy
import numpy as np
seq_matrix = np.array(sequences, dtype = "str").transpose()
print(seq_matrix)

[['A' 'G' 'A' 'A' 'T' 'A' 'A']
 ['T' 'G' 'T' 'A' 'T' 'T' 'T']
 ['C' 'G' 'G' 'G' 'G' 'G' 'G']
 ['C' 'C' 'G' 'C' 'G' 'C' 'G']
 ['A' 'A' 'A' 'A' 'A' 'C' 'C']
 ['G' 'A' 'T' 'A' 'A' 'A' 'A']
 ['C' 'C' 'C' 'C' 'C' 'T' 'C']
 ['T' 'T' 'T' 'C' 'T' 'T' 'T']]


In [5]:
# Initialize empty NumPy array. First column is nucleobase A, second C, third G, fourth T
ACGT = np.empty((np.shape(seq_matrix)[0],4), dtype = "int")
print(ACGT)

[[4128860 6029375 3801155 5570652]
 [6619251 7536754 5374044 6357113]
 [6029422 7209057 6488161 7209071]
 [6357092 6029363 6881388 6029410]
 [6881395 6619252 7340077 6488161]
 [6357099 6619239 6029427 7667822]
 [7340141 6029433 6881388 6029410]
 [7602293 7077993 3014771 7929968]]


In [6]:
# Fill in the counts
for row in range(len(ACGT)):
    ACGT[row][0] = (seq_matrix[row] == "A").sum()
    ACGT[row][1] = (seq_matrix[row] == "C").sum()
    ACGT[row][2] = (seq_matrix[row] == "G").sum()
    ACGT[row][3] = (seq_matrix[row] == "T").sum()
print(ACGT)

[[5 0 1 1]
 [1 0 1 5]
 [0 1 6 0]
 [0 4 3 0]
 [5 2 0 0]
 [5 0 1 1]
 [0 6 0 1]
 [0 1 0 6]]


In [7]:
# Get indices of max values
ind = np.argwhere(ACGT==np.amax(ACGT,1, keepdims=True))
ind = list(map(tuple, ind)) # put in tuple form
# Source: https://stackoverflow.com/questions/61229657/find-index-of-max-value-in-each-row-of-2d-array
consensus_ind = []
for i in range(len(ind)):
    consensus_ind.append(ind[i][1])
print(consensus_ind)

[0, 3, 2, 1, 0, 0, 1, 3]


In [8]:
# Use indices to create consensus sequence in list form
transcribe = {0:"A",1:"C",2:"G",3:"T"}
consensus_sequence = [transcribe[n] for n in consensus_ind]
print(consensus_sequence)

['A', 'T', 'G', 'C', 'A', 'A', 'C', 'T']


In [9]:
# Write true consensus
consensus = ""
consensus = consensus.join(consensus_sequence)
print(consensus)

# Transpose the counts matrix
ACGT = ACGT.transpose()
print(ACGT)

ATGCAACT
[[5 1 0 0 5 5 0 0]
 [0 0 1 4 2 0 6 1]
 [1 1 6 3 0 1 0 0]
 [1 5 0 0 0 1 1 6]]


The purpose of the code below is to create a text file with the correct Rosalind formatting. The code below is intended to be finished later, it's been two full hours. -Ryan, 9:17 pm, 4/21/21

In [10]:
# Convert to list
ACGT = ACGT.tolist()
print(ACGT)

[[5, 1, 0, 0, 5, 5, 0, 0], [0, 0, 1, 4, 2, 0, 6, 1], [1, 1, 6, 3, 0, 1, 0, 0], [1, 5, 0, 0, 0, 1, 1, 6]]


In [13]:
output_list = ["A: ", "C: ", "G: ", "T: "]
for row in range(len(ACGT)):
    for num in range(len(ACGT[0])):
        output_list[row] = output_list[row] + " ".join(str() for a in ACGT)
print(output_list)


['A: [5, 1, 0, 0, 5, 5, 0, 0] [0, 0, 1, 4, 2, 0, 6, 1] [1, 1, 6, 3, 0, 1, 0, 0] [1, 5, 0, 0, 0, 1, 1, 6]', 'C: [5, 1, 0, 0, 5, 5, 0, 0] [0, 0, 1, 4, 2, 0, 6, 1] [1, 1, 6, 3, 0, 1, 0, 0] [1, 5, 0, 0, 0, 1, 1, 6]', 'G: [5, 1, 0, 0, 5, 5, 0, 0] [0, 0, 1, 4, 2, 0, 6, 1] [1, 1, 6, 3, 0, 1, 0, 0] [1, 5, 0, 0, 0, 1, 1, 6]', 'T: [5, 1, 0, 0, 5, 5, 0, 0] [0, 0, 1, 4, 2, 0, 6, 1] [1, 1, 6, 3, 0, 1, 0, 0] [1, 5, 0, 0, 0, 1, 1, 6]']


In [12]:
# Open output file
outputtxt = open("CONS_output.txt","w")