In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load reference sequence
refseq_df = pd.read_csv("./TestData/20200205_ssSeq/RefSeqs.csv")

# Convert all sequences to uppercase
anycase_refseqs = refseq_df.ReferenceSequence.values.tolist()
uppercase_refseqs = [refseq.upper() for refseq in anycase_refseqs]
refseq_df["ReferenceSequence"] = uppercase_refseqs

In [19]:
def log_error(e):
    print(e)

def find_codons_variable_positions(refseq, inframe_bp):
    
    # Get the number of codons and number of variable positions
    n_codons = refseq.count("NNN")
    n_positions = refseq.count("N")
    
    # Check to be sure the number of positions is divisible by 3 and divides
    # to the number of codons
    if (n_positions % 3) != 0:
        log_error("Must enter `N` in groups of 3 to signify codon.")
    assert (n_positions / 3) == n_codons, "Mismatch in number of codons and number of variable positions."

    # If no codons are found, return an empty array
#     if n_codons
    
    # Split the reference sequence
    split_seqs = refseq.split("NNN")

    # Loop over the splits and make checks
    variable_positions = []
    variable_counter = 0
    for frag_ind, fragment in enumerate(split_seqs[:-1]):

        # Make sure fragments other than the first are divisible by 3, if not, then
        # the sections are not in frame
        frag_len = len(fragment)
        if (frag_ind > 0) and (not (frag_len % 3 == 0)):
            log_error("Specified variable positions are not in the same reading frame.")

        # Extend the variable positions. It will be extended by 
        # the fragment length
        variable_counter += frag_len
        for _ in range(3):
            variable_positions.append(variable_counter)
            variable_counter += 1

    # Convert to an array. Make sure everything is sorted.
    variable_positions = np.array(variable_positions)
    variable_positions.sort()

    # Make sure that all variable positions are "N" and that we found the appropriate
    # number of codons
    assert len(variable_positions) == n_positions, "Missing variable positions"
    assert all(refseq[pos] == "N" for pos in variable_positions), "Error in variable position calculation"
        
    # Validate that the provided in frame bp is correct
    if (test[0] % 3) != (inframe_bp - 1):
        log_error("The provided in frame base does not match the frame of the provided variable codons.")
    
    return variable_positions

In [21]:
test = find_codons_variable_positions("ACADFNNNENNNCADF", 1)

Specified variable positions are not in the same reading frame.


IndexError: index 0 is out of bounds for axis 0 with size 0

In [11]:
test[0] % 3

0