In [79]:
# Import relevant modules from ssSeqSupport.
from Code.Globals import N_CPUS, AA_ARRAY
from Code.Logging import log_init, log_info, log_error
from Code.InputProcessing import build_output_dirs
from Code.RunDeSeq import run_deseq
from Code.RunDeSeq import load_all, build_seqpairs, qc_seqpairs, assign_seqpairs_to_well, check_args

import numpy as np

In [12]:
# Stand-in for the parsed command line arguments, kept in the same key order
cl_args = dict(
    refseq="./AlignmentDev/TestData/NatTiles/DefaultRefSeqs.csv",
    folder="./AlignmentDev/TestData/NatTiles/",
    fastq_r="",
    output="./TEMP",
    read_length=None,
    length_cutoff=0.9,
    average_q_cutoff=25,
    bp_q_cutoff=30,
    variable_thresh=0.1,
    variable_count=1,
    jobs=23,
    analysis_only=False,
    stop_after_fastq=False,
    return_alignments=False,
    datetime="ADFADF",
    detailed_refseq=False,
)

# Create every output directory
build_output_dirs(cl_args)

# Write the arguments to the log
log_init(cl_args)

# Validate the arguments
check_args(cl_args)

# Find the sequencing files and load the reference sequence information
forward_file, reverse_file, bc_to_ref_plate_well = load_all(cl_args)

# Pair up all sequences
all_seqpairs = build_seqpairs(forward_file, reverse_file)

# QC the seqpairs
filtered_seqpairs = qc_seqpairs(
    all_seqpairs,
    cl_args["read_length"],
    cl_args["length_cutoff"],
    cl_args["average_q_cutoff"],
)

# Assign the QC-filtered seqpairs to wells
all_wells = assign_seqpairs_to_well(
    filtered_seqpairs,
    bc_to_ref_plate_well,
    cl_args["output"],
)

Loading forward reads...


Parsing forward reads...: 100%|██████████| 649831/649831 [00:20<00:00, 31956.73it/s]


Loading reverse reads...


Pairing reverse reads...: 100%|██████████| 649831/649831 [00:32<00:00, 19887.13it/s]


Running read qc...
Assigning sequences to wells...


In [13]:
# Pick out the single well being debugged. The plate and well identifiers are
# named once here so they are easy to change.
target_plate, target_well = "DI03", "G10"
matching_wells = [well for well in all_wells
                  if well.index_plate == target_plate and well.well == target_well]

# Fail with an informative message (same exception type as indexing an empty
# list) instead of a bare "list index out of range" when nothing matches.
if not matching_wells:
    raise IndexError(f"No well found for index plate {target_plate}, well {target_well}")
problem_well = matching_wells[0]

In [14]:
# Call the well's alignment step.
# NOTE(review): presumably aligns this well's seqpairs against its reference
# sequence -- confirm against the well class.
problem_well.align()

In [16]:
# Analyze this well's alignments.
# NOTE(review): the literals 30 and 1 equal cl_args["bp_q_cutoff"] and
# cl_args["variable_count"]; presumably they should be read from cl_args
# instead of being hard-coded -- confirm against the method signature.
problem_well.analyze_alignments(30, 1)

True

In [17]:
# Build the well's unit count matrices.
# NOTE(review): presumably this populates `all_aa_counts`, which is read
# further down -- confirm against the well class.
problem_well.build_unit_count_matrices()

In [25]:
# Identify the variable positions for this well.
# NOTE(review): 0.1 equals cl_args["variable_thresh"]; presumably it should
# be read from cl_args instead of being hard-coded -- confirm.
problem_well.identify_variable_positions(0.1)

In [26]:
# Inspect the variable amino acid positions stored on the well
problem_well.all_variable_aa_positions

array([15, 31, 35, 39, 40, 41, 42])

In [50]:
# Get the indices (into `problem_well.non_dud_alignments`) of the seqpairs
# that are still paired-end after alignment QC. The dtype is forced to an
# integer type so that an empty selection remains a valid index array:
# `np.array([])` defaults to float64, which NumPy refuses to use as an index.
paired_alignment_inds = np.array(
    [i for i, seqpair in enumerate(problem_well.non_dud_alignments)
     if seqpair.is_paired_post_alignment_qc()],
    dtype=int,
)

# Get the amino acid counts for the paired seqpairs only.
# Assumes the first axis of `all_aa_counts` follows the order of
# `non_dud_alignments` -- TODO confirm against the well class.
paired_alignment_counts = problem_well.all_aa_counts[paired_alignment_inds]

# Restrict the last axis to the variable amino acid positions
variable_position_counts = paired_alignment_counts[:, :, problem_well.all_variable_aa_positions]

In [65]:
# Keep only the seqpairs where every variable position passed QC. Amino acids
# are only counted if they pass QC, so a seqpair passes at all positions when
# each variable position carries at least one count.
counts_per_position = variable_position_counts.sum(axis=1)
all_pos_at_least_one_count = (counts_per_position >= 1).all(axis=1)
passing_qc = variable_position_counts[all_pos_at_least_one_count]

# Alternative filter: the total count over all amino acids and positions must
# equal the number of variable positions
expected_total = len(problem_well.all_variable_aa_positions)
total_counts = variable_position_counts.sum(axis=(1, 2))
alt_passing_qc = variable_position_counts[total_counts == expected_total]

In [95]:
# Get the number of variable positions
n_positions = len(problem_well.all_variable_aa_positions)

# Get the unique count matrices among the seqpairs that passed QC, along with
# the number of times each one occurs
unique_binary_combos, unique_counts = np.unique(passing_qc, axis=0, return_counts=True)

# We cannot have more counts than paired seqpairs
assert unique_counts.max() <= len(paired_alignment_inds), "Counting error"

# Get a frequency array
seq_depth = unique_counts.sum()
unique_freqs = unique_counts / seq_depth

# Loop over the unique combos and format for output
output = [None] * len(unique_counts)
for unique_counter, unique_binary_combo in enumerate(unique_binary_combos):

    # Get the index profile. After the transpose each row is
    # (index into the variable positions, index into `AA_ARRAY`), and
    # `np.argwhere` returns the rows ordered by the first column.
    index_profile = np.argwhere(np.transpose(unique_binary_combo > 0))

    # Get the position and amino acid.
    unique_position_array = problem_well.all_variable_aa_positions[index_profile[:, 0]]
    unique_combo = AA_ARRAY[index_profile[:, 1]]

    # Every variable position must contribute exactly one amino acid. Without
    # this check a mismatch only shows up later as an obscure `TypeError` in
    # the join (too few entries) or an `IndexError` (too many entries).
    assert len(index_profile) == n_positions, "Expected one amino acid per variable position"

    # Make sure the output is sorted. The differences must be strictly
    # positive: testing them for truthiness alone would also accept negative
    # (i.e. unsorted) steps.
    assert np.all(np.diff(unique_position_array) > 0), "Output not sorted"

    # Construct a sequence based on the reference
    # Construct a combo name based on the combo and position
    new_seq = list(problem_well.reference_sequence_aa)
    combo_name = [None] * n_positions
    simple_combo = combo_name.copy()
    for combo_ind, (pos, unit) in enumerate(zip(unique_position_array, unique_combo)):

        # Update the sequence
        new_seq[pos] = unit

        # Update the combo name as <reference residue><position><new residue>.
        # `pos` is a 0-based index into the reference, so 1 is added to
        # report a 1-based position.
        combo_name[combo_ind] = f"{problem_well.reference_sequence_aa[pos]}{pos + 1}{unit}"

        # Update the simple combo name
        simple_combo[combo_ind] = unit

    # Convert the new seq and new combo into strings
    new_seq = "".join(new_seq)
    combo_name = "_".join(combo_name)
    simple_combo = "".join(simple_combo)

    # Record the formatted result; previously `output` was allocated but
    # never filled, so everything except the last combo was discarded.
    # NOTE(review): the field layout is a reviewer choice -- adjust it to
    # whatever the downstream consumer expects.
    output[unique_counter] = [combo_name, simple_combo,
                              unique_freqs[unique_counter],
                              unique_counts[unique_counter],
                              new_seq]

In [97]:
# Inspect the index profile left behind by the last iteration of the combo
# formatting loop
index_profile

array([[ 0,  2],
       [ 1, 16],
       [ 2, 12],
       [ 3, 14],
       [ 4,  3],
       [ 5,  1],
       [ 6,  9]])

In [90]:
# Inspect the positions selected in the last iteration of the combo
# formatting loop.
# NOTE(review): the saved output carries an earlier execution count than the
# loop cell, so it may be stale -- re-run to confirm.
unique_position_array

array([15, 31, 35, 40, 41, 42])

In [88]:
# Inspect the variable amino acid positions stored on the well, for
# comparison with `unique_position_array`
problem_well.all_variable_aa_positions

array([15, 31, 35, 39, 40, 41, 42])

In [86]:
# Inspect the positions selected in the last iteration of the combo
# formatting loop.
# NOTE(review): the saved output carries an earlier execution count than the
# loop cell, so it may be stale -- re-run to confirm.
unique_position_array

array([15, 31, 35, 40, 41, 42])

In [63]:
# Check the shape of the array of QC-passing counts
passing_qc.shape

(1, 23, 7)

In [73]:
# Print the forward and reverse average quality score of every seqpair
# assigned to the well
for read_pair in problem_well.all_seqpairs:
    for label, average_q in (("F:", read_pair.f_average_q),
                             ("R:", read_pair.r_average_q)):
        print(label, average_q)

F: 38.086092715231786
R: 34.17283950617284
F: 32.75496688741722
R: 30.33112582781457
F: 31.23841059602649
R: 28.132450331125828
F: 35.5364238410596
R: 27.288590604026847
F: 37.82781456953642
R: 34.69798657718121
F: 35.6953642384106
R: 22.825503355704697
F: 37.54304635761589
R: 31.959731543624162
F: 37.3046357615894
R: 26.892617449664428
F: 38.08
R: 34.04635761589404
F: 35.22
R: 25.24503311258278
F: 38.22516556291391
R: 34.328859060402685
F: 36.3046357615894
R: 33.630872483221474
F: 37.728476821192054
R: 27.892617449664428
F: 36.83443708609271
R: 29.93288590604027
F: 37.980132450331126
R: 35.46308724832215
F: 35.079470198675494
R: 27.436241610738254
F: 31.91390728476821
R: 22.22818791946309
F: 38.152317880794705
R: 36.51677852348993
F: 37.10204081632653
R: 33.64
F: 38.24503311258278
R: 34.18791946308725
F: 37.033112582781456
R: 26.59731543624161
F: 36.95652173913044
R: 33.53691275167785
F: 34.496688741721854
R: 36.6530612244898
F: 32.72185430463576
R: 20.206666666666667
F: 37.8211920529