In [2]:
# Import relevant modules from ssSeqSupport.
from ..code.globals import N_CPUS, AA_ARRAY
from ..code.logging import log_init, log_info, log_error
from ..code.input_processing import build_output_dirs
from ..code.run_deSeq import run_deseq
from ..code.run_deSeq import load_all, build_seqpairs, qc_seqpairs, assign_seqpairs_to_well, check_args

import numpy as np

ImportError: attempted relative import with no known parent package

In [4]:
# Stand-in for the arguments that would normally be parsed from the command line
cl_args = {
    "refseq": "../data/test_data/DefaultRefSeqs.csv",
    "folder": "../data/test_data/",
    "fastq_r": "",
    "output": "./TEMP/VAL/",
    "read_length": None,
    "length_cutoff": 0.9,
    "average_q_cutoff": 25,
    "bp_q_cutoff": 30,
    "variable_thresh": 0.1,
    "variable_count": 1,
    "jobs": 23,
    "analysis_only": False,
    "stop_after_fastq": False,
    "return_alignments": False,
    "datetime": "ADFADF",
    "detailed_refseq": False,
}

# Create the output directories, then write the arguments to the log
build_output_dirs(cl_args)
log_init(cl_args)

# Validate the arguments before any data is touched
check_args(cl_args)

# Locate the forward/reverse sequencing files and load the reference
# sequence information (barcode -> reference/plate/well mapping)
forward_file, reverse_file, bc_to_ref_plate_well = load_all(cl_args)

# Match forward and reverse reads into seqpairs
all_seqpairs = build_seqpairs(forward_file, reverse_file)

# Quality-filter the seqpairs using the read length, length cutoff and
# average quality cutoff from the arguments
filtered_seqpairs = qc_seqpairs(
    all_seqpairs,
    cl_args["read_length"],
    cl_args["length_cutoff"],
    cl_args["average_q_cutoff"],
)

# Distribute the surviving seqpairs over their wells
all_wells = assign_seqpairs_to_well(
    filtered_seqpairs,
    bc_to_ref_plate_well,
    cl_args["output"],
)

Loading forward reads...


Parsing forward reads...: 100%|██████████| 453481/453481 [00:16<00:00, 27413.07it/s]


Loading reverse reads...


Pairing reverse reads...: 100%|██████████| 453481/453481 [00:27<00:00, 16593.70it/s]


Running read qc...
Assigning sequences to wells...


In [5]:
# Isolate the well under investigation: index plate "DI02", well "E06".
# Taking element 0 raises IndexError if no well matches.
problem_well = [
    candidate
    for candidate in all_wells
    if candidate.index_plate == "DI02" and candidate.well == "E06"
][0]

In [6]:
# Run the alignment step for the selected well.
# NOTE(review): presumably aligns the well's seqpairs against its reference
# sequence and must run before the analysis cells below — confirm in the well class.
problem_well.align()

In [7]:
# Analyze the alignments produced by align().
# NOTE(review): 30 and 1 equal cl_args["bp_q_cutoff"] and
# cl_args["variable_count"]; presumably those are the intended arguments —
# confirm against the method signature and consider passing the cl_args values.
problem_well.analyze_alignments(30, 1)

True

In [8]:
# Build the per-unit count matrices for the selected well.
# NOTE(review): presumably this populates `all_aa_counts`, which a later cell
# reads — confirm in the well class.
problem_well.build_unit_count_matrices()

In [9]:
# Identify the variable positions in the selected well.
# NOTE(review): 0.1 equals cl_args["variable_thresh"]; presumably it is the
# variability threshold — confirm and consider passing the cl_args value.
problem_well.identify_variable_positions(0.1)

''

In [13]:
# Stage the inputs for the variant-combo formatting cell as plain names so
# it can be stepped through outside of the well object.

# Thresholds. NOTE(review): these differ from cl_args (0.1 and 1) and are not
# read by the cells visible here — confirm whether they are still needed.
variable_thresh = 0.2
variable_count = 10

# Amino-acid level data taken from the well under investigation
reference_sequence = problem_well.reference_sequence_aa
pos_offset = problem_well.aa_ind_start
variable_positions = problem_well.all_variable_aa_positions
all_counts = problem_well.all_aa_counts

# Lookup array used to translate a unit index back into its character
unit_array = AA_ARRAY

In [17]:
# Rebuild, step by step, the table of variant combinations for `problem_well`.
# Inputs (`variable_positions`, `all_counts`, `unit_array`,
# `reference_sequence`, `pos_offset`) come from the staging cell above; the
# result is `output`, one row per unique combination, laid out as in `columns`.

# Define output columns
columns = ("IndexPlate", "Plate", "Well", "VariantCombo", "SimpleCombo",
           "VariantsFound", "AlignmentFrequency", "WellSeqDepth",
           "VariantSequence", "Flags")

# Get the number of positions
n_positions = len(variable_positions)

# Get the indices of alignments that are paired end after alignment QC. The
# integer dtype is explicit because an empty selection would otherwise become
# a float array, which numpy refuses to use as an index.
paired_alignment_inds = np.array(
    [i for i, seqpair in enumerate(problem_well.non_dud_alignments)
     if seqpair.is_paired_post_alignment_qc()],
    dtype=int)

# Get the counts for the paired alignment seqpairs
paired_alignment_counts = all_counts[paired_alignment_inds]

# Number of paired seqpairs; used as the upper bound in the counting check below
n_paired = len(paired_alignment_inds)

# Get the positions with variety. Axes are (seqpair, unit, variable position).
variable_position_counts = paired_alignment_counts[:, :, variable_positions]

# Make sure all passed QC. This means that each variable position has at least
# one count. This works because amino acids are only counted if they pass QC:
# for all to pass QC they must all have a count at some position
all_pos_at_least_one_count = np.all(variable_position_counts.sum(axis=1) >= 1, axis=1)
passing_qc = variable_position_counts[all_pos_at_least_one_count].copy()

# Replace all instances where we have a count of 2 with 1. Counting at
# this stage is by combo, so we don't worry about the sequencing depth
# of individual positions
passing_qc[passing_qc == 2] = 1
assert np.all(np.logical_or(passing_qc == 1, passing_qc == 0)), "Unexpected number of counts"

# Get the unique sequences that all passed QC
unique_binary_combos, unique_counts = np.unique(passing_qc, axis=0, return_counts=True)

# Get a frequency array
seq_depth = unique_counts.sum()
unique_freqs = unique_counts / seq_depth

# We cannot have counted more seqpairs than there are paired seqpairs. Checking
# the total (rather than `.max()`) also works when nothing passed QC, where
# `.max()` on the empty count array would raise a ValueError.
assert seq_depth <= n_paired, "Counting error"

# Loop over the unique combos and format for output
output = [None] * len(unique_counts)
for unique_counter, unique_binary_combo in enumerate(unique_binary_combos):

    # Get the index profile. This maps each position to a unit position
    # in either `BP_ARRAY` or `AA_ARRAY`. After the transpose, column 0 is the
    # index into `variable_positions` and column 1 the index into `unit_array`.
    index_profile = np.argwhere(np.transpose(unique_binary_combo > 0))

    # Get the position and amino acid.
    unique_position_array = variable_positions[index_profile[:, 0]]
    unique_combo = unit_array[index_profile[:, 1]]

    # Make sure the output is sorted: positions must be strictly increasing.
    # The comparison is required; a bare `np.all(np.diff(...))` is truthy for
    # any non-zero step, including negative ones, so it never checked order.
    assert np.all(np.diff(unique_position_array) > 0), "Output not sorted"

    # Construct a sequence based on the reference
    # Construct a combo name based on the combo and position
    new_seq = list(reference_sequence)
    combo_name = [None] * n_positions
    simple_combo = combo_name.copy()
    for combo_ind, (pos, unit) in enumerate(zip(unique_position_array, unique_combo)):

        # Update the sequence
        new_seq[pos] = unit

        # Update the combo name. Add the offset to the position index so the
        # reported number is relative to the start id of the reference sequence
        combo_name[combo_ind] = f"{reference_sequence[pos]}{pos + pos_offset}{unit}"

        # Update the simple combo name
        simple_combo[combo_ind] = unit

    # Convert the new seq and new combo into strings
    new_seq = "".join(new_seq)
    combo_name = "_".join(combo_name)
    simple_combo = "".join(simple_combo)

    # Record output (same order as `columns`; Flags is left as None here)
    output[unique_counter] = [problem_well.index_plate, problem_well.plate_nickname, problem_well.well,
                              combo_name, simple_combo, n_positions,
                              unique_freqs[unique_counter], seq_depth, new_seq, None]

In [18]:
# Display the formatted rows: one list per unique variant combination
output

[['DI02',
  'Plate02',
  'E06',
  '?10S_?13M_?34*_?38L',
  'SM*L',
  4,
  0.0021691973969631237,
  461,
  'ALQKHSVAISATMGRLLFERYPETRSLFELPER*IHKLASALLAYARS',
  None],
 ['DI02',
  'Plate02',
  'E06',
  '?10S_?13T_?34Q_?38L',
  'STQL',
  4,
  0.0021691973969631237,
  461,
  'ALQKHSVAISATTGRLLFERYPETRSLFELPERQIHKLASALLAYARS',
  None],
 ['DI02',
  'Plate02',
  'E06',
  '?10S_?13M_?34Q_?38L',
  'SMQL',
  4,
  0.9783080260303688,
  461,
  'ALQKHSVAISATMGRLLFERYPETRSLFELPERQIHKLASALLAYARS',
  None],
 ['DI02',
  'Plate02',
  'E06',
  '?10S_?13K_?34Q_?38L',
  'SKQL',
  4,
  0.0021691973969631237,
  461,
  'ALQKHSVAISATKGRLLFERYPETRSLFELPERQIHKLASALLAYARS',
  None],
 ['DI02',
  'Plate02',
  'E06',
  '?10S_?13I_?34Q_?38L',
  'SIQL',
  4,
  0.0021691973969631237,
  461,
  'ALQKHSVAISATIGRLLFERYPETRSLFELPERQIHKLASALLAYARS',
  None],
 ['DI02',
  'Plate02',
  'E06',
  '?10G_?13M_?34Q_?38L',
  'GMQL',
  4,
  0.0021691973969631237,
  461,
  'ALQKHSVAIGATMGRLLFERYPETRSLFELPERQIHKLASALLAYARS',
  None],
 