In [45]:
# Import relevant modules from ssSeqSupport.
from Code.Globals import N_CPUS, AA_ARRAY
from Code.Logging import log_init, log_info, log_error
from Code.InputProcessing import build_output_dirs
from Code.RunDeSeq import run_deseq
from Code.RunDeSeq import load_all, build_seqpairs, qc_seqpairs, assign_seqpairs_to_well, check_args

import numpy as np

In [2]:
# Define command line args
cl_args = {"refseq": "./deSeqValidation/TestRefSeq.csv",
          "folder": "./deSeqValidation/",
           "fastq_r": "",
          "output": "./TEMP/",
          "read_length": None,
          "length_cutoff": 0.9,
          "average_q_cutoff": 25,
          "bp_q_cutoff": 30,
          "variable_thresh": 0.1,
          "variable_count": 1,
          "jobs": 23,
          "analysis_only": False,
          "stop_after_fastq": False,
           "return_alignments": False,
           "datetime": "ADFADF",
           "detailed_refseq": True
          }

# Build all output directories
build_output_dirs(cl_args)

# Log CLArgs
log_init(cl_args)  

# Check the input arguments
check_args(cl_args)

# Identify sequencing files and load reference sequence information
forward_file, reverse_file, bc_to_ref_plate_well = load_all(cl_args)

# Pair all sequences
all_seqpairs = build_seqpairs(forward_file, reverse_file)

# Run QC on the seqpairs
filtered_seqpairs = qc_seqpairs(all_seqpairs, cl_args["read_length"],
                                cl_args["length_cutoff"], 
                                cl_args["average_q_cutoff"])

# Assign seqpairs to a well
all_wells = assign_seqpairs_to_well(filtered_seqpairs, 
                                    bc_to_ref_plate_well,
                                    cl_args["output"])

Loading forward reads...


Parsing forward reads...: 100%|██████████| 38396/38396 [00:01<00:00, 26728.43it/s]


Loading reverse reads...


Pairing reverse reads...: 100%|██████████| 38396/38396 [00:02<00:00, 17646.66it/s]


Running read qc...
Assigning sequences to wells...


In [23]:
problem_well = [well for well in all_wells if well.index_plate == "DI05" and well.well == "G06"][0]

In [24]:
problem_well.align()

In [25]:
problem_well.analyze_alignments(30, 1)

True

In [26]:
problem_well.build_unit_count_matrices()

In [27]:
problem_well.identify_variable_positions(0.1)

AssertionError: 

In [8]:
problem_well.all_variable_aa_positions

array([  0,   9,  17,  32,  35,  74,  75,  76,  79,  82,  91,  94,  97,
       105, 106, 107, 112])

In [9]:
problem_well.all_variable_bp_positions

array([  0,   1,  28,  51,  53,  96,  98, 107, 222, 224, 225, 226, 227,
       229, 237, 239, 246, 247, 248, 254, 273, 274, 275, 282, 292, 315,
       316, 318, 319, 320, 321, 322, 336, 337, 338, 342])

In [28]:
by_unit_frequency = problem_well.unit_bp_freqs_no_gaps
expected_array = problem_well.expected_bps[:-1]
variable_thresh = cl_args["variable_thresh"]
expected_variable_positions = np.array([])

In [29]:
difference_from_expectation_absolute = np.abs(by_unit_frequency - expected_array)
average_difference_from_expectation = np.sum(difference_from_expectation_absolute, axis = 0)/2

In [30]:
# Get the total frequencies of each well
total_frequencies = by_unit_frequency.sum(axis=0)

In [38]:
ones_array = np.ones(len(total_frequencies))
zeros_array = np.zeros(len(total_frequencies))

np.logical_or(np.isclose(total_frequencies, ones_array),
              np.isclose(total_frequencies, zeros_array))

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [37]:
total_frequencies[-2]

0.9999999999999999

In [32]:
# Get the total frequencies of each well
total_frequencies = by_unit_frequency.sum(axis=0)

# Assert that it is all 1 or 0
assert np.all(np.logical_or(total_frequencies == 1, total_frequencies == 0))

# The only way we can have a total frequency of 0 is if we are in a gap region. 
# This is because we are explicitly ignoring gaps in this calculation. The next
# code identifies gaps
gap_positions = total_frequencies == 0

# Get the length of the unit frequency first axis
n_units = by_unit_frequency.shape[0]

# Compare the unit frequency to the expected array.
# The furthest difference is 2 (e.g. if there are no reads matching to the
# expected sequence), so take the absolute value is taken and the full
# array divided by 2 to scale to a "percent different"
difference_from_expectation_absolute = np.abs(by_unit_frequency - expected_array[:n_units])
average_difference_from_expectation = np.sum(difference_from_expectation_absolute, axis = 0)/2

# Set the gap positions to have a difference of 0
average_difference_from_expectation[gap_positions] = 0

AssertionError: 

In [42]:
total_possible_positions = sum(np.logical_not(gap_positions))

In [43]:
total_possible_positions

247

In [44]:
123+124

247

In [14]:
# Get the counts of alignments that are paired end
paired_alignment_inds = np.array([i for i, seqpair in enumerate(problem_well.non_dud_alignments)
                                  if seqpair.is_paired_post_alignment_qc()])

# Get the counts for the paired alignment seqpairs
paired_alignment_counts = problem_well.all_aa_counts[paired_alignment_inds]

# Get the positions with variety
variable_position_counts = paired_alignment_counts[:, :, problem_well.all_variable_aa_positions]

In [15]:
# Make sure all passed QC. This means that each variable position has at least
# one count. This works because amino acids are only counted if they pass QC:
# for all to pass QC they must all have a count at some position
all_pos_at_least_one_count = np.all(variable_position_counts.sum(axis=1) >= 1, axis = 1)
passing_qc = variable_position_counts[all_pos_at_least_one_count]
alt_passing_qc = variable_position_counts[variable_position_counts.sum(axis = (1, 2)) == len(problem_well.all_variable_aa_positions)]

In [16]:
# Get the number of positions
n_positions = len(problem_well.all_variable_aa_positions)            

# Get the unique sequences that all passed QC
unique_binary_combos, unique_counts = np.unique(passing_qc, axis = 0, return_counts = True)

# We cannot have more counts than paired seqpairs
assert unique_counts.max() <= len(paired_alignment_inds), "Counting error"

# Get a frequency array
seq_depth = unique_counts.sum()
unique_freqs = unique_counts / seq_depth

# Loop over the unique combos and format for output
output = [None] * len(unique_counts)
for unique_counter, unique_binary_combo in enumerate(unique_binary_combos):

    # Get the index profile. This maps each position to a unit position
    # in either `BP_ARRAY` or `AA_ARRAY`
    index_profile = np.argwhere(np.transpose(unique_binary_combo > 0))

    # Get the position and amino acid.
    unique_position_array = problem_well.all_variable_aa_positions[index_profile[:, 0]]
    unique_combo = AA_ARRAY[index_profile[:, 1]]

    # Make sure the output is sorted
    assert np.all(np.diff(unique_position_array)), "Output not sorted"

    # Construct a sequence based on the reference
    # Construct a combo name based on the combo and position
    new_seq = list(problem_well.reference_sequence_aa)
    combo_name = [None] * n_positions
    simple_combo = combo_name.copy()
    for combo_ind, (pos, unit) in enumerate(zip(unique_position_array, unique_combo)):

        # Update the sequence
        new_seq[pos] = unit

        # Update the combo name. Add the offset to the position index to get
        # the start id of the reference seqeunce
        combo_name[combo_ind] = f"{problem_well.reference_sequence_aa[pos]}{pos + 1}{unit}"

        # Update the simple combo name
        simple_combo[combo_ind] = unit

    # Convert the new seq and new combo into strings
    new_seq = "".join(new_seq)
    combo_name = "_".join(combo_name)
    simple_combo = "".join(simple_combo)

In [17]:
index_profile

array([[ 0, 11],
       [ 1,  5],
       [ 2,  7],
       [ 3,  9],
       [ 4,  6],
       [ 5, 18],
       [ 6, 12],
       [ 7,  9],
       [ 8, 19],
       [ 9, 10],
       [10,  1],
       [11, 14],
       [12,  6],
       [13,  4],
       [14, 17],
       [15, 19],
       [16, 14]])

In [18]:
unique_position_array

array([  0,   9,  17,  32,  35,  74,  75,  76,  79,  82,  91,  94,  97,
       105, 106, 107, 112])

In [19]:
problem_well.all_variable_aa_positions

array([  0,   9,  17,  32,  35,  74,  75,  76,  79,  82,  91,  94,  97,
       105, 106, 107, 112])

In [20]:
unique_position_array

array([  0,   9,  17,  32,  35,  74,  75,  76,  79,  82,  91,  94,  97,
       105, 106, 107, 112])

In [21]:
passing_qc.shape

(38, 23, 17)

In [22]:
for seqpair in problem_well.all_seqpairs:
    print("F:", seqpair.f_average_q)
    print("R:", seqpair.r_average_q)

F: 34.68
R: 33.95333333333333
F: 34.74666666666667
R: 34.36
F: 34.52
R: 34.67333333333333
F: 35.053333333333335
R: 34.58
F: 35.06666666666667
R: 34.04
F: 34.28
R: 34.446666666666665
F: 34.54
R: 34.84
F: 34.04666666666667
R: 34.34
F: 34.666666666666664
R: 34.42
F: 34.193333333333335
R: 34.60666666666667
F: 34.72666666666667
R: 34.04
F: 34.28666666666667
R: 34.58
F: 34.46666666666667
R: 34.71333333333333
F: 34.766666666666666
R: 34.18
F: 34.873333333333335
R: 35.00666666666667
F: 34.06
R: 34.54
F: 34.526666666666664
R: 34.78
F: 34.233333333333334
R: 34.60666666666667
F: 34.53333333333333
R: 34.473333333333336
F: 34.62
R: 34.14666666666667
F: 34.4
R: 34.373333333333335
F: 34.76
R: 34.49333333333333
F: 33.86
R: 34.54
F: 34.8
R: 34.5
F: 34.62
R: 34.586666666666666
F: 34.42666666666667
R: 34.72666666666667
F: 34.32666666666667
R: 34.306666666666665
F: 34.8
R: 34.24666666666667
F: 34.513333333333335
R: 34.32
F: 34.233333333333334
R: 34.42666666666667
F: 34.81333333333333
R: 34.42
F: 34.34
R: 