In [1]:
import tests.data_generation.globals as test_glob
from tests.data_generation.globals import SAVELOC
from tests.data_generation.run_generator import FakeRun
from tests.data_generation.stress_tests import run_aa_stress_test, check_well_is_parent, calculate_parent_counts

import os
import pickle
import random
import shutil
import numpy as np
import pandas as pd
from glob import glob

This is for stress-testing the evSeq code by passing in random inputs with known expected output. We then check to see if the evSeq outputs match the expected.

In [2]:
def run_evseq_stress_test(detailed, include_nnn):
    
    # Run until we break something
    counter = -1
    while True:
        
        # Update the counter
        counter += 1
        
        # Update the global RNG to match the counter (for reproducbility)
        test_glob.RANDOM_SEED = counter
        test_glob.NP_RNG = np.random.default_rng(counter)
        test_glob.RANDOM_RNG = random.Random(counter)
    
        # Build a test run and the associated output files
        test_run = FakeRun(detailed = detailed)
        test_run.build_fastq()
        test_run.build_refseq(include_nnn)

        # Run evSeq on the generated data
#         test_run.run_evseq()

        return test_run
        
        # Get the expected outputs
        expected_out = test_run.build_expected_aa()

        # Get the true outputs. Sort the true output in the same
        # way the expected was sorted.
        most_recent_run_path = sorted(glob(os.path.join(SAVELOC, "evSeqOutput", "*")))[-1]
        true_out = pd.read_csv(os.path.join(most_recent_run_path, "OutputCounts", 
                                            "AminoAcids_Decoupled_All.csv"))
        true_out.sort_values(by = ["IndexPlate", "Well", "AaPosition", "Aa"],
                             inplace = True)

        # Test the two dataframes to make sure they agree
        test_passed, bad_platewells, reports = test_aa(expected_out, true_out)
        
        # Note success if all tests passed and delete output
        if test_passed:
            print(f"All tests passed for seed: {counter}")
            shutil.rmtree(most_recent_run_path)
        
        # Save the error reports if there were any
        else:
            # Report errors
            for plate, well in bad_platewells:
                print(f"Errors found for {plate}-{well} with seed {counter}.")
                
            # Save the messed up components
            error_loc = os.path.join(SAVELOC, "ErrorReports")
            if not os.path.isdir(error_loc):
                os.mkdir(error_loc)
            with open(os.path.join(error_loc, f"{counter}.pkl"), "wb") as f:
                pickle.dump([bad_platewells, reports], f)

In [3]:
test_run = run_evseq_stress_test(True, True)

In [4]:
expected_decoupled_aa_df, expected_coupled_aa_df = test_run.build_expected_aa()    

In [5]:
expected_coupled_aa_df

Unnamed: 0,IndexPlate,Plate,Well,VariantCombo,SimpleCombo,VariantsFound,AlignmentFrequency,WellSeqDepth,VariantSequence,Flags
0,DI01,TestPlate01,A01,P3102P_*3108*_M3119M_G3138G_C3144C_Y3156Y_Q327...,P*MGCYQT,8,0.500000,34,PLFG*A*DTPWLNYRPMMWFTQ*IVM*SRTDAV*MAGLII**C*QM...,
1,DI01,TestPlate01,A01,P3102F_*3108N_M3119L_G3138R_C3144Q_Y3156M_Q327...,FNLRQM*N,8,0.500000,34,FLFG*ANDTPWLNYRPMLWFTQ*IVM*SRTDAV*MARLII**Q*QM...,
3,DI01,TestPlate01,A02,A9452L_F9466W_?9467F_?9470E_E9471E_T9482*_?948...,LWFEE*IQ,8,0.195652,92,ARDLVWFLLTPAPHEKVYCMRWFFEEEECCSGSNVMS*AMKGKILT...,
4,DI01,TestPlate01,A02,A9452A_F9466F_?9467F_?9470V_E9471L_T9482T_?948...,AFFVLTIQ,8,0.391304,92,ARDLVWFALTPAPHEKVYCMRFFFEVLECCSGSNVMSTAMKGKILT...,
2,DI01,TestPlate01,A02,A9452A_F9466F_?9467R_?9470H_E9471E_T9482T_?948...,AFRHETMC,8,0.413043,92,ARDLVWFALTPAPHEKVYCMRFRFEHEECCSGSNVMSTAMKGKMLT...,
...,...,...,...,...,...,...,...,...,...,...
1559,DI08,TestPlate08,H11,C6935M_K6960S_D6983M_?7017F_?7021C_F7023F_I7030S,MSMFCFS,7,0.685393,89,PK*PPTCEMYN*MNQMIYKWCSSHEVVNDKYEVSVRLPLQILAKGE...,
1562,DI08,TestPlate08,H12,A585S_C588E_W591N_Q607A_G609F_H610S_Y622G_?624...,SENAFSGFAHEHLNR,15,0.155172,58,SQPEALNLKPYAYDCGTKTYYKAIFSRNGKGMCCHSIGFFHPYMCT...,
1561,DI08,TestPlate08,H12,A585A_C588C_W591W_Q607Q_G609G_H610H_Y622Y_?624...,ACWQGHYWRHEHQHR,15,0.224138,58,AQPCALWLKPYAYDCGTKTYYKQIGHRNGKGMCCHSIYFWHPYMCT...,
1563,DI08,TestPlate08,H12,A585A_C588C_W591W_Q607Q_G609G_H610H_Y622Y_?624...,ACWQGHYFRKEHQHK,15,0.293103,58,AQPCALWLKPYAYDCGTKTYYKQIGHRNGKGMCCHSIYFFHPYMCT...,


In [13]:
expected_coupled_aa_df.VariantCombo.values[1]

'P3102P_*3108*_M3119M_G3138G_C3144C_Y3156Y_Q3274Q_T3291T'

In [21]:
if all(mutated_pos in test_run.wells[0].refseq.double_count_inds
       for mutated_pos in test_run.wells[0].variants[0].mutated_positions):
    print("HI")

HI


In [23]:
all([])

True

In [24]:
any([])

False

In [17]:
test_run.wells[0].variants[0].doub

17

In [18]:
test_run.wells[0].variants[1].total_counts

17

In [7]:
# Loop over all wells
for well in test_run.wells:

    # Skip dud wells
    if well.dud_well:
        continue

    # Gather expected mutation positions
    mut_key = (well.platename, well.wellname)
    nnn_key = mut_key if test_run.detailed else well.platename
    nnn_positions = expected_nnn_positions[nnn_key]
    mut_positions = expected_mut_positions[mut_key]
    all_positions = nnn_positions | mut_positions
    
    # If there are no positions OR the positions all give
    # the parent amino acid, this is a parent well
    is_parent = check_well_is_parent(well, all_positions, nnn_positions)
    
    break

In [35]:
# If this is a parent well, format as appropriate
if is_parent:
    
    res = [[
        well.platename,
        well.platenickname,
        well.wellname,
        "#PARENT#",
        "#PARENT#",
        0,
        1.0,
        calculate_parent_counts(well),
        "".join(well.refseq.aa_refseq),
        "#PARENT#"
    ]]
        
# Get a sorted list of all positions. 
all_positions_sorted = sorted(list(all_positions))

# Identify indices where the aa is (1) the same as the parent for all variants,
# and (2) not an "NNN" position, continue. Remove these from the variant pool
# as evSeq will not find it.
no_use_positions = []
for pos in all_positions_sorted:
    same_as_ref_check = all(variant.base_mut_aa_seq[pos] == well.refseq.aa_refseq[pos]
                            for variant in well.variants)
    if (pos not in nnn_positions) and same_as_ref_check:
        no_use_positions.append(pos)
no_use_positions = set(no_use_positions)
        
adjusted_positions = [pos + well.refseq.aa_ind_start
                      for pos in all_positions_sorted
                      if pos not in no_use_positions]
    
    
# Get variant counts and frequencies
combo_counts = [variant.expected_combo_counts for variant in well.variants]
total_counts = sum(combo_counts)
frequencies = combo_counts / total_counts
    
# Loop over all variants
well_res = [None] * len(well.variants)
for i, variant in enumerate(well.variants):
    
    # Grab the amino acid identities for both the variant and reference
    # at all positions. 
    all_ref_aas = [None] * len(all_positions_sorted)
    all_variant_aas = all_ref_aas.copy()
    for j, pos in enumerate(all_positions_sorted):
        all_ref_aas[j] = "?" if pos in nnn_positions else well.refseq.aa_refseq[pos]
        all_variant_aas[j] = variant.base_mut_aa_seq[pos]
    
    # Build the variant combo
    variant_combo = "_".join([f"{ref_aa}{pos}{mut_aa}" for ref_aa, pos, mut_aa in
                              zip(all_ref_aas, adjusted_positions, all_variant_aas)])
        
    # Build the expected outputs
    well_res[i] = [
        well.platename,
        well.platenickname,
        well.wellname,
        variant_combo,
        "".join(all_variant_aas),
        len(adjusted_positions),
        frequencies[i],
        total_counts,
        "".join(variant.base_mut_aa_seq)
    ] 

In [36]:
well_res

[['DI01',
  'TestPlate01',
  'A01',
  '?3102C_?3113D_V3126V_*3128*_A3154A_Y3160Y_?3168W_?3264R_?3271W_F3272F_R3278R_?3280H_?3288E_?3297Q_?3306Y_?3312K_?3318K_*3319*',
  'CDV*AYWRWFRHEQYKK*W',
  18,
  0.47058823529411764,
  68,
  'CLFG*A*DTPWDNYRPMMWFTQ*IVM*SRTDAV*MAGLII**C*QMPFNSIKYSYHQWWTQQIVCNRTMHGILRVAWMHQNFGRNLHRIGVFEQPAAHTISGFSVAAQHNDVKVSQESAMIYEWAPNVGVQVERWAHPCLPSWIKKYELAIWPHYNW*KTRIWIMESAIFRRQWWDHCEYWQYV*CQAYTYHDLYYVITELGEVKCWYFIKPKDWQ*WRD'],
 ['DI01',
  'TestPlate01',
  'A01',
  '?3102P_?3113L_V3126T_*3128K_A3154A_Y3160S_?3168P_?3264R_?3271L_F3272I_R3278F_?3280C_?3288R_?3297V_?3306H_?3312S_?3318M_*3319R',
  'PLTKASPRLIFCRVHSMRT',
  18,
  0.5294117647058824,
  68,
  'PLFG*A*DTPWLNYRPMMWFTQ*ITMKSRTDAV*MAGLII**C*QMPFNSIKSSYHQWPTQQIVCNRTMHGILRVAWMHQNFGRNLHRIGVFEQPAAHTISGFSVAAQHNDVKVSQESAMIYEWAPNVGVQVERWAHPCLPSWIKKYELAIWPHYNW*KTRILIMESAIIFRQWWDCCRYWQYV*CVAYTYHDLYHVITELGEVSCWYFIMPKDWQRTRD']]

To do:
1. Build code to test combinations of amino acids coming out
3. Build code to test the "max" files coming out
4. Add on code to test for DEAD wells.

In [17]:
test = np.array([1, 2, 3, 1, 2, 5])
np.unique(test, return_counts=True, return_inverse = True)


(array([1, 2, 3, 5]), array([0, 1, 2, 0, 1, 3]), array([2, 2, 1, 1]))

In [15]:
test_arr = np.random.rand(100, 23, 4)

In [16]:
np.all(test_arr < 0.5, axis = 1).shape

(100, 4)