In [27]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob

In [2]:
# Load reference sequence
refseq_df = pd.read_csv("./TestData/20200205_ssSeq/RefSeqs.csv")

# Convert all sequences to uppercase
anycase_refseqs = refseq_df.ReferenceSequence.values.tolist()
uppercase_refseqs = [refseq.upper() for refseq in anycase_refseqs]
refseq_df["ReferenceSequence"] = uppercase_refseqs

In [3]:
def log_error(e):
    print(e)

def find_codons_variable_positions(refseq, inframe_bp):
    
    # Get the number of codons and number of variable positions
    n_codons = refseq.count("NNN")
    n_positions = refseq.count("N")
    
    # Check to be sure the number of positions is divisible by 3 and divides
    # to the number of codons
    if (n_positions % 3) != 0:
        log_error("Must enter `N` in groups of 3 to signify codon.")
    assert (n_positions / 3) == n_codons, "Mismatch in number of codons and number of variable positions."

    # If no codons are found, return an empty array
    if n_codons == 0:
        return np.array([], dtype = int)
    
    # Split the reference sequence
    split_seqs = refseq.split("NNN")

    # Loop over the splits and make checks
    variable_positions = []
    variable_counter = 0
    for frag_ind, fragment in enumerate(split_seqs[:-1]):

        # Make sure fragments other than the first are divisible by 3, if not, then
        # the sections are not in frame
        frag_len = len(fragment)
        if (frag_ind > 0) and (not (frag_len % 3 == 0)):
            log_error("Specified variable positions are not in the same reading frame.")

        # Extend the variable positions. It will be extended by 
        # the fragment length
        variable_counter += frag_len
        for _ in range(3):
            variable_positions.append(variable_counter)
            variable_counter += 1

    # Convert to an array. Make sure everything is sorted.
    variable_positions = np.array(variable_positions)
    variable_positions.sort()

    # Make sure that all variable positions are "N" and that we found the appropriate
    # number of codons
    assert len(variable_positions) == n_positions, "Missing variable positions"
    assert all(refseq[pos] == "N" for pos in variable_positions), "Error in variable position calculation"
        
    # Validate that the provided in frame bp is correct
    if (variable_positions[0] % 3) != (inframe_bp - 1):
        log_error("The provided in frame base does not match the frame of the provided variable codons.")
    
    return variable_positions

In [4]:
test = {"!": "A"}

('!', 'A')

In [9]:
import gzip
import shutil

In [12]:
with gzip.open("./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq.gz", 'rb') as f_in:
    with open('./TestData/20200205_ssSeq/TestUnzip.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [17]:
test = "./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq.gz"
os.path.splitext(test)

('./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq', '.gz')

In [18]:
test_df = pd.read_csv("./OutputCounts/Combos_Coupled_Max.csv")

In [19]:
test_df

Unnamed: 0,IndexPlate,Plate,Well,VariantCombo,VariantsFound,AlignmentFrequency,WellSeqDepth,VariantSequence
0,DI01,Plate01,A01,?17P_?18P_?25S_?28S,4,0.967213,295,ALQKHSVAISATMGRLPPERYPETSSLSELPERQIHKLASALLAYARS
1,DI01,Plate01,A02,?17P_?18P_?25F_?28D,4,0.983491,417,ALQKHSVAISATMGRLPPERYPETFSLDELPERQIHKLASALLAYARS
2,DI01,Plate01,A03,?17P_?18P_?25Y_?28A,4,0.972112,244,ALQKHSVAISATMGRLPPERYPETYSLAELPERQIHKLASALLAYARS
3,DI01,Plate01,A04,?17P_?18P_?25S_?28C,4,0.990291,408,ALQKHSVAISATMGRLPPERYPETSSLCELPERQIHKLASALLAYARS
4,DI01,Plate01,A05,H5H_?17C_?18P_?25G_?28C,5,0.285714,2,ALQKHSVAISATMGRLCPERYPETGSLCELPERQIHKLASALLAYARS
...,...,...,...,...,...,...,...,...
472,DI05,Plate05,H08,?17W_?18G_?25G_?28W,4,0.983558,658,ALQKHSVAISATMGRLWGERYPETGSLWELPERQIHKLASALLAYARS
473,DI05,Plate05,H09,?17W_?18C_?25A_?28W,4,0.974779,773,ALQKHSVAISATMGRLWCERYPETASLWELPERQIHKLASALLAYARS
474,DI05,Plate05,H10,?17G_?18C_?25D_?28W,4,0.986239,645,ALQKHSVAISATMGRLGCERYPETDSLWELPERQIHKLASALLAYARS
475,DI05,Plate05,H11,?17W_?18C_?25H_?28G,4,0.983306,589,ALQKHSVAISATMGRLWCERYPETHSLGELPERQIHKLASALLAYARS


In [25]:
# Split on plate
unique_plates = test_df.Plate.unique()

# Loop over dataframes
for unique_plate in unique_plates:
    df = test_df.loc[test_df.Plate == unique_plate].copy()

In [26]:
df

Unnamed: 0,IndexPlate,Plate,Well,VariantCombo,VariantsFound,AlignmentFrequency,WellSeqDepth,VariantSequence
382,DI05,Plate05,A01,?17W_?18C_?25S_?28G,4,0.977883,619,ALQKHSVAISATMGRLWCERYPETSSLGELPERQIHKLASALLAYARS
383,DI05,Plate05,A02,?17G_?18C_?25C_?28W_A46A,5,0.405010,194,ALQKHSVAISATMGRLGCERYPETCSLWELPERQIHKLASALLAYARS
384,DI05,Plate05,A03,?17G_?18G_?25A_?28W,4,0.992593,402,ALQKHSVAISATMGRLGGERYPETASLWELPERQIHKLASALLAYARS
385,DI05,Plate05,A04,?17W_?18G_?25C_?28W,4,0.976996,722,ALQKHSVAISATMGRLWGERYPETCSLWELPERQIHKLASALLAYARS
386,DI05,Plate05,A05,?17W_?18G_?25H_?28G,4,0.968807,528,ALQKHSVAISATMGRLWGERYPETHSLGELPERQIHKLASALLAYARS
...,...,...,...,...,...,...,...,...
472,DI05,Plate05,H08,?17W_?18G_?25G_?28W,4,0.983558,658,ALQKHSVAISATMGRLWGERYPETGSLWELPERQIHKLASALLAYARS
473,DI05,Plate05,H09,?17W_?18C_?25A_?28W,4,0.974779,773,ALQKHSVAISATMGRLWCERYPETASLWELPERQIHKLASALLAYARS
474,DI05,Plate05,H10,?17G_?18C_?25D_?28W,4,0.986239,645,ALQKHSVAISATMGRLGCERYPETDSLWELPERQIHKLASALLAYARS
475,DI05,Plate05,H11,?17W_?18C_?25H_?28G,4,0.983306,589,ALQKHSVAISATMGRLWCERYPETHSLGELPERQIHKLASALLAYARS


In [29]:
glob("./TestData/20200205_ssSeq/*fastq*")

['./TestData/20200205_ssSeq/SHORT_CHL2_S199_L001_R1_001.fastq',
 './TestData/20200205_ssSeq/CHL2_S199_L001_R2_001.fastq',
 './TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq',
 './TestData/20200205_ssSeq/SHORT_CHL2_S199_L001_R2_001.fastq',
 './TestData/20200205_ssSeq/CHL2_S199_L001_I1_001.fastq']