In [1]:
import pandas as pd
import os
# Workbook holding the dataset table, the inducer definitions and the sequences.
datasetfname = os.path.join("input","datasets.xlsx")
# Root folder of the sequencing runs (relative path); the reads of a run are
# looked up below it as <datapath>/<date_sequenced>/<readsname>.
datapath = os.path.join("central","groups","murray-biocircuits","ashur","nanopore")

readsname = "allreads.fastq"  # reads file name inside each run folder
outname = "simprec.py"        # suffix used when composing the per-run output file name

# Open the workbook once and parse every sheet from the same handle rather than
# re-opening and re-parsing the file for each read_excel call.
with pd.ExcelFile(datasetfname) as workbook:
    df_data = pd.read_excel(workbook,sheet_name="alldata")
    # header=13: the column labels of the "inducers" sheet are taken from row index 13
    # (0-based); the rows above it are not loaded as data.
    df_inducers = pd.read_excel(workbook,sheet_name="inducers",header=13)
    df_seqs = pd.read_excel(workbook,sheet_name="sequences")

#layout
#            plasmid                    genome        
#================================---------------------------
#   [bc>  [u21r><attB]<plasbc]<attL]   <uintf] <rc(bc)]
#<barcode><prefixseq><variable_seq><postfixseq><rc(barcode)>
#<barcode><newplasmidend1><plasbc1><genomechunk><rc(barcode)>



In [2]:
# Display the loaded dataset table (the notebook repr elides the middle rows of a long frame).
df_data

Unnamed: 0,date_sequenced,barcode,genome,int_control,plasmid1,plasmid2,primerf,primerr,c1,c2,c3,c4,see,prefix,variable1,variable2,suffix,note
0,180201,L10R49,B_gen,s22,Bcsoo,,UintF,,"25,[atc2]",,,,NaT,,,,,
1,180201,L10R126,B_gen,s22,Bcsoo,,UintF,,"25,[atc1]",,,,NaT,,,,,
2,180201,L10L10,B_gen,s22,Bcsoo,,UintF,,"25,[atc]",,,,NaT,,,,,
3,180201,R49R49,B_gen,s22,Bcsoo,,UintF,,"25,[]",,,,NaT,,,,,
4,180201,L10R49,B_gen,s22,Bpcpc,,UintF,,"25,[atc2]",,,,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,201212,npbc6,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
178,201212,npbc7,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1,atc,cin]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
179,201212,npbc8,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1,atc,ara]","25,[sal1,atc,cin]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
180,201212,npbc9,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1,atc,ara]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,


In [5]:
seqlist = list(df_data.date_sequenced.unique())

processreads = 1000      # forwarded as processreads= to barcodeSplitAndCountRecords
frontchecklength = 175   # forwarded as frontchecklength= to barcodeSplitAndCountRecords
threshfrac = 0.3         # every detection threshold below is threshfrac * len(reference sequence)

# NOTE(review): barcodeSplitAndCountRecords is not defined or imported in the cells
# visible here -- confirm it is in scope before this cell runs on a fresh kernel.
# NOTE(review): allseqDict / seqstats / unsorted are rebound on every iteration, so
# only the results of the last processed dataset remain after the loop.


def _first_sequence(name):
    """Return the `sequence` value of the first df_seqs row whose `name` equals *name*.

    Raises IndexError when no row matches, exactly like the inline
    ``df_seqs[df_seqs.name==name].sequence.iloc[0]`` lookups it replaces.
    """
    return df_seqs[df_seqs.name==name].sequence.iloc[0]


for seqdataset in seqlist:

    subdf = df_data[df_data.date_sequenced==seqdataset]

    #prefix sequence(s): a cell holds one name or a bracketed, comma separated list of names
    prefixes = list(subdf.prefix.unique())
    prefixseqslist = []
    for prefix in prefixes:
        if(pd.isna(prefix)):
            continue
        # Always strip the brackets and surrounding blanks, so "[a,b]", "[a]",
        # "a, b" and "a" are all resolved to plain sequence names.
        prefixnames = [p.strip() for p in str(prefix).replace("[","").replace("]","").split(",")]
        prefixseqslist += [_first_sequence(p) for p in prefixnames if p]

    #barcode sequences and the condition label of each barcode
    barcodes = list(subdf.barcode.unique())
    bcseqs = {}
    bcnames = []  # never filled in this cell; kept in case other cells read the name
    condlist = []
    for bc in barcodes:
        bcseqs[bc] = _first_sequence(bc)
        #CONDITIONS: c1..c4 of the first row using this barcode, joined with "_"
        #up to (not including) the first empty cell
        bcrows = subdf[subdf.barcode==bc]
        conds = []
        for colname in ("c1","c2","c3","c4"):
            cond = bcrows[colname].iloc[0]
            if(pd.isna(cond)):
                break
            conds += [str(cond)]
        condlist += ["_".join(conds)]

    # Skip datasets whose barcodes have no sequence recorded. Checked before the
    # remaining lookups so a skipped dataset cannot fail on them.
    if(any(pd.isna(seq) for seq in bcseqs.values())):
        continue

    #plasmid barcodes: one entry per distinct (variable1, variable2) pair that occurs
    #in a row. Zipping the two columns' unique() lists would mis-pair or drop entries
    #whenever the columns have a different number of distinct values.
    plasbcs = []
    plasmidpairs = subdf[["variable1","variable2"]].drop_duplicates()
    for plasbc1,plasbc2 in plasmidpairs.itertuples(index=False,name=None):
        if(pd.isna(plasbc1)):
            plasbcs += [[]]
        elif(pd.isna(plasbc2)):
            plasbcs += [[_first_sequence(plasbc1)]]
        else:
            plasbcs += [[_first_sequence(plasbc1),_first_sequence(plasbc2)]]

    #postfix: names without an entry in df_seqs (including empty cells) become ""
    postfix = []
    postfixes = list(subdf.suffix.unique())
    for pfix in postfixes:
        pfixseq = df_seqs[df_seqs.name==pfix].sequence
        postfix += [pfixseq.iloc[0] if len(pfixseq) else ""]

    fastqfilename = os.path.join(datapath,str(seqdataset),readsname)
    print(f"date of seq is {seqdataset}")
    print(f"get reads from {fastqfilename}")
    print(f"prefixseqslist is {prefixseqslist}")
    print(f"plasbcs is {plasbcs}")
    print(f"postfixseq is {postfix}")
    print(f"bcseqs is {bcseqs}")
    print(f"condnames is {condlist}")
    print(f"put output into {os.path.join(datapath,str(seqdataset),str(seqdataset)+'_'+str(outname))}")

    # Reference sequences whose lengths scale the thresholds.
    # bcseqs is keyed by barcode name, so take its first *value* (bcseqs[0] is a KeyError).
    firstbarcode = next(iter(bcseqs.values()))
    # A dataset without prefix / variable sequences gets a zero threshold, the same
    # value an empty postfix ("") already produces for end_threshold.
    firstprefix = prefixseqslist[0] if prefixseqslist else ""
    # plasbcs[0] is a *list* of sequences; the threshold needs the length of a
    # sequence, not the number of sequences in that list.
    firstvariable = plasbcs[0][0] if plasbcs and plasbcs[0] else ""

    allseqDict,seqstats,unsorted=barcodeSplitAndCountRecords(
        fastqfilename,bcseqs,
        barcode_detection_threshold=len(firstbarcode)*threshfrac,
        end_threshold=len(postfix[0])*threshfrac,
        processreads=processreads,
        variable_sequences=plasbcs,
        prefix_sequence=prefixseqslist,
        postfix_sequence=postfix,
        prefix_detection_threshold=len(firstprefix)*threshfrac,
        variable_sequence_threshold=len(firstvariable)*threshfrac,
        frontchecklength=frontchecklength,visualize=False,progressbar = False)



date of seq is 181113
get reads from central\groups\murray-biocircuits\ashur\nanopore\181113\allreads.fastq
prefixseqslist is ['AGGTATGATCCTGACGACGGAGCACGCCGTCGTCGACAAGCC']
plasbcs is [['CTGACAGCTAGCTCAGTCCTAGGTATAATGCTAGC', 'TTTCAATTTAATCATCCGGCTCGTATAATGTGTGGA']]
postfixseq is ['CAAGCCCATTATTACCCTGTTATCCCTAGACACCAATCAGAGGCCACA']
bcseqs is {'npbc1': 'AAGAAAGTTGTCGGTGTCTTTGT', 'npbc2': 'TCGATTCCGTTTGTAGTCGTCTG', 'npbc3': 'GAGTCTTGTGTCCCAGTTACCAG', 'npbc4': 'TTCGGATTCTATCGTGTTTCCCT', 'npbc5': 'CTTGTCCAGGGTTTGTGTAACCT', 'npbc7': 'GTGTTACCGTGGGAATGAATCCT', 'npbc8': 'TTCAGGGAACAAACCAAGTTACG', 'npbc9': 'AACTAGGCACAGCGAGTCTTGGT', 'npbc10': 'AAGCGTTGAAACCTTTGTCCTCT', 'npbc11': 'GTTTCATCTATCGGAGGGAATGG'}
condnames is ['4,[sal1]_4,[sal1]_4,[sal1]_25,[sal1]', '4,[atc1]_4,[atc1]_4,[atc1]_25,[atc1]', '4,[atc1]_4,[atc1]_4,[atc1]_25,[atc1]', '4,[sal1,atc,ara]_4,[sal1,atc,ara]_4,[sal1,atc,ara]_25,[sal1,atc,cin]', '4,[sal,atc1,ara]_4,[sal,atc1,ara]_4,[sal,atc1,ara]_25,[sal,atc1,cin]', '4,[sal1]_4,[sal