In [1]:
import pandas as pd
import os
# --- input locations -------------------------------------------------------
# Relative path of the spreadsheet that describes the datasets.
datasetfname = os.path.join("input", "datasets.xlsx")
# Path components of the nanopore data directory (no leading separator here).
datapath = os.path.join(
    "central", "groups", "murray-biocircuits", "ashur", "nanopore"
)

# --- file names ------------------------------------------------------------
readsname = "allreads.fastq"
outname = "simprec.py"

# --- spreadsheet contents --------------------------------------------------
# Open the workbook once and parse each sheet from the same handle.
with pd.ExcelFile(datasetfname) as workbook:
    # date_sequenced is kept as text so it is not converted to a number/date.
    df_data = workbook.parse(
        sheet_name="alldata", converters={"date_sequenced": str}
    )
    # The column names of the "inducers" sheet are on row index 13 (0-based).
    df_inducers = workbook.parse(sheet_name="inducers", header=13)
    df_seqs = workbook.parse(sheet_name="sequences")

#layout
#            plasmid                    genome        
#================================---------------------------
#   [bc>  [u21r><attB]<plasbc]<attL]   <uintf] <rc(bc)]
#<barcode><prefixseq><variable_seq><postfixseq><rc(barcode)>
#<barcode><newplasmidend1><plasbc1><genomechunk><rc(barcode)>



In [2]:
# Display the "alldata" sheet loaded above (one row per dataset/barcode entry).
df_data

Unnamed: 0,date_sequenced,barcode,genome,int_control,plasmid1,plasmid2,primerf,primerr,c1,c2,c3,c4,see,prefix,variable1,variable2,suffix,note
0,180201,L10R49,B_gen,s22,Bcsoo,,UintF,,"25,[atc2]",,,,NaT,,,,,
1,180201,L10R126,B_gen,s22,Bcsoo,,UintF,,"25,[atc1]",,,,NaT,,,,,
2,180201,L10L10,B_gen,s22,Bcsoo,,UintF,,"25,[atc]",,,,NaT,,,,,
3,180201,R49R49,B_gen,s22,Bcsoo,,UintF,,"25,[]",,,,NaT,,,,,
4,180201,L10R49,B_gen,s22,Bpcpc,,UintF,,"25,[atc2]",,,,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,201212,npbc6,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
178,201212,npbc7,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1,atc,cin]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
179,201212,npbc8,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1,atc,ara]","25,[sal1,atc,cin]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,
180,201212,npbc9,,REC49,i70,i71-2,UintF,chlorR,"5,[sal1]","25,[sal1,atc,ara]",,,2020-12-11,"[newplasmend2,newplasmend1]",plasbc1,plasbc2,genomechunk,


In [None]:
import pickle

# Load the saved statistics table and annotate every dataset with the primers
# listed for it in df_data.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open files that come from a trusted location.
statsfile = os.path.join("/",datapath,"alldata.pickle")
with open(statsfile,'rb') as saved_file:
    statsdf = pickle.load(saved_file)
# df_data.date_sequenced is read as text, so compare against text here as well.
statsdf['dataname'] = statsdf.dataname.astype('str')


def _join_unique(values):
    """Return the distinct non-missing entries of ``values`` joined by '_'.

    Missing entries (NaN from empty spreadsheet cells) are skipped; joining
    them directly would raise ``TypeError`` because they are floats, not str.
    An all-missing or empty column yields the empty string.
    """
    return '_'.join(str(value) for value in values.dropna().unique())


# Visit each dataset name once, even if it appears on several rows of statsdf.
for dataname in statsdf.dataname.unique():
    seq_rows = df_data[df_data.date_sequenced==dataname]
    in_dataset = statsdf.dataname==dataname
    for primer_col in ("primerf","primerr"):
        statsdf.loc[in_dataset,primer_col] = _join_unique(seq_rows[primer_col])
statsdf

In [None]:
import matplotlib.pyplot as plt


# Stacked bar chart: for every dataset, the "forward", "reverse" and "unknown"
# columns of statsdf are drawn on top of each other.
fig, ax = plt.subplots()

# Running height of the stack; each series starts where the previous one ended.
stack_bottom = 0
for column in ("forward", "reverse", "unknown"):
    # The legend label is the column name, so it cannot drift from the data
    # (the previous hand-written label for the first series was misspelled).
    ax.bar(statsdf.dataname, height=statsdf[column], bottom=stack_bottom,
           align="center", label=column)
    stack_bottom = stack_bottom + statsdf[column]

# Dataset names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=80)
ax.legend()
plt.show()

In [None]:
# Build, for every barcode of every dataset, a table of induction events
# (cumulative time, inducer name, inducer concentration) from the c1..c4
# columns of df_data. Each condition cell looks like "25,[sal1,atc]":
# a duration followed by a bracketed, comma separated list of inducers.
INDUCTION_COLUMNS = ["time","inducer","inducer_conc"]

datasets = list(df_data.date_sequenced.unique())

for dataset in datasets:
    print("data = "+dataset)
    df_dataset = df_data[df_data.date_sequenced==dataset]
    barcodes = list(df_dataset.barcode.unique())
    for bc in barcodes:
        df_bc = df_dataset[df_dataset.barcode==bc]
        # Rows are collected in a plain list and turned into a DataFrame once;
        # DataFrame.append no longer exists in pandas >= 2.0.
        induction_rows = []
        time_acc = 0
        for condition in range(1,5):
            cond_data = df_bc["c"+str(condition)].iloc[0]
            if pd.isna(cond_data):
                # conditions are filled left to right; the first empty one ends the list
                break
            duration_str, inducer_str = cond_data.split(",",maxsplit=1)
            inducers = inducer_str.replace("[","").replace("]","").split(",")
            time_acc += int(duration_str)
            print([duration_str, inducers])
            for inducer in inducers:
                if inducer=="":
                    #this means we induced with nothing
                    induction_rows.append([time_acc,"none",0])
                    continue

                #this part goes through the inducer df to get the right inducer concentration
                print("getting concentration")
                inducer_rows = df_inducers[df_inducers.inducername == inducer]
                print(inducer_rows)
                # df_inducers is indexed by column with the integer form of the
                # dataset name (the part before any "_").
                date_col = int(dataset.split("_")[0])
                print(date_col)
                ind_concentration = inducer_rows[date_col].iloc[0]
                print(ind_concentration)

                inducername = inducer
                if inducer[-1] in '1234567890':
                    #if the inducer has a number at the end of it, take that off for the inducer's name
                    inducername = inducer[:-1]
                induction_rows.append([time_acc,inducername,ind_concentration])

        # One row per induction event for this barcode (empty if c1 is blank).
        induction_df = pd.DataFrame(induction_rows,columns=INDUCTION_COLUMNS)

    print("barcodes = "+str(barcodes))
