In [1]:
#flu selections

In [1]:
import gc
import itertools
import os
import subprocess
import warnings

import matplotlib
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import tqdm
from Bio import SeqIO
from Bio.Seq import Seq

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

In [2]:
# Install a blanket "ignore" filter: every warning raised after this cell runs is suppressed.
# NOTE(review): this hides all warning categories, including deprecation warnings from the
# libraries used below — consider narrowing the filter to the specific warnings that are noisy.
warnings.simplefilter('ignore')

In [10]:
# Load the sample information table.
# Later cells read these columns from it: sample_name, virus_background, replicate,
# selection, inline_fwd, inline_rev, read_1_file and read_2_file.
samples=pd.read_csv('data/samples.csv')
# Bare expression so the notebook renders the table as the cell output.
samples

Unnamed: 0,sample_name,virus_background,replicate,selection,sample_id,Lane,inline_fwd,inline_rev,read_1_file,read_2_file
0,USSR77_R1_NoAb_L1,USSR77,R1,NoAb,1,1,TTGATCG,TACCTGA,data/sequencing_data/NoAb_USSR77_1_S1_L001_R1_...,data/sequencing_data/NoAb_USSR77_1_S1_L001_R2_...
1,USSR77_R1_NoAb_L2,USSR77,R1,NoAb,1,2,TTGATCG,TACCTGA,data/sequencing_data/NoAb_USSR77_1_S1_L002_R1_...,data/sequencing_data/NoAb_USSR77_1_S1_L002_R2_...
2,USSR77_R2_NoAb_L1,USSR77,R2,NoAb,2,1,CATTCTCGAG,TACCTGA,data/sequencing_data/NoAb_USSR77_2_S2_L001_R1_...,data/sequencing_data/NoAb_USSR77_2_S2_L001_R2_...
3,USSR77_R2_NoAb_L2,USSR77,R2,NoAb,2,2,CATTCTCGAG,TACCTGA,data/sequencing_data/NoAb_USSR77_2_S2_L002_R1_...,data/sequencing_data/NoAb_USSR77_2_S2_L002_R2_...
4,SN89_R1_NoAb_L1,SN89,R1,NoAb,3,1,CCTTAGTACTA,TACCTGA,data/sequencing_data/NoAb_SN89_1_S3_L001_R1_00...,data/sequencing_data/NoAb_SN89_1_S3_L001_R2_00...
...,...,...,...,...,...,...,...,...,...,...
123,SYD21_R2_ADI-77470_L2,SYD21,R2,ADI-77470,62,2,GATGTCCT,GAGATCAC,data/sequencing_data/ADI-77470_SYD21_2_S62_L00...,data/sequencing_data/ADI-77470_SYD21_2_S62_L00...
124,SYD21_R1_ADI-86325_L1,SYD21,R1,ADI-86325,63,1,CTCATCCAG,GAGATCAC,data/sequencing_data/ADI-86325_SYD21_1_S63_L00...,data/sequencing_data/ADI-86325_SYD21_1_S63_L00...
125,SYD21_R1_ADI-86325_L2,SYD21,R1,ADI-86325,63,2,CTCATCCAG,GAGATCAC,data/sequencing_data/ADI-86325_SYD21_1_S63_L00...,data/sequencing_data/ADI-86325_SYD21_1_S63_L00...
126,SYD21_R2_ADI-86325_L1,SYD21,R2,ADI-86325,64,1,GACGGAACTC,GAGATCAC,data/sequencing_data/ADI-86325_SYD21_2_S64_L00...,data/sequencing_data/ADI-86325_SYD21_2_S64_L00...


In [6]:
# Run PEAR to merge the paired-end reads of every sample.
# Each sample's merged reads are written under PEAR_OUTPUT_DIR using the sample
# name as the output prefix (the next step reads '<prefix>.assembled.fastq').
PEAR_OUTPUT_DIR = 'data/sequencing_data/pear_output'
PEAR_THREADS = 6

# Make sure the output location exists before the first PEAR call.
os.makedirs(PEAR_OUTPUT_DIR, exist_ok=True)

for idx,row in samples.iterrows():
    sample_name = row['sample_name']
    fwd_read = row['read_1_file']
    rev_read = row['read_2_file']
    # Pass the arguments as a list (no shell), so file names containing spaces or
    # shell metacharacters are handled correctly, and use check=True so that a
    # failed merge raises CalledProcessError instead of being silently ignored.
    subprocess.run(
        [
            'pear',
            '-f', fwd_read,
            '-r', rev_read,
            '-o', f'{PEAR_OUTPUT_DIR}/{sample_name}',
            '-j', str(PEAR_THREADS),
        ],
        check=True,
    )

 ____  _____    _    ____ 
|  _ \| ____|  / \  |  _ \
| |_) |  _|   / _ \ | |_) |
|  __/| |___ / ___ \|  _ <
|_|   |_____/_/   \_\_| \_\

PEAR v0.9.6 [January 15, 2015]

Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593

Forward reads file.................: data/sequencing_data/CH65_ND_SN89_1_S15_L001_R1_001.fastq.gz
Reverse reads file.................: data/sequencing_data/CH65_ND_SN89_1_S15_L001_R2_001.fastq.gz
PHRED..............................: 33
Using empirical frequencies........: YES
Statistical method.................: OES
Maximum assembly length............: 999999
Minimum assembly length............: 50
p-value............................: 0.010000
Quality score threshold (trimming).: 0
Minimum read size after trimming...: 1
Maximal ratio of uncalled bases....: 1.000000
Minimum overlap....................: 10
Scoring method.....................: Scaled score
Threads......

In [15]:
# For each lane-level sample, pull the barcode out of every PEAR-merged read and
# count how many reads support each (barcode, UMI) pair. One CSV is written per sample.
flanks=['AGCCTCCAGTGTAGGATTTGCATTTAA','TGGTGCTTTTGGTCTCCCTGGGG']

UMI_HALF_LENGTH = 8    # bases taken from each end of the read; the two halves are joined into the UMI
BARCODE_LENGTH = 28    # bases kept next to a flank when only one of the two flanks is found
# Read length excluding the two UMI halves and the two inline indices.
# 78 equals len(flanks[0]) + BARCODE_LENGTH + len(flanks[1]) for the flanks above
# (27 + 28 + 23) -- presumably that is its origin; confirm before changing any of them.
INSERT_LENGTH = 78
BARCODE_COUNTS_DIR = 'data/barcode_umi_counts'


def _extract_barcode(seq_str, flanks, barcode_length=BARCODE_LENGTH):
    """Return the barcode of a read, located relative to the two flanking sequences.

    Parameters
    ----------
    seq_str : str
        The merged read.
    flanks : sequence of two str
        Upstream and downstream flanking sequences, in that order.
    barcode_length : int
        Number of bases taken next to the flank when only one flank is present.

    Returns
    -------
    str or None
        The text between the flanks when both are present; otherwise the
        `barcode_length` bases after the upstream flank or before the downstream
        flank. None when neither flank occurs in the read.
    """
    upstream, downstream = flanks
    has_upstream = upstream in seq_str
    has_downstream = downstream in seq_str
    if has_upstream and has_downstream:
        return seq_str.split(upstream)[1].split(downstream)[0]
    if has_upstream:
        return seq_str.split(upstream)[1][:barcode_length]
    if has_downstream:
        return seq_str.split(downstream)[0][-barcode_length:]
    return None


# Make sure the output location exists before the first CSV is written.
os.makedirs(BARCODE_COUNTS_DIR, exist_ok=True)

for index,row in samples.iterrows():
    gc.collect()
    sample_name = row['sample_name']
    virus_background = row['virus_background']
    replicate = row['replicate']
    selection = row['selection']
    inline_fwd = row['inline_fwd']
    inline_rev = row['inline_rev']
    # The reverse inline index is matched as its reverse complement in the merged read.
    inline_rev_rcomp = str(Seq(inline_rev).reverse_complement())

    print(f'Now processing: {sample_name}')

    required_length = INSERT_LENGTH + 2*UMI_HALF_LENGTH + len(inline_fwd) + len(inline_rev)
    fwd_index_end = UMI_HALF_LENGTH + len(inline_fwd)
    rev_index_start = -UMI_HALF_LENGTH - len(inline_rev_rcomp)

    barcode_umi_counts = dict()   # barcode -> {umi -> number of reads}
    reads_without_flank = 0
    for record in tqdm.tqdm(SeqIO.parse(f'data/sequencing_data/pear_output/{sample_name}.assembled.fastq', 'fastq')):
        seq_str = str(record.seq)

        # Keep only reads of the exact expected length, without ambiguous bases, and
        # carrying this sample's inline indices right inside the two UMI halves.
        if len(seq_str) != required_length or 'N' in seq_str:
            continue
        if seq_str[UMI_HALF_LENGTH:fwd_index_end] != inline_fwd:
            continue
        if seq_str[rev_index_start:-UMI_HALF_LENGTH] != inline_rev_rcomp:
            continue

        barcode = _extract_barcode(seq_str, flanks)
        if barcode is None:
            # Neither flank found: there is no barcode to assign. Previously such a
            # read was counted under the barcode of the preceding read (or raised
            # NameError if it was the first read), so skip it explicitly.
            reads_without_flank += 1
            continue

        UMI_merged = seq_str[:UMI_HALF_LENGTH]+seq_str[-UMI_HALF_LENGTH:]
        umi_counts = barcode_umi_counts.setdefault(barcode, dict())
        umi_counts[UMI_merged] = umi_counts.get(UMI_merged, 0) + 1

    # Flatten the nested counts into one row per (barcode, UMI) pair.
    count_rows = [
        (barcode, umi, count)
        for barcode, umis in barcode_umi_counts.items()
        for umi, count in umis.items()
    ]
    data = pd.DataFrame(count_rows, columns=['barcode', 'umi', 'count'])
    # 'sample' deliberately omits the lane suffix so both lanes of a sample share one label.
    data['sample'] = virus_background+'_'+replicate+'_'+selection
    data['virus_background'] = virus_background
    data['replicate'] = replicate
    data['selection'] = selection

    print("Dataframe length:",len(data))
    if reads_without_flank:
        print(f'Reads skipped (neither flank found): {reads_without_flank}')

    # The row index is still written as an unnamed first column, so the file layout is unchanged.
    data.to_csv(f'{BARCODE_COUNTS_DIR}/{sample_name}_barcodes_umis.csv')

Now processing: USSR77_R1_NoAb_L1


6829711it [01:12, 94400.97it/s]


Dataframe length: 5716580
Now processing: USSR77_R1_NoAb_L2


6865796it [01:14, 92479.21it/s]


Dataframe length: 5772280
Now processing: USSR77_R2_NoAb_L1


6893867it [01:11, 96949.93it/s] 


Dataframe length: 3741073
Now processing: USSR77_R2_NoAb_L2


6973592it [01:11, 97704.97it/s] 


Dataframe length: 3797718
Now processing: SN89_R1_NoAb_L1


5363075it [00:57, 93582.52it/s]


Dataframe length: 4289571
Now processing: SN89_R1_NoAb_L2


5380523it [00:58, 92415.31it/s]


Dataframe length: 4321011
Now processing: SN89_R2_NoAb_L1


6191720it [01:07, 91712.94it/s]


Dataframe length: 5264238
Now processing: SN89_R2_NoAb_L2


6250583it [01:08, 91777.82it/s]


Dataframe length: 5335548
Now processing: SI06_R1_NoAb_L1


5959159it [01:03, 94492.79it/s]


Dataframe length: 4936910
Now processing: SI06_R1_NoAb_L2


5982324it [01:01, 96598.93it/s]


Dataframe length: 4977269
Now processing: SI06_R2_NoAb_L1


6686829it [01:10, 95384.77it/s]


Dataframe length: 5539093
Now processing: SI06_R2_NoAb_L2


6718675it [01:11, 94581.40it/s]


Dataframe length: 5590832
Now processing: SYD21_R1_NoAb_L1


6324477it [01:06, 95085.40it/s]


Dataframe length: 5047907
Now processing: SYD21_R1_NoAb_L2


6352577it [01:06, 94916.21it/s]


Dataframe length: 5088167
Now processing: SYD21_R2_NoAb_L1


9017132it [01:36, 93707.06it/s]


Dataframe length: 6914971
Now processing: SYD21_R2_NoAb_L2


9069137it [01:37, 93053.43it/s]


Dataframe length: 6984440
Now processing: SN89_R1_UCA860_L1


5442648it [00:57, 93924.92it/s]


Dataframe length: 4609688
Now processing: SN89_R1_UCA860_L2


5462779it [00:58, 93861.92it/s]


Dataframe length: 4646767
Now processing: SN89_R2_UCA860_L1


5715432it [00:57, 99377.49it/s] 


Dataframe length: 3160306
Now processing: SN89_R2_UCA860_L2


5755300it [00:57, 99546.57it/s] 


Dataframe length: 3192506
Now processing: SN89_R1_CH65_L1


4748761it [00:51, 92480.87it/s]


Dataframe length: 3667962
Now processing: SN89_R1_CH65_L2


4789123it [00:51, 92605.18it/s]


Dataframe length: 3711900
Now processing: SN89_R2_CH65_L1


4740220it [00:51, 91663.75it/s]


Dataframe length: 3797294
Now processing: SN89_R2_CH65_L2


4758066it [00:52, 90942.08it/s]


Dataframe length: 3826919
Now processing: SI06_R1_CH65_L1


5774210it [01:01, 94361.70it/s]


Dataframe length: 4848301
Now processing: SI06_R1_CH65_L2


5806877it [01:01, 94625.91it/s]


Dataframe length: 4896494
Now processing: SI06_R2_CH65_L1


6569161it [01:09, 94161.95it/s]


Dataframe length: 5541981
Now processing: SI06_R2_CH65_L2


6625470it [01:10, 94232.17it/s]


Dataframe length: 5614149
Now processing: SN89_R1_CH65_ND_L1


6002088it [01:04, 93646.00it/s]


Dataframe length: 4890008
Now processing: SN89_R1_CH65_ND_L2


6032240it [01:04, 93799.49it/s]


Dataframe length: 4934114
Now processing: SN89_R2_CH65_ND_L1


7928128it [01:24, 93513.54it/s]


Dataframe length: 5985016
Now processing: SN89_R2_CH65_ND_L2


8005879it [01:25, 93804.71it/s]


Dataframe length: 6066835
Now processing: SN89_R1_UCA652_L1


5454224it [00:57, 94209.85it/s]


Dataframe length: 4673963
Now processing: SN89_R1_UCA652_L2


5482993it [00:58, 94236.64it/s]


Dataframe length: 4720381
Now processing: SN89_R2_UCA652_L1


5646721it [00:56, 99308.95it/s] 


Dataframe length: 3125693
Now processing: SN89_R2_UCA652_L2


5679184it [00:57, 99395.82it/s] 


Dataframe length: 3159988
Now processing: SN89_R1_H2227_L1


5145290it [00:55, 93183.85it/s]


Dataframe length: 3964065
Now processing: SN89_R1_H2227_L2


5165928it [00:55, 93121.64it/s]


Dataframe length: 3995238
Now processing: SN89_R2_H2227_L1


8621236it [01:33, 92329.04it/s]


Dataframe length: 6469882
Now processing: SN89_R2_H2227_L2


8663115it [01:33, 92348.90it/s]


Dataframe length: 6532213
Now processing: SI06_R1_H2227_L1


7652160it [01:20, 94729.61it/s]


Dataframe length: 6362185
Now processing: SI06_R1_H2227_L2


7696773it [01:22, 93468.78it/s]


Dataframe length: 6428090
Now processing: SI06_R2_H2227_L1


4748896it [00:50, 93545.33it/s]


Dataframe length: 4064486
Now processing: SI06_R2_H2227_L2


4762824it [00:50, 93894.30it/s]


Dataframe length: 4096242
Now processing: SN89_R1_H1244_L1


5281896it [00:56, 93571.83it/s]


Dataframe length: 4316676
Now processing: SN89_R1_H1244_L2


5292322it [00:56, 93182.10it/s]


Dataframe length: 4342251
Now processing: SN89_R2_H1244_L1


5667215it [01:01, 92247.46it/s]


Dataframe length: 4605143
Now processing: SN89_R2_H1244_L2


5727304it [01:01, 93223.85it/s]


Dataframe length: 4668405
Now processing: SI06_R1_H1244_L1


5320321it [00:57, 93295.27it/s]


Dataframe length: 4327863
Now processing: SI06_R1_H1244_L2


5403818it [00:57, 93364.48it/s]


Dataframe length: 4408766
Now processing: SI06_R2_H1244_L1


6715780it [01:08, 98269.94it/s] 


Dataframe length: 3013484
Now processing: SI06_R2_H1244_L2


6750315it [01:08, 98702.16it/s] 


Dataframe length: 3038971
Now processing: SN89_R1_H1825_L1


4073207it [00:44, 92117.05it/s]


Dataframe length: 3368124
Now processing: SN89_R1_H1825_L2


4086150it [00:44, 92420.01it/s]


Dataframe length: 3393208
Now processing: SN89_R2_H1825_L1


5396632it [00:59, 89951.46it/s]


Dataframe length: 4363013
Now processing: SN89_R2_H1825_L2


5436393it [01:00, 89833.06it/s]


Dataframe length: 4413163
Now processing: SN89_R1_H679_L1


5316978it [00:58, 91443.42it/s]


Dataframe length: 4257276
Now processing: SN89_R1_H679_L2


5361757it [00:58, 90999.43it/s]


Dataframe length: 4307507
Now processing: SN89_R2_H679_L1


5208836it [00:56, 92409.37it/s]


Dataframe length: 4347551
Now processing: SN89_R2_H679_L2


5248074it [00:55, 94084.25it/s]


Dataframe length: 4396869
Now processing: SN89_R1_UCA641_L1


5312491it [00:57, 92630.10it/s]


Dataframe length: 3704659
Now processing: SN89_R1_UCA641_L2


5352838it [00:58, 92287.92it/s]


Dataframe length: 3739609
Now processing: SN89_R2_UCA641_L1


5533990it [00:59, 92441.00it/s]


Dataframe length: 4431491
Now processing: SN89_R2_UCA641_L2


5573219it [01:00, 92623.17it/s]


Dataframe length: 4480400
Now processing: SN89_R1_H1270_L1


6109262it [01:05, 93069.89it/s]


Dataframe length: 5043337
Now processing: SN89_R1_H1270_L2


6180978it [01:06, 93592.12it/s]


Dataframe length: 5122235
Now processing: SN89_R2_H1270_L1


4945759it [00:50, 98683.60it/s] 


Dataframe length: 2626099
Now processing: SN89_R2_H1270_L2


4968712it [00:50, 98851.58it/s] 


Dataframe length: 2646613
Now processing: SN89_R1_H1261_L1


5888334it [01:04, 91316.44it/s]


Dataframe length: 4601215
Now processing: SN89_R1_H1261_L2


5951506it [01:04, 92405.76it/s]


Dataframe length: 4665157
Now processing: SN89_R2_H1261_L1


5713722it [01:02, 91088.67it/s]


Dataframe length: 4758045
Now processing: SN89_R2_H1261_L2


5744285it [01:02, 91231.07it/s]


Dataframe length: 4803573
Now processing: USSR77_R1_Ab6649_L1


6252200it [01:06, 93399.38it/s]


Dataframe length: 5058695
Now processing: USSR77_R1_Ab6649_L2


6288389it [01:07, 93482.80it/s]


Dataframe length: 5109486
Now processing: USSR77_R2_Ab6649_L1


7733540it [01:22, 93452.80it/s]


Dataframe length: 6161003
Now processing: USSR77_R2_Ab6649_L2


7756393it [01:22, 93544.87it/s]


Dataframe length: 6204357
Now processing: SN89_R1_Ab6649_L1


5719740it [01:01, 92685.44it/s]


Dataframe length: 4339359
Now processing: SN89_R1_Ab6649_L2


5758962it [01:02, 92344.71it/s]


Dataframe length: 4382908
Now processing: SN89_R2_Ab6649_L1


13286368it [02:23, 92334.94it/s]


Dataframe length: 10605744
Now processing: SN89_R2_Ab6649_L2


13414151it [02:26, 91629.30it/s]


Dataframe length: 10734577
Now processing: SI06_R1_Ab6649_L1


3371733it [00:35, 93704.64it/s]


Dataframe length: 2849700
Now processing: SI06_R1_Ab6649_L2


3381691it [00:35, 94262.85it/s]


Dataframe length: 2873501
Now processing: SI06_R2_Ab6649_L1


4823524it [00:48, 99413.38it/s] 


Dataframe length: 2528762
Now processing: SI06_R2_Ab6649_L2


4858615it [00:49, 98735.66it/s] 


Dataframe length: 2558441
Now processing: USSR77_R1_Ab9207_L1


6974673it [01:15, 92714.22it/s]


Dataframe length: 5404596
Now processing: USSR77_R1_Ab9207_L2


6994344it [01:15, 92601.78it/s]


Dataframe length: 5450164
Now processing: USSR77_R2_Ab9207_L1


5538338it [01:01, 90619.82it/s]


Dataframe length: 4498920
Now processing: USSR77_R2_Ab9207_L2


5572271it [01:00, 91557.48it/s]


Dataframe length: 4547724
Now processing: SN89_R1_Ab9207_L1


5275843it [00:56, 92759.80it/s]


Dataframe length: 4227281
Now processing: SN89_R1_Ab9207_L2


5305958it [00:57, 92805.70it/s]


Dataframe length: 4265773
Now processing: SN89_R2_Ab9207_L1


7089440it [01:16, 92084.11it/s]


Dataframe length: 5519223
Now processing: SN89_R2_Ab9207_L2


7124155it [01:17, 92051.47it/s]


Dataframe length: 5569811
Now processing: USSR77_R1_ADI-77474_L1


5592721it [01:00, 92315.56it/s]


Dataframe length: 4639083
Now processing: USSR77_R1_ADI-77474_L2


5628022it [01:00, 92527.44it/s]


Dataframe length: 4684028
Now processing: USSR77_R2_ADI-77474_L1


5788575it [01:02, 92955.58it/s]


Dataframe length: 4630945
Now processing: USSR77_R2_ADI-77474_L2


5824113it [01:02, 93089.33it/s]


Dataframe length: 4684412
Now processing: SN89_R1_ADI-77474_L1


4071592it [00:42, 95956.04it/s]


Dataframe length: 3494844
Now processing: SN89_R1_ADI-77474_L2


4097011it [00:43, 95222.25it/s]


Dataframe length: 3534365
Now processing: SN89_R2_ADI-77474_L1


6336946it [01:03, 100451.05it/s]


Dataframe length: 3434296
Now processing: SN89_R2_ADI-77474_L2


6370496it [01:03, 99997.22it/s] 


Dataframe length: 3469086
Now processing: SI06_R1_ADI-77474_L1


5882002it [01:02, 93438.48it/s]


Dataframe length: 4860966
Now processing: SI06_R1_ADI-77474_L2


5900433it [01:02, 94297.27it/s]


Dataframe length: 4898020
Now processing: SI06_R2_ADI-77474_L1


5887700it [01:03, 93067.44it/s]


Dataframe length: 4902028
Now processing: SI06_R2_ADI-77474_L2


5927930it [01:03, 93757.42it/s]


Dataframe length: 4960927
Now processing: SYD21_R1_ADI-77474_L1


7954015it [01:21, 97980.56it/s] 


Dataframe length: 5884302
Now processing: SYD21_R1_ADI-77474_L2


7988882it [01:21, 98055.24it/s] 


Dataframe length: 5931639
Now processing: SYD21_R2_ADI-77474_L1


6341754it [01:06, 94916.41it/s]


Dataframe length: 5104510
Now processing: SYD21_R2_ADI-77474_L2


6369081it [01:07, 93992.53it/s]


Dataframe length: 5145593
Now processing: USSR77_R1_ADI-77470_L1


6570478it [01:09, 94249.84it/s]


Dataframe length: 5585965
Now processing: USSR77_R1_ADI-77470_L2


6596832it [01:09, 94497.66it/s]


Dataframe length: 5632156
Now processing: USSR77_R2_ADI-77470_L1


5672853it [00:59, 94738.23it/s]


Dataframe length: 4632751
Now processing: USSR77_R2_ADI-77470_L2


5735511it [01:00, 94809.41it/s]


Dataframe length: 4700675
Now processing: SN89_R1_ADI-77470_L1


5423316it [00:57, 95068.33it/s]


Dataframe length: 4646973
Now processing: SN89_R1_ADI-77470_L2


5478454it [00:57, 95017.00it/s]


Dataframe length: 4712159
Now processing: SN89_R2_ADI-77470_L1


4614421it [00:45, 100576.83it/s]


Dataframe length: 2798114
Now processing: SN89_R2_ADI-77470_L2


4466699it [00:44, 99729.76it/s] 


Dataframe length: 2724464
Now processing: SI06_R1_ADI-77470_L1


5751655it [01:01, 93532.95it/s]


Dataframe length: 4909301
Now processing: SI06_R1_ADI-77470_L2


5896131it [01:02, 93675.15it/s]


Dataframe length: 5043447
Now processing: SI06_R2_ADI-77470_L1


5839859it [01:02, 93306.28it/s]


Dataframe length: 4761358
Now processing: SI06_R2_ADI-77470_L2


5896883it [01:03, 92841.40it/s]


Dataframe length: 4823373
Now processing: SYD21_R1_ADI-77470_L1


8267071it [01:26, 95287.31it/s]


Dataframe length: 6832979
Now processing: SYD21_R1_ADI-77470_L2


8315856it [01:26, 95677.23it/s]


Dataframe length: 6902436
Now processing: SYD21_R2_ADI-77470_L1


5748929it [01:00, 95049.25it/s]


Dataframe length: 4905826
Now processing: SYD21_R2_ADI-77470_L2


5771594it [01:00, 95320.72it/s]


Dataframe length: 4944541
Now processing: SYD21_R1_ADI-86325_L1


7947295it [01:24, 94474.73it/s]


Dataframe length: 6345235
Now processing: SYD21_R1_ADI-86325_L2


7996145it [01:24, 94403.06it/s]


Dataframe length: 6416084
Now processing: SYD21_R2_ADI-86325_L1


4647522it [00:49, 93800.52it/s]


Dataframe length: 3906486
Now processing: SYD21_R2_ADI-86325_L2


4664855it [00:49, 93551.79it/s]


Dataframe length: 3941819


In [16]:
# Build the list of unique sample names with the trailing 3-character lane
# suffix (e.g. '_L1') removed, keeping the order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order.
sample_list = list(dict.fromkeys(
    lane_sample_name[:-3] for lane_sample_name in samples['sample_name']
))

In [17]:
# For each sample, merge the (barcode, UMI) tables of both sequencing lanes, keep
# each barcode/UMI combination once, and count the distinct UMIs per barcode.
BARCODE_COUNTS_DIR = 'data/barcode_umi_counts'
# Set to True to also aggregate samples for which only one lane file exists.
# With the default (False) such samples are skipped, as before.
ALLOW_SINGLE_LANE = False

group_columns = ['virus_background','replicate','selection','sample','barcode']
# List the directory once instead of on every iteration.
available_files = set(os.listdir(BARCODE_COUNTS_DIR))

per_sample_counts = []
skipped_samples = []

for f in tqdm.tqdm(sample_list):
    gc.collect()
    lane_files = [f+'_L1_barcodes_umis.csv', f+'_L2_barcodes_umis.csv']
    # Check every lane file explicitly. The previous test
    # `file_lane1 and file_lane2 in os.listdir(...)` only looked for lane 2,
    # because a non-empty string is always truthy.
    present_files = [name for name in lane_files if name in available_files]

    all_lanes_present = len(present_files) == len(lane_files)
    if not all_lanes_present and not (ALLOW_SINGLE_LANE and present_files):
        skipped_samples.append(f)
        continue

    temp_df = pd.concat([pd.read_csv(f'{BARCODE_COUNTS_DIR}/{name}') for name in present_files])
    temp_df['barcode_umi']=temp_df['barcode']+'_'+temp_df['umi']
    # A barcode/UMI pair seen in both lanes is counted only once.
    temp_df = temp_df.drop_duplicates(subset=['barcode_umi','sample'],keep='first')
    # After de-duplication every row is one distinct UMI, so summing ones per
    # barcode gives the number of distinct UMIs observed for that barcode.
    temp_df_agg = (
        temp_df.assign(bc_count=1)
        .groupby(group_columns)
        .agg({'bc_count':'sum'})
        .reset_index()
    )
    per_sample_counts.append(temp_df_agg)

# Concatenate once at the end rather than growing the frame inside the loop.
if per_sample_counts:
    all_data_df = pd.concat(per_sample_counts, ignore_index=True)
else:
    all_data_df = pd.DataFrame()

if skipped_samples:
    print('Skipped samples with missing lane files:', skipped_samples)

all_data_df.to_csv('data/all_data_df.csv',index=False)
all_data_df

100%|██████████| 64/64 [16:55<00:00, 15.87s/it]


Unnamed: 0,virus_background,replicate,selection,sample,barcode,bc_count
0,USSR77,R1,NoAb,USSR77_R1_NoAb,AAAAAAAGCTCGAGCTTGGCCGTACAGT,3
1,USSR77,R1,NoAb,USSR77_R1_NoAb,AAAAACGGCGATAGCTGAAAAATTCACG,1
2,USSR77,R1,NoAb,USSR77_R1_NoAb,AAAACATGCCATAGCTGAAAACGATACT,1
3,USSR77,R1,NoAb,USSR77_R1_NoAb,ACAAAAAGCTCGAGCTACTGCAGGCGCT,2
4,USSR77,R1,NoAb,USSR77_R1_NoAb,ACAAAAAGCTCGAGCTATTAACAAGTCC,1
...,...,...,...,...,...,...
2755098,SYD21,R2,ADI-86325,SYD21_R2_ADI-86325,TTGTGTTGACATAGCTCCTGATCAGGTC,1
2755099,SYD21,R2,ADI-86325,SYD21_R2_ADI-86325,TTGTGTTGCTCAAGCTCGAAGTTTAGAG,1
2755100,SYD21,R2,ADI-86325,SYD21_R2_ADI-86325,TTGTGTTGCTCAAGCTGCGGTTACCGGC,3
2755101,SYD21,R2,ADI-86325,SYD21_R2_ADI-86325,TTGTGTTGCTCAAGCTGTATGTCGGCTT,1
