Decrease the "chunk_size" variable in duplex_caller before running these tests, to ensure the flushing system works properly.

In [None]:
import pysam
from collections import defaultdict

In [None]:
# Export the group_umis notebook to a plain Python script so the cells below
# can run it from the command line (the next line expects the result at
# ../../group_umis.py).
!jupyter nbconvert --to script ../../group_umis.ipynb
# Smoke-check the exported script by invoking it with -h
# (presumably prints the command-line help — confirm).
!python3.7 ../../group_umis.py -h

In [None]:
def test(exp_file, act_file):
    """Check the UMI tags and UMI groupings of a BAM file against a table.

    Args:
        exp_file: Path to a tab-separated text file with one read per line:
            column 0 is the read name, column 1 the expected group id and
            column 2 the expected UMI.  Blank lines are ignored.
        act_file: Path to the BAM file to check.  Every read must carry an
            ``RX`` tag (compared with the expected UMI) and a ``UG`` tag
            (used as the group id).

    Problems are reported by printing ``ERROR: ...`` lines rather than by
    raising; the received groups are always printed.  Returns ``None``.

    Raises:
        AssertionError: If a reverse read is seen whose name has not already
            been recorded (by a non-reverse read) in the group named by its
            ``UG`` tag.
    """
    # Load the expected UMI per read and the expected read groupings.
    true_umis = {}
    exp_groups = defaultdict(list)
    with open(exp_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on row[2].
                continue
            row = line.split('\t')
            true_umis[row[0]] = row[2]
            exp_groups[row[1]].append(row[0])
    exp_groups = sorted(exp_groups.values())

    act_groups = defaultdict(list)
    read_info = {}
    # The context manager closes the BAM even if an assertion or a missing
    # tag aborts the loop.
    with pysam.AlignmentFile(act_file, 'rb') as aln:
        for read in aln.fetch(until_eof=True):
            name = read.query_name
            umi = read.get_tag('RX')
            group_id = read.get_tag('UG')
            if name not in true_umis:
                # A read that is absent from the expected table is a test
                # failure to report, not a reason to crash with KeyError.
                print(f'ERROR: read {name} is not listed in {exp_file}')
            elif true_umis[name] != umi:
                print(f'ERROR: expected true UMI {true_umis[name]} but received {umi} for read {name}')
            if read.is_reverse:
                # Only non-reverse reads are recorded; the reverse read must
                # land in a group that already contains its read name.
                assert name in act_groups[group_id]
                continue
            read_info[name] = (read.reference_start, read.tlen, name.rsplit(':')[-1])
            act_groups[group_id].append(name)

    act_groups = sorted(act_groups.values())
    if act_groups != exp_groups:
        print('ERROR: incorrect UMI groupings')
        print('Expected')
        for group in exp_groups:
            # Expected reads may be missing from the BAM entirely; fall back
            # to the read name so the report itself cannot raise KeyError.
            print('|' + '; '.join(
                str(read_info.get(r, f'<not in output: {r}>')) for r in group))
    # NOTE(review): the received groups are printed even when the groupings
    # match, as in the original; confirm whether this was meant to be part of
    # the mismatch report only.
    print('Received')
    for group in act_groups:
        print('|' + '; '.join(str(read_info[r]) for r in group))


In [None]:
'''default test case, covers:
- grouping umis with one difference but not two
- grouping umis with 2bp difference in template position but not 3bp
- assigning the correct umi for the RX tag
- discarding of improperly mapped reads
- flipping of f2r1 umis
'''
# Run the exported script on test.bam, writing out.bam, then compare the
# result with expected_umis.txt.
# NOTE(review): -b 2 presumably sets the 2bp template-position tolerance
# described above, and -f is not explained here — confirm both against
# `group_umis.py -h`.
!python3.7 ../../group_umis.py -f -b 2 'test.bam' 'out.bam'
test('expected_umis.txt', 'out.bam')

In [None]:
# Same input file and expected table as the default test, but run with
# --buffer_size 1 to make sure nothing goes wrong when the buffer is flushed
# (presumably a buffer size of 1 forces frequent flushes — confirm against
# group_umis.py).
!python3.7 ../../group_umis.py -f -b 2 --buffer_size 1 'test.bam' 'out.bam'
test('expected_umis.txt', 'out.bam')

In [None]:
# Same input file and expected table as the default test, but making sure
# nothing goes wrong when multiprocessing (-@ 4 presumably requests four
# worker processes — confirm against `group_umis.py -h`).
!python3.7 ../../group_umis.py -@ 4 -f -b 2 'test.bam' 'out.bam'
test('expected_umis.txt', 'out.bam')

In [None]:
# Tests that the script doesn't choke when grouping a big network of reads:
# uses a separate input (test_messy_group.bam) with its own expected table,
# run with -b 500 and --buffer_size 1.
!python3.7 ../../group_umis.py -f -b 500 --buffer_size 1 'test_messy_group.bam' 'out.bam'
test('expected_umis_messy_group.txt', 'out.bam')