In [None]:
!sudo apt-get update -y --quiet

In [None]:
!sudo apt-get install -y --quiet tree libcurl4-openssl-dev zlib1g-dev samtools

In [3]:
!pip install biopython
!pip install pyBigWig
!pip install pysam

Collecting biopython
  Downloading biopython-1.78-cp38-cp38-manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 2.1 MB/s eta 0:00:01
[?25hCollecting numpy
  Downloading numpy-1.19.4-cp38-cp38-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 19.1 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy, biopython
Successfully installed biopython-1.78 numpy-1.19.4
Collecting pyBigWig
  Downloading pyBigWig-0.3.17.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 1.8 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyBigWig
  Building wheel for pyBigWig (setup.py) ... [?25ldone
[?25h  Created wheel for pyBigWig: filename=pyBigWig-0.3.17-cp38-cp38-linux_x86_64.whl size=223778 sha256=925ec2afd74b53164c0f7f682f597dc2ead978d690168549b32b5215dae8fa17
  Stored in directory: /home/jovyan/.cache/pip/wheels/6c/0f/66/71631d173ebddcb890b2b36cbab9325ed4913fd1d583c273ed
Successfully built pyBigWig
Insta

In [4]:
import os
import math
import numpy as np
import pysam

from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqIO.QualityIO import FastqGeneralIterator

In [5]:
def bold_char(string, pos):
    string_list = list(string)
    BOLD = '\033[1m'
    END = '\033[0m'
    string_list.insert(pos, BOLD)
    string_list.insert(pos+2, END)
    return("".join(string_list))

def sam_flag_decoder(flag, logging=True):
    description = []
    for bit in range(12):
        bit_description = ""
        flag_component = flag & (1<<bit) # check if a particular bit is set on the binary flag
        binary_string = '{0:012b}'.format(flag) # convert to binary and set length to 12 bits
        if(flag_component): binary_string = bold_char(binary_string, 11-bit) # bold the bit if is set
        if(flag_component and bit == 0):  
            bit_description = "read paired - template having multiple segments in sequencing"
            description.append("read paired - template having multiple segments in sequencing")
        if(flag_component and bit == 1):  
            bit_description = "read mapped in proper pair - each segment properly aligned according to the aligner"
            description.append("read mapped in proper pair - each segment properly aligned according to the aligner")
        if(flag_component and bit == 2):  
            bit_description = "read unmapped - segment unmapped"
            description.append("read unmapped - segment unmapped")
        if(flag_component and bit == 3):  
            bit_description = "mate unmapped - next segment in the template unmapped"
            description.append("mate unmapped - next segment in the template unmapped")
        if(flag_component and bit == 4):  
            bit_description = "read reverse strand - SEQ being reverse complemented"
            description.append("read reverse strand - SEQ being reverse complemented")
        if(flag_component and bit == 5):  
            bit_description = "mate reverse strand - SEQ of the next segment in the template being reversed"
            description.append("mate reverse strand - SEQ of the next segment in the template being reversed")
        if(flag_component and bit == 6):  
            bit_description = "first in pair - the first segment in the template"
            description.append("first in pair - the first segment in the template")
        if(flag_component and bit == 7):  
            bit_description = "second in pair - the last segment in the template"
            description.append("second in pair - the last segment in the template")
        if(flag_component and bit == 8):  
            bit_description = "not primary alignment - secondary alignment"
            description.append("not primary alignment - secondary alignment")  
        if(flag_component and bit == 9):  
            bit_description = "read fails platform/vendor quality checks - not passing quality controls"
            description.append("read fails platform/vendor quality checks - not passing quality controls")
        if(flag_component and bit == 10): 
            bit_description = "read is PCR or optical duplicate"
            description.append("read is PCR or optical duplicate")
        if(flag_component and bit == 11): 
            bit_description = "supplementary alignment"
            description.append("supplementary alignment")
        if(logging):
            if(bit_description): print(binary_string + "  " + "{:<4}".format(str(flag_component)) + "  " + description[-1])
            else: print(binary_string + "  " + "{:<4}".format(str(flag_component)))
    return(" + ".join(description))

In [6]:
def sam_flag_to_bin(flag):
    print("{:<4}".format(str(flag)) + ": " + '{0:012b}'.format(flag))
    print("{:<4}".format(str(4095-flag)) + ": " + '{0:012b}'.format(4095-flag))

## PairedReads
For paired reads, 0'th bit HAS to be set. Hence all flags for paired reads HAVE to be odd. In other words, all even-numbered flags other than the above three (0, 4 and 16) are meaningless. 

For paired reads all flags in the intervals [65-127] and [193-255] relate to the first read of a pair. All other (odd) flags refer to the second read in a pair.

### Both reads have aligned
- 65 - 0001000001 - this is first read in pair and both reads aligned the forward strand.
- 129 - 0010000001 - this is second read of pair and both reads aligned the forward strand.
- 67 (0001000011) and 131 (0010000011) also mean the same as 65 and 129 with the added assurance that "the pair is properly aligned" meaning that they mapped within a proper distance from each other.
- 113 - 0001110001 - "this is the first read of a pair, both reads in pair were flipped and both mapped".
- 177 - 0001110001 - "this is the second read of a pair, both reads in pair were flipped and both mapped".
- 81 - 0001010001 - "this is the first read of pair, both reads mapped, we had to flip this read, but mate is in forward orientation".
- 161 - 0010100001 - "this is second read, this one is forward but we flipped its mate and both reads mapped".
- 163 (0010100011) and 83 (0001010011) are the same as 161 and 81 except "it is in a proper pair".
- 97 - 0001100001 - "this is first read, its mate is flipped but this is forward. Both mapped".
- 145 - 0010010001 - "this is second read. it is flipped but its mate is not. Both mapped".
- 99 (0001100011) and 147 (0010010011) are the same as 97 and 145 except with "proper mapping in pair".

### No reads have aligned
- 77 - 0001001101 - First in pair, both reads in pair unmapped. "All bad"
- 141 - 0010001101 - Second in pair and "all bad".

### Only one read has aligned
- 69 - 0001000101 - 1st read in pair. This read is unmapped but its mate is mapped.
- 137 - 0010001001 - 2nd in pair. Read is mapped but mate is unmapped.
- 73 - 0001001001 - 1st read in pair. This read is mapped but its mate is not.
- 133 - 0010000101 - 2nd in pair. Read unmapped but mate is mapped.

## Checking BWA Alignment and BAM files

In [1]:
srr_name="SRR5090597"
donor_name="hpv16"
recipient_name="USCShg38"
inputs_folder="../workflows/cromwell-final-outputs-bwa-hpv16/"

In [2]:
!echo $srr_name-to-$donor_name
!echo $srr_name-to-$recipient_name

SRR5090597-to-hpv16
SRR5090597-to-USCShg38


In [3]:
# 3              MM
# 73 and 133     MU
# 69 nd 137      UM
# 77 and 141     UU

In [4]:
!samtools view -@ 24 -f 3 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

83
99
147
163
2115
2131
2147
2179
2195
2211
2227


In [5]:
!samtools view -@ 24 -f 3 -F 2048 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

83
99
147
163


In [56]:
!samtools view -@ 24 -f 73 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

73
77
121
2121
2169


In [114]:
!samtools view -@ 24 -f 73 -F 4022 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

73


In [57]:
!samtools view -@ 24 -f 133 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

133
141
181


In [113]:
!samtools view -@ 24 -f 133 -F 3962 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

133


In [58]:
!samtools view -@ 24 -f 73 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	73	NC_001526.4	5	60	72M1I28M	=	5	0	CTGATCCTGCAGGTACCAATGGGGAAGAGGGTACGGGATGTAATGGATGGTTTTATGTAGAGGCTGTAGTGGAAAAAAAAAACAGGGGATGCTATATCAGA	BBBFFFFFFFFFFFFIIIIIIIIIIFIFIIIBFIIIIBFFFFIIIIIIIIBFFFIIIFFIFIIFFFFFFBFFFBFBFBFFFF0<BBBF<BBFFFFFFFFFB	NM:i:1	MD:Z:100	AS:i:93	XS:i:0
SRR5090597.4449673	121	NC_001526.4	57	60	30M	=	57	0	TTATGTAGAGGCTGTAGTGGAAAAAAAAAC	BBFBFBIIFIFFBFBIIFFFFFFFFFFBBB	NM:i:0	MD:Z:30	AS:i:30	XS:i:0
SRR5090597.1096535	73	NC_001526.4	181	60	101M	=	181	0	CAGGCAGAAACAGAGACAGCACATGCGTTGTTTACTGCACAGGAAGCAAAACAACATAGAGATGCAGTACAGGTTCTAAAACGAAAGTATTTGGGTAGTCC	BBBFFFFFFFFFFIIIIIIIFBFFFFIFIIIIIFIIIIIIIIIFIIIIIIIIIIIFIIIIIIFFFFFFFFFFFBFBBFFFFFFFFFFBBFFFFFFBBBBFF	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.1309398	73	NC_001526.4	224	60	101M	=	224	0	AAGCAAAACAACATAGAGATGCAGTACAGGTTCTAAAACGAAAGTATTTGGGTAGTCCACTTAGTGATATTAGTGGATGTGTAGACAATAATATTAGTCCT	BBBFFFFFFFFFFFFFIIIIIIIIFFIIIIFIIIIIIIIIIIIIBFFFIFII0BBB<FFIIIIIFFFIIIFFF<BFFFFFFFBFFFFFFBBBFFFFFBBB<	NM:i:0	MD:Z:101	AS:

In [59]:
!samtools view -@ 24 -f 133 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	133	NC_001526.4	5	0	*	=	5	0	CTGTGTTAAAT	BBBFFFFFFFF	MC:Z:72M1I28M	AS:i:0	XS:i:0
SRR5090597.4449673	181	NC_001526.4	57	0	*	=	57	0	AGCAAAACAGAAG	<FFF<BFB<FBBB	MC:Z:30M	AS:i:0	XS:i:0
SRR5090597.1096535	133	NC_001526.4	181	0	*	=	181	0	GCTTCCCTTGTACAGTACTGAGGCTTACAGTCATAGTTCTATTACTTGTAACTTTTACACAGGTCACTGGCATTCTTAGTGCTTCTCTTAACACTACAGTA	BBBFFFFFFFFFFIIFIIIIIIIIIIIIIIFFFIIFFFFIIIIIIIIIFIFFIIIIIIFIIIIFFFFIIIIIIIIIIFIFFFFFFFFFFFFFFFFFFFBFF	MC:Z:101M	AS:i:0	XS:i:0
SRR5090597.1309398	133	NC_001526.4	224	0	*	=	224	0	CTCGACGCCAGGGCGCCGGGCCTTGTGGGCTGTGCTGCACCTCGGACGGCTTCGCACCAGCCAGCGCCCTCTCTCTCCTGCAGCACTCTGATCTGCACCCC	<BBFFFFFFFFFFFIIIIIIFIIIIIIIIIIFFFFFFFFFFFFFFBFBFB<<BFFFBFFB7BFFBFBFBBBFBFFFFFFFBFFBBB<BBBBB<B<BFB<0<	MC:Z:101M	AS:i:0	XS:i:0
SRR5090597.3466234	133	NC_001526.4	1396	0	*	=	1396	0	TTTTTTTTTTTTTT	<<<BBBBBBBBBBB	MC:Z:101M	AS:i:0	XS:i:0
samtools view: writing to standard output failed: No such file or directory
samtools view: error closing standard output: -1


In [60]:
sam_flag_to_bin(73)

73  : 000001001001
4022: 111110110110


In [61]:
sam_flag_to_bin(77)

77  : 000001001101
4018: 111110110010


In [62]:
sam_flag_decoder(77)

00000100110[1m1[0m  1     read paired - template having multiple segments in sequencing
000001001101  0   
000001001[1m1[0m01  4     read unmapped - segment unmapped
00000100[1m1[0m101  8     mate unmapped - next segment in the template unmapped
000001001101  0   
000001001101  0   
00000[1m1[0m001101  64    first in pair - the first segment in the template
000001001101  0   
000001001101  0   
000001001101  0   
000001001101  0   
000001001101  0   


'read paired - template having multiple segments in sequencing + read unmapped - segment unmapped + mate unmapped - next segment in the template unmapped + first in pair - the first segment in the template'

In [60]:
sam_flag_decoder(141)

00001000110[1m1[0m  1     read paired - template having multiple segments in sequencing
000010001101  0   
000010001[1m1[0m01  4     read unmapped - segment unmapped
00001000[1m1[0m101  8     mate unmapped - next segment in the template unmapped
000010001101  0   
000010001101  0   
000010001101  0   
0000[1m1[0m0001101  128   second in pair - the last segment in the template
000010001101  0   
000010001101  0   
000010001101  0   
000010001101  0   


'read paired - template having multiple segments in sequencing + read unmapped - segment unmapped + mate unmapped - next segment in the template unmapped + second in pair - the last segment in the template'

In [63]:
!samtools view -@ 24 -f 73 -F 4022 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	73	NC_001526.4	5	60	72M1I28M	=	5	0	CTGATCCTGCAGGTACCAATGGGGAAGAGGGTACGGGATGTAATGGATGGTTTTATGTAGAGGCTGTAGTGGAAAAAAAAAACAGGGGATGCTATATCAGA	BBBFFFFFFFFFFFFIIIIIIIIIIFIFIIIBFIIIIBFFFFIIIIIIIIBFFFIIIFFIFIIFFFFFFBFFFBFBFBFFFF0<BBBF<BBFFFFFFFFFB	NM:i:1	MD:Z:100	AS:i:93	XS:i:0
SRR5090597.1096535	73	NC_001526.4	181	60	101M	=	181	0	CAGGCAGAAACAGAGACAGCACATGCGTTGTTTACTGCACAGGAAGCAAAACAACATAGAGATGCAGTACAGGTTCTAAAACGAAAGTATTTGGGTAGTCC	BBBFFFFFFFFFFIIIIIIIFBFFFFIFIIIIIFIIIIIIIIIFIIIIIIIIIIIFIIIIIIFFFFFFFFFFFBFBBFFFFFFFFFFBBFFFFFFBBBBFF	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.1309398	73	NC_001526.4	224	60	101M	=	224	0	AAGCAAAACAACATAGAGATGCAGTACAGGTTCTAAAACGAAAGTATTTGGGTAGTCCACTTAGTGATATTAGTGGATGTGTAGACAATAATATTAGTCCT	BBBFFFFFFFFFFFFFIIIIIIIIFFIIIIFIIIIIIIIIIIIIBFFFIFII0BBB<FFIIIIIFFFIIIFFF<BFFFFFFFBFFFFFFBBBFFFFFBBB<	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.3466234	73	NC_001526.4	1396	60	101M	=	1396	0	GGCATACCTAAAAAAAATTGCATATTACTATATGGTGCAGCTAACACAGGTAAATCATTATTTGGTATGAGTTTA

In [64]:
!samtools view -@ 24 -f 133 -F 3962 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	133	NC_001526.4	5	0	*	=	5	0	CTGTGTTAAAT	BBBFFFFFFFF	MC:Z:72M1I28M	AS:i:0	XS:i:0
SRR5090597.1096535	133	NC_001526.4	181	0	*	=	181	0	GCTTCCCTTGTACAGTACTGAGGCTTACAGTCATAGTTCTATTACTTGTAACTTTTACACAGGTCACTGGCATTCTTAGTGCTTCTCTTAACACTACAGTA	BBBFFFFFFFFFFIIFIIIIIIIIIIIIIIFFFIIFFFFIIIIIIIIIFIFFIIIIIIFIIIIFFFFIIIIIIIIIIFIFFFFFFFFFFFFFFFFFFFBFF	MC:Z:101M	AS:i:0	XS:i:0
SRR5090597.1309398	133	NC_001526.4	224	0	*	=	224	0	CTCGACGCCAGGGCGCCGGGCCTTGTGGGCTGTGCTGCACCTCGGACGGCTTCGCACCAGCCAGCGCCCTCTCTCTCCTGCAGCACTCTGATCTGCACCCC	<BBFFFFFFFFFFFIIIIIIFIIIIIIIIIIFFFFFFFFFFFFFFBFBFB<<BFFFBFFB7BFFBFBFBBBFBFFFFFFFBFFBBB<BBBBB<B<BFB<0<	MC:Z:101M	AS:i:0	XS:i:0
SRR5090597.3466234	133	NC_001526.4	1396	0	*	=	1396	0	TTTTTTTTTTTTTT	<<<BBBBBBBBBBB	MC:Z:101M	AS:i:0	XS:i:0
SRR5090597.1000565	133	NC_001526.4	1862	0	*	=	1862	0	CGGCCAGACTGGCCTCAAACTCCTGACCTCATGATCCCTCCACCTTGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGTCACCGTGCCTTTTTTTTTTTT	BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFFIIIIIIIIIIIIIIFFFFFFFBBFFFBBF

In [89]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5

SRR5090597.1011981	73	NC_001526.4	7571	60	101M	=	7571	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.1011981	133	NC_001526.4	7571	0	*	=	7571	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	MC:Z:101M	AS:i:0	XS:i:0


In [90]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name"_MU.bam" | grep 'SRR5090597.1011981\s' | head -n 5

SRR5090597.1011981	73	NC_001526.4	7571	60	101M	=	7571	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.1011981	133	NC_001526.4	7571	0	*	=	7571	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	MC:Z:101M	AS:i:0	XS:i:0


In [91]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name".bam" | grep SRR5090597.1011981

SRR5090597.1011981	73	NC_001526.4	7571	60	101M	=	7571	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NM:i:0	MD:Z:101	AS:i:101	XS:i:0
SRR5090597.1011981	133	NC_001526.4	7571	0	*	=	7571	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	MC:Z:101M	AS:i:0	XS:i:0


## Checking STAR Alignment and BAM files

In [92]:
srr_name="SRR5090597"
donor_name="hpv16"
recipient_name="USCShg38"
inputs_folder="../workflows/cromwell-final-outputs-star-hpv16/"

In [69]:
!echo $srr_name-to-$donor_name
!echo $srr_name-to-$recipient_name

SRR5090597-to-hpv16
SRR5090597-to-USCShg38


In [70]:
# 3              MM
# 73 and 133     MU
# 69 nd 137      UM
# 77 and 141     UU

In [71]:
!samtools view -@ 24 -f 3 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

83
99
147
163
339
355
403
419


In [72]:
!samtools view -@ 24 -f 3 -F 2048 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

83
99
147
163
339
355
403
419


In [112]:
!samtools view -@ 24 -f 73 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

73
77
89


In [74]:
!samtools view -@ 24 -f 133 $inputs_folder$srr_name-to-$donor_name".bam" | cut -f2 | sort -n | uniq

133
141
165


In [75]:
!samtools view -@ 24 -f 73 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	73	NC_001526.4	5	255	81M1I19M	*	0	0	CTGATCCTGCAGGTACCAATGGGGAAGAGGGTACGGGATGTAATGGATGGTTTTATGTAGAGGCTGTAGTGGAAAAAAAAAACAGGGGATGCTATATCAGA	BBBFFFFFFFFFFFFIIIIIIIIIIFIFIIIBFIIIIBFFFFIIIIIIIIBFFFIIIFFIFIIFFFFFFBFFFBFBFBFFFF0<BBBF<BBFFFFFFFFFB	NH:i:1	HI:i:1	AS:i:94	nM:i:0	NM:i:1	MD:Z:100	jM:B:c,-1	jI:B:i,-1
SRR5090597.4449673	89	NC_001526.4	57	255	30M	*	0	0	TTATGTAGAGGCTGTAGTGGAAAAAAAAAC	BBFBFBIIFIFFBFBIIFFFFFFFFFFBBB	NH:i:1	HI:i:1	AS:i:29	nM:i:0	NM:i:0	MD:Z:30	jM:B:c,-1	jI:B:i,-1
SRR5090597.3466234	73	NC_001526.4	1396	255	101M	*	0	0	GGCATACCTAAAAAAAATTGCATATTACTATATGGTGCAGCTAACACAGGTAAATCATTATTTGGTATGAGTTTAATGAAATTTCTGCAAGGGTCTGTAAT	BBBFFFFFFFFFFIIIIIIIIIIIIIFIIFIIIIIFFIIIIIIIIIIIIIFFIIIIIIIFIIIIFFFFFFFBBFFFBFFFFFFFFFFFFFFFFBBBBFFFF	NH:i:1	HI:i:1	AS:i:99	nM:i:0	NM:i:0	MD:Z:101	jM:B:c,-1	jI:B:i,-1
SRR5090597.1879149	89	NC_001526.4	2550	255	94M	*	0	0	CCGCGACCCATACCAAAGCCGTCGCCTTGGGCACCGAAGAAACACAGACGACTATCCAGCGACCAAGATCAGAGCCAGACACCGGAAACCCCTG	BBB7BBBBBBBBBBBBB<7<B<<BBBBB

In [76]:
!samtools view -@ 24 -f 133 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.1	141	*	0	0	*	*	0	0	AGAAGATCAAAGACGCCAGGAAAGGTCCCCTGGTACCTTTTCCAAACCAAAAATCTGAAGCAGCAGAACCTCCAAAAACTCCACCCTCATCTTGTGATTCC	7<<BFFFFFFFFFIIIIIIIIIIIIFFIIIIIIFFIIIIIIIIIIIIIIIFFIIIIIIIIIIFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFB	NH:i:0	HI:i:0	AS:i:0	nM:i:0	uT:A:0
SRR5090597.2	141	*	0	0	*	*	0	0	TCTCTCTTCTTCCTGGCATCTCTCTTCATGTTGCTAGGCTCTTGGTGAATTTGTCCTGCCTCTGTTCCTCCACAGCTCCGGCTCCTGTATGAATGCAATCC	BBBFFFFFFFFFFIIIIIIIIIIIFIIIIFFFFIIIIIIFFIFFFBFFFFFFIFFIIFFIIFBFFFBFFFIIIFFFI<<FFFFBBBBBBFF<<<0B<7<B7	NH:i:0	HI:i:0	AS:i:0	nM:i:0	uT:A:0
SRR5090597.3	141	*	0	0	*	*	0	0	CCACAGCCAAGGAGTTTGCAGACTCTCTGGGCATCCCCTTCTTGGAGACGAGCGCCAAGAATGCCACCAATGTCGAGCAGGCGTTCATGACCATGGCTGC	<BBFFFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIFIIIIIIFIIIFFFIFFFFFFFFBBBFFFFFFFFFBFFFBBFFBBBBFFF<<<BBBBBB7	NH:i:0	HI:i:0	AS:i:0	nM:i:0	uT:A:0
SRR5090597.4	141	*	0	0	*	*	0	0	GAGATTTGCAGGGAATTTTTGTGTTTTTGTCAATAGTCATTGACTCTTCAGTGTGACCTGGTGCTGAGAAGAGCTCTTTGAAGTCAACCAGGTCTTCTAGA	BBBFFFFFFFFFFIIIIIIIIFFIIIIIIIIIIIIIIIIIIIIIIIIIIIFIIIIIIII

In [None]:
sam_flag_to_bin(73)

In [None]:
sam_flag_to_bin(77)

In [None]:
sam_flag_decoder(77)

In [None]:
sam_flag_decoder(141)

In [77]:
!samtools view -@ 24 -f 73 -F 4022 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.3928639	73	NC_001526.4	5	255	81M1I19M	*	0	0	CTGATCCTGCAGGTACCAATGGGGAAGAGGGTACGGGATGTAATGGATGGTTTTATGTAGAGGCTGTAGTGGAAAAAAAAAACAGGGGATGCTATATCAGA	BBBFFFFFFFFFFFFIIIIIIIIIIFIFIIIBFIIIIBFFFFIIIIIIIIBFFFIIIFFIFIIFFFFFFBFFFBFBFBFFFF0<BBBF<BBFFFFFFFFFB	NH:i:1	HI:i:1	AS:i:94	nM:i:0	NM:i:1	MD:Z:100	jM:B:c,-1	jI:B:i,-1
SRR5090597.3466234	73	NC_001526.4	1396	255	101M	*	0	0	GGCATACCTAAAAAAAATTGCATATTACTATATGGTGCAGCTAACACAGGTAAATCATTATTTGGTATGAGTTTAATGAAATTTCTGCAAGGGTCTGTAAT	BBBFFFFFFFFFFIIIIIIIIIIIIIFIIFIIIIIFFIIIIIIIIIIIIIFFIIIIIIIFIIIIFFFFFFFBBFFFBFFFFFFFFFFFFFFFFBBBBFFFF	NH:i:1	HI:i:1	AS:i:99	nM:i:0	NM:i:0	MD:Z:101	jM:B:c,-1	jI:B:i,-1
SRR5090597.5471618	73	NC_001526.4	5254	255	41M	*	0	0	TTGCAAACCACCTATAGGGGAACACTGGGGCAAAGGATCCC	<<<BFBBF7BBBFB<FFBB'BB<B7BBBFBFF<F<07B<'7	NH:i:1	HI:i:1	AS:i:40	nM:i:0	NM:i:0	MD:Z:41	jM:B:c,-1	jI:B:i,-1
SRR5090597.2529254	73	NC_001526.4	7151	255	101M	*	0	0	TCAGGACCCACAGGAGCGACCCAGAAAGTTACCACAGTTATGCACAGAGCTGCAAACAACTATACATGATATAATATTAGAATGTGTGTACTGCAAGC

In [78]:
!samtools view -@ 24 -f 133 -F 3962 $inputs_folder$srr_name-to-$donor_name".bam" | head -n 5

SRR5090597.28099	133	*	0	0	*	NC_001526.4	7712	0	GTGTTTCTTCGGTGCCCAAGGCGACGGCTTTGGTATGGGTCGCGGCGGGG	BBBFFFFFFFFFFFFIIIIII<FFIIIIIIIFIBBFIIIFFFFFFFFFFF	NH:i:0	HI:i:0	AS:i:99	nM:i:0	uT:A:4
SRR5090597.34386	133	*	0	0	*	NC_001526.4	7164	0	CTGTGTTTCTTCGGTGCCCAAGGCGACGGCTTTGGTATGGGTCGC	BBBFFFFFFFFFFIFFIIIIFIIIIFFIIIIIIIIIIIIIIFIFF	NH:i:0	HI:i:0	AS:i:99	nM:i:0	uT:A:4
SRR5090597.2841958	133	*	0	0	*	NC_001526.4	7784	0	TGGTGGTGGCGGTG	'<<<<BB<B00B<7	NH:i:0	HI:i:0	AS:i:99	nM:i:0	uT:A:4
SRR5090597.239499	133	*	0	0	*	NC_001526.4	7615	0	CTTCGGTGCCCAAGGCGACGGCTTTGGTATGGGTCGCGGCGGGGGG	BBBFFFFFFFFFFII<FFIIIIIIIIIFFFIIIBBFIFFBFFB<'7	NH:i:0	HI:i:0	AS:i:99	nM:i:0	uT:A:4
SRR5090597.263144	133	*	0	0	*	NC_001526.4	7591	0	CCCGTTTTGTCCTTACGAGAACGTCTGTGATACTTTCTGCTAATG	BBBFFFFFF<F0B<FFFFFFFBFBFFIFF<BBBB<FFFIFFIF0B	NH:i:0	HI:i:0	AS:i:99	nM:i:0	uT:A:4


In [93]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5

SRR5090597.1011981	77	*	0	0	*	*	0	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1
SRR5090597.1011981	141	*	0	0	*	*	0	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1


In [94]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name"_MU.bam" | grep 'SRR5090597.1011981\s' | head -n 5

In [95]:
!samtools view -@ 24 $inputs_folder$srr_name-to-$donor_name".bam" | grep SRR5090597.1011981

SRR5090597.1011981	77	*	0	0	*	*	0	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1
SRR5090597.1011981	141	*	0	0	*	*	0	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1


In [108]:
!samtools view -@ 24 -f 73 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5 

SRR5090597.1011981	77	*	0	0	*	*	0	0	ATCAAGAACACGTAGAGAAACCCAGCTGTAATCATGCATGGAGATACACCTACATTGCATGAATATATGTTAGATTTGCAACCAGAGACAACTGATCTCTA	BBBFFFFFFFFFFFIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIBFFFFFFIIIIFFIFIFFBFFFFFFBFFFFBBB<BFFBFFBBBB	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1


In [109]:
!samtools view -@ 24 -f 133 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5 

SRR5090597.1011981	141	*	0	0	*	*	0	0	TCGCCGTCGCTGAAAACATGGATCATCACTCGAGACAACGATTTCACATCGTCTTCGTTTTTGATGTCCAGTTTCCGAAGCATGCCTGCAGGATCAGCCAT	BBBFFFFFFFFFFIIIIIIIFFFIIIIFIIFIBFFIIIFIIIIFIFIFFIBFFFFFFFFFFFFFFFFFBFBFBFFFFFBBBFBBBBBBBBBFB<'7<BBB<	NH:i:0	HI:i:0	AS:i:106	nM:i:0	uT:A:1


In [110]:
!samtools view -@ 24 -f 73 -F 4022 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5 

In [111]:
!samtools view -@ 24 -f 133 -F 3962 $inputs_folder$srr_name-to-$donor_name".bam" | grep 'SRR5090597.1011981\s' | head -n 5 

In [None]:
In BWA
SRR5090597.1011981	73    MU
SRR5090597.1011981	133   MU

In STAR
SRR5090597.1011981	77    UU
SRR5090597.1011981	141   UU

In [116]:
sam_flag_decoder(73)

00000100100[1m1[0m  1     read paired - template having multiple segments in sequencing
000001001001  0   
000001001001  0   
00000100[1m1[0m001  8     mate unmapped - next segment in the template unmapped
000001001001  0   
000001001001  0   
00000[1m1[0m001001  64    first in pair - the first segment in the template
000001001001  0   
000001001001  0   
000001001001  0   
000001001001  0   
000001001001  0   


'read paired - template having multiple segments in sequencing + mate unmapped - next segment in the template unmapped + first in pair - the first segment in the template'

In [117]:
sam_flag_decoder(77)

00000100110[1m1[0m  1     read paired - template having multiple segments in sequencing
000001001101  0   
000001001[1m1[0m01  4     read unmapped - segment unmapped
00000100[1m1[0m101  8     mate unmapped - next segment in the template unmapped
000001001101  0   
000001001101  0   
00000[1m1[0m001101  64    first in pair - the first segment in the template
000001001101  0   
000001001101  0   
000001001101  0   
000001001101  0   
000001001101  0   


'read paired - template having multiple segments in sequencing + read unmapped - segment unmapped + mate unmapped - next segment in the template unmapped + first in pair - the first segment in the template'

In [136]:
srr_name="SRR5090597"
donor_name="hpv16"
recipient_name="USCShg38"
inputs_folder="../workflows/cromwell-final-outputs-bwa-hpv16/"

In [137]:
!samtools view -@ 24 -c -f 3 -F 2048 $inputs_folder$srr_name-to-$donor_name".bam" 

9574


In [138]:
!samtools view -@ 24 -c -f 3 -F 2048 $inputs_folder$srr_name-to-$recipient_name".bam" 

[E::hts_open_format] Failed to open file "../workflows/cromwell-final-outputs-bwa-hpv16/SRR5090597-to-USCShg38.bam" : No such file or directory
samtools view: failed to open "../workflows/cromwell-final-outputs-bwa-hpv16/SRR5090597-to-USCShg38.bam" for reading: No such file or directory


In [139]:
srr_name="SRR5090597"
donor_name="hpv16"
recipient_name="USCShg38"
inputs_folder="../workflows/cromwell-final-outputs-star-hpv16/"

In [140]:
!samtools view -@ 24 -c -f 3 -F 2048 $inputs_folder$srr_name-to-$donor_name".bam" 

8738


In [141]:
S!samtools view -@ 24 -c -f 3 -F 2048 $inputs_folder$srr_name-to-$recipient_name".bam" 

SyntaxError: invalid syntax (<ipython-input-141-5a3e7815ed0c>, line 1)