In [1]:
%pip install pysam
%pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import pysam
# Relative path to the sorted BAM file that the rest of the notebook indexes and opens.
file = Path("../../../results/aligned/GCA_024222315.1_ASM2422231v1_genomic_aln_best_1_perc.sorted.bam")

In [3]:
import numpy as np
import pandas as pd
# Accessions of the contigs treated as chromosomes; every per-chromosome
# summary below iterates over this list in this order.
accession_list_chr = [
    "CM044164.1",
    "CM044165.1",
    "CM044166.1",
    "CM044167.1",
    "CM044168.1",
    "CM044169.1",
    "CM044170.1",
    "CM044171.1",
    "CM044172.1",
    "CM044173.1",
    "CM044174.1",
    "CM044175.1",
    "CM044176.1",
]

First, we create the index:

In [4]:
# Create the index for the BAM file; the path is passed to pysam.index as a string.
pysam.index(str(file))

''

Now, we open the BAM file:

In [5]:
# Open the BAM in binary read mode ("rb"). The handle is reused by the cells
# below and is not closed in the cells shown here.
samfile = pysam.AlignmentFile(file, "rb")

## Number of mapped and unmapped reads

In [6]:
# Read the counters once and reuse them for all three summary lines.
# NOTE(review): pysam documents these counters as taken from the index
# statistics and counting alignments — confirm they match "reads" for this BAM.
n_mapped = samfile.mapped
n_unmapped = samfile.unmapped
pct_mapped = n_mapped/(n_unmapped + n_mapped)*100
print(f"There are {n_mapped:,.0f} mapped reads")
print(f"There are {n_unmapped:,.0f} unmapped reads")
print(f"{pct_mapped:,.2f}% were mapped")

There are 87,782 mapped reads
There are 254 unmapped reads
99.71% were mapped


Total number of reads (mapped + unmapped):

In [7]:
# Mapped plus unmapped counts; shown as the cell's output.
samfile.mapped + samfile.unmapped

88036

In [8]:
print(f"There are {samfile.nreferences} contigs")
# Look each chromosome length up by name. get_reference_length raises KeyError
# for an accession that is missing from the BAM header, whereas
# samfile.lengths[samfile.get_tid(name)] would index with -1 for an unknown
# name and silently add the length of the last contig instead.
chr_total_length = sum(
    samfile.get_reference_length(accession) for accession in accession_list_chr
)
print(f"There are {chr_total_length:,.0f} nucleotides in chromosomes in the reference genome")

There are 478 contigs
There are 488,761,465 nucleotides in chromosomes in the reference genome


Now, we are going to create two CSV tables that summarize the coverage and the reads mapped to each chromosome.

In [9]:
from functools import reduce
def count_coverage(samfile: "pysam.AlignmentFile", contig: str) -> np.ndarray:
    """Return the per-position coverage of ``contig`` as a single array.

    ``samfile.count_coverage`` is called for the whole contig with
    ``quality_threshold=0`` and the arrays it returns are added together
    element-wise, so each position holds the sum over all returned arrays.

    NOTE(review): pysam documents the returned arrays as the per-base counts
    for A, C, G and T; positions covered only by other symbols would then
    not be counted -- confirm against the pysam documentation.

    Parameters
    ----------
    samfile : pysam.AlignmentFile
        Open alignment file providing ``count_coverage``. The annotation is
        a string so it is not evaluated when the function is defined.
    contig : str
        Name of the reference sequence to summarise.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``int64`` with one entry per position of ``contig``
        (not a scalar, despite the previous ``-> int`` annotation).
    """
    counts_per_each_base = samfile.count_coverage(
        contig=contig, quality_threshold=0
    )
    # Force int64 so the element-wise sum has a signed, platform-independent dtype.
    return reduce(
        lambda total, counts: np.add(total, counts, dtype='int64'),
        counts_per_each_base,
    )

In [10]:
# One per-base coverage array per chromosome, in the order of accession_list_chr.
coverages_per_base = []
for accession in accession_list_chr:
    coverages_per_base.append(count_coverage(samfile, accession))
coverages_per_base


[array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 1, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0])]

In [11]:
# Per-chromosome summary table. Every column is looked up by accession so that
# all columns of a row describe the same contig; the previous positional
# slices ([0:13]) only lined up with accession_list_chr when the chromosomes
# were the first 13 entries of the BAM header, in the same order.
contigs_stats = samfile.get_index_statistics()
stats_by_contig = {stats.contig: stats for stats in contigs_stats}
df = pd.DataFrame({
    'contig': list(accession_list_chr),
    'chr_length': [samfile.get_reference_length(acc) for acc in accession_list_chr],
    'mapped_reads': [stats_by_contig[acc].mapped for acc in accession_list_chr],
    # One list per chromosome with the inferred length of every fetched read.
    'reads_length': [
        [read.infer_read_length() for read in samfile.fetch(acc)]
        for acc in accession_list_chr
    ],
    'unmapped_reads': [stats_by_contig[acc].unmapped for acc in accession_list_chr],
    # Sum of the per-base coverage array of each chromosome
    # (coverages_per_base follows the order of accession_list_chr).
    'coverage': [np.sum(x) for x in coverages_per_base],
    })
# index=False is the documented way to omit the row index from the CSV.
df.to_csv('chr_contig_data.csv', index=False)
df

Unnamed: 0,contig,chr_length,mapped_reads,reads_length,unmapped_reads,coverage
0,CM044164.1,36629419,5902,"[21384, 23253, 20809, 20225, 24472, 21384, 197...",0,10441348
1,CM044165.1,34679789,5752,"[19591, 22495, 20545, 21646, 20118, 20815, 208...",0,12501724
2,CM044166.1,43944365,7277,"[22255, 21455, 21455, 21455, 22255, 20049, 234...",0,16900645
3,CM044167.1,49018360,9057,"[21698, 19937, 20739, 20725, 22160, 22433, 196...",0,19402431
4,CM044168.1,43740154,7080,"[22914, 22914, 22056, 22056, 19630, 22056, 221...",0,16012009
5,CM044169.1,34491633,6553,"[22590, 19854, 21286, 21084, 22450, 23008, 230...",0,13721077
6,CM044170.1,34851334,5783,"[21762, 18980, 23979, 18980, 21203, 22866, 219...",0,11689792
7,CM044171.1,40183539,5982,"[21157, 24463, 24463, 20087, 19993, 19993, 200...",0,12387135
8,CM044172.1,31440434,5011,"[21575, 19938, 19474, 22137, 22009, 20640, 190...",0,10437369
9,CM044173.1,33596851,5826,"[23328, 21844, 21310, 21763, 19424, 20485, 195...",0,11830485


In [12]:
# Histogram of coverage values per chromosome: for every contig, one row per
# coverage value from 0 up to the maximum observed, with the number of
# positions having that coverage.
contig_labels = []
coverage_bins = []
bin_counts = []
for depth, accession in zip(coverages_per_base, accession_list_chr):
    n_bins = np.amax(depth)+1  # np.bincount yields exactly this many entries
    contig_labels.append(n_bins*[accession])
    coverage_bins.append(list(range(0, n_bins)))
    bin_counts.append(np.bincount(depth))

df2 = pd.DataFrame({
    'contig': np.concatenate(contig_labels),
    'coverage_bin': np.concatenate(coverage_bins),
    'counts': np.concatenate(bin_counts)
    })
df2.to_csv('chr_bin_count_coverage.csv', index=None)
df2

Unnamed: 0,contig,coverage_bin,counts
0,CM044164.1,0,30121645
1,CM044164.1,1,4712781
2,CM044164.1,2,1165716
3,CM044164.1,3,315508
4,CM044164.1,4,129895
...,...,...,...
1102,CM044176.1,66,627
1103,CM044176.1,67,914
1104,CM044176.1,68,961
1105,CM044176.1,69,665
