In [None]:
from pathlib import Path
import pysam


In [None]:
# `snakemake` is not defined in this notebook; it is expected to be injected
# by Snakemake when the notebook is executed as part of a rule.
# Every declared input is wrapped in a Path.
infiles = list(map(Path, snakemake.input))
# The first two declared outputs are the CSV targets used further down
# (names suggest 5% and 1% datasets — TODO confirm against the Snakefile).
perc5, perc1 = str(snakemake.output[0]), str(snakemake.output[1])

In [None]:
import numpy as np
import pandas as pd
# Contig names handed to the coverage query, in the order they are summarised
# (13 accessions; presumably one per chromosome — TODO confirm).
accession_list_chr = [
    "CM044164.1",
    "CM044165.1",
    "CM044166.1",
    "CM044167.1",
    "CM044168.1",
    "CM044169.1",
    "CM044170.1",
    "CM044171.1",
    "CM044172.1",
    "CM044173.1",
    "CM044174.1",
    "CM044175.1",
    "CM044176.1",
]

First, we create the index:

In [None]:
# Build an index for every input alignment file; pysam.index is given the
# path as a plain string rather than a Path object.
for infile in infiles:
    index_target = str(infile)
    pysam.index(index_target)

Now, we open the BAM files:

In [None]:
# Open each input in binary read mode ("rb"); the handles stay open because
# later cells query them by position in this list.
samfiles = []
for bam_path in infiles:
    samfiles.append(pysam.AlignmentFile(bam_path, "rb"))

In [None]:
from functools import reduce
from itertools import chain
def count_coverage(samfile: "pysam.AlignmentFile", contig: str) -> np.ndarray:
    """Return the total per-position coverage of ``contig``.

    ``samfile.count_coverage`` is queried for the whole contig with
    ``quality_threshold=0`` and yields several per-position count sequences
    (pysam documents four, one per nucleotide A, C, G, T). These are summed
    element-wise so that each position carries a single depth value.

    Parameters
    ----------
    samfile
        Object exposing ``count_coverage(contig=..., quality_threshold=...)``;
        in this notebook an opened ``pysam.AlignmentFile``.
    contig
        Name of the reference sequence to query.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array with one summed count per position.
    """
    counts_per_each_base = samfile.count_coverage(
        contig=contig, quality_threshold=0
    )
    # Convert every per-base track to int64 up front so the sum is carried out
    # (and returned) in int64 regardless of the element type pysam hands back.
    per_base = [np.asarray(track, dtype='int64') for track in counts_per_each_base]
    return np.sum(per_base, axis=0)

def zero_runs(a):
    """Locate every maximal run of zeros in the 1-D sequence ``a``.

    Returns an integer array of shape ``(n_runs, 2)``. Each row is a half-open
    ``[start, end)`` pair of indices into ``a``: ``start`` is the first zero of
    the run and ``end`` is one past its last zero. An input without zeros
    (including an empty one) yields an array of shape ``(0, 2)``.
    """
    # 1 where the input is zero, 0 elsewhere.
    zero_flags = np.equal(a, 0).astype(np.int8)
    # A sentinel 0 on both sides makes runs touching either end of the input
    # produce a boundary as well.
    padded = np.concatenate(([0], zero_flags, [0]))
    # Neighbouring flags differ exactly at the start and at the end of a run.
    boundaries = np.flatnonzero(np.diff(padded) != 0)
    return boundaries.reshape(-1, 2)

def summarise_big_gaps(samfile):
    """Tabulate the zero-coverage runs of every contig in ``accession_list_chr``.

    For each listed contig the summed coverage is computed with
    ``count_coverage`` and scanned with ``zero_runs``. Every run becomes one
    row of the result.

    Each row is labelled with the contig that was actually queried. Labels are
    deliberately not read from ``samfile.references`` by position: that only
    gives correct names when the header lists exactly these contigs first and
    in the same order.

    Parameters
    ----------
    samfile
        Alignment handle accepted by ``count_coverage``.

    Returns
    -------
    pandas.DataFrame
        Columns ``contig``, ``start`` and ``end``; ``start``/``end`` are the
        half-open ``[start, end)`` index pairs produced by ``zero_runs``.
        Rows follow the order of ``accession_list_chr``.
    """
    contigs, starts, ends = [], [], []
    for contig in accession_list_chr:
        runs = zero_runs(count_coverage(samfile, contig))
        # One label per run, so all three columns stay the same length.
        contigs.extend([contig] * len(runs))
        starts.extend(runs[:, 0])
        ends.extend(runs[:, 1])
    return pd.DataFrame({
        'contig': contigs,
        'start': starts,
        'end': ends,
    })


In [None]:
# Gap table for the first opened alignment, written to the first output path
# (assumes inputs and outputs are declared in matching order — TODO confirm).
df = summarise_big_gaps(samfiles[0])
df.to_csv(perc5)
# A notebook cell renders only its last expression, so the preview goes last.
df.head()

In [None]:
# Gap table for the second opened alignment, written to the second output path
# (assumes inputs and outputs are declared in matching order — TODO confirm).
df = summarise_big_gaps(samfiles[1])
df.to_csv(perc1)
# A notebook cell renders only its last expression, so the preview goes last.
df.head()