In [35]:
from pathlib import Path
import pysam
import numpy as np
import os
import pandas as pd

First, we read the BAM file:

In [2]:
# Relative path to the BAM file holding the alignments analysed below.
# NOTE(review): the ".sorted" suffix suggests it is coordinate-sorted — confirm.
infile = "../../results/aligned/chr_aln_best_5_perc.sorted.bam"
# Open the BAM for reading ("rb" = read, binary). The handle stays open because
# later cells query it per contig.
# NOTE(review): it is never closed in the visible cells; per-contig queries
# presumably also need a BAM index next to the file — confirm.
samfile = pysam.AlignmentFile(infile, "rb")

And define our functions:

In [3]:
def calculate_window_coverage(mat, window_size):
    """Sum a count matrix over groups of four rows and over column windows.

    Rows are first added together in consecutive groups of four (in this
    notebook the four rows are the per-base count arrays of one contig, so
    this yields the total count at each column). The columns of that result
    are then added together in consecutive, non-overlapping windows of
    ``window_size`` columns; the final window is shorter when the number of
    columns is not a multiple of ``window_size``.

    Parameters
    ----------
    mat : 2-D array-like (``np.ndarray`` or ``np.matrix``)
        Count matrix with one column per position.
    window_size : int
        Number of columns per window.

    Returns
    -------
    2-D array with one row per group of four input rows and one column per
    window.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    row_group_starts = np.arange(0, n_rows, 4)
    window_starts = np.arange(0, n_cols, window_size)

    # Collapse each group of four rows into one total per column ...
    per_column_total = np.add.reduceat(mat, row_group_starts, axis=0)
    # ... then collapse the columns window by window.
    return np.add.reduceat(per_column_total, window_starts, axis=1)
def calculate_norm_frequency(mat, window_size):
    """Sum the squared per-column fractions of a count matrix over windows.

    Every entry of ``mat`` is divided by the total of its column (obtained
    from ``calculate_window_coverage`` with a window of 1), the resulting
    fractions are squared element-wise, and the squares are then summed over
    the rows and over windows of ``window_size`` columns.

    A column whose total is zero makes the division produce NaN; the callers
    in this notebook add a pseudo-count of 1 to the matrix beforehand.
    NOTE(review): the division relies on ``mat`` having a single group of
    four rows so that the column totals broadcast — confirm for other inputs.

    Parameters
    ----------
    mat : 2-D array-like (``np.ndarray`` or ``np.matrix``)
        Count matrix with one column per position.
    window_size : int
        Number of columns per window.

    Returns
    -------
    2-D array with one value per window.
    """
    column_totals = calculate_window_coverage(mat, 1)
    fractions = np.divide(mat, column_totals)
    # np.square is element-wise even for np.matrix input (unlike ``**``).
    squared_fractions = np.square(fractions)
    return calculate_window_coverage(squared_fractions, window_size)

Let's inspect them with a toy example:

In [5]:
# Toy count matrix: 4 rows x 3 columns, the same layout the functions receive
# for real data (one row per count track, one column per position).
# A plain ndarray is used instead of np.matrix, which NumPy no longer
# recommends; the printed results are identical.
fake_mat = np.array(
    [[12, 6, 0], [0, 0, 0], [0, 0, 0], [1, 5, 0]]
)
print(calculate_window_coverage(fake_mat, 1))  # total per column
print(calculate_window_coverage(fake_mat, 4))  # a single window covering all 3 columns

[[13 11  0]]
[[24]]


In [6]:
# Same toy matrix with a pseudo-count of 1 added to every cell (avoids dividing
# by zero in the all-zero column), evaluated per column and as one window.
for toy_window in (1, 4):
    print(calculate_norm_frequency(fake_mat + 1, toy_window))

[[0.60553633 0.38666667 0.25      ]]
[[1.242203]]


## Window length 1000

Now, we need to decide on a window length:

In [7]:
# Accession IDs of the contigs to process; each one is passed as `contig=`
# to the coverage query below.
# NOTE(review): presumably the chromosome-level sequences of the reference — confirm.
accession_list_chr = [
    "CM044164.1",
    "CM044165.1",
    "CM044166.1",
    "CM044167.1",
    "CM044168.1",
    "CM044169.1",
    "CM044170.1",
    "CM044171.1",
    "CM044172.1",
    "CM044173.1",
    "CM044174.1",
    "CM044175.1",
    "CM044176.1",
]

In [36]:
window_size = 1000
output_path = "windows_exp_1000.csv"
# Compute the windowed statistics one contig at a time and stream them to a
# single CSV. The first contig (re)creates the file and writes the header,
# later contigs append to it, so re-running this cell regenerates the file
# instead of appending duplicate rows to a previous run's output.
# The loop variable is named `contig` so the builtin `chr` is not shadowed.
for contig_idx, contig in enumerate(accession_list_chr):
    print(f"Starting with  {contig}")
    # count_coverage returns four per-position count arrays (one per base);
    # quality_threshold=0 keeps bases regardless of base quality.
    coverage = samfile.count_coverage(contig=contig, quality_threshold=0)
    # Stack them as a 4 x n_positions ndarray (np.matrix is no longer recommended).
    coverage_mat = np.array([coverage[0], coverage[1], coverage[2], coverage[3]])
    data = {
        'window_coverage': calculate_window_coverage(coverage_mat, window_size).tolist()[0],
        # +1 pseudo-count so positions without coverage do not divide by zero.
        'window_freq': calculate_norm_frequency(coverage_mat + 1, window_size).tolist()[0]
    }
    df = pd.DataFrame(data)
    df['contig'] = contig
    is_first_contig = contig_idx == 0
    df.to_csv(
        output_path,
        mode='w' if is_first_contig else 'a',
        header=is_first_contig,
    )


Starting with  CM044164.1
Starting with  CM044165.1
Starting with  CM044166.1
Starting with  CM044167.1
Starting with  CM044168.1
Starting with  CM044169.1
Starting with  CM044170.1
Starting with  CM044171.1
Starting with  CM044172.1
Starting with  CM044173.1
Starting with  CM044174.1
Starting with  CM044175.1
Starting with  CM044176.1
