# Analyse correlations in alignment of reads to human and arabidopsis

This notebook performs an analysis of a pair of full BAM files, one aligned to human genome and one to Arabidopsis.




* Run the bash script before using this notebook.

## Supporting code

In [1]:
import pysam
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Code from "small fragments" -- do we need it here?

In [2]:
def get_read_pairs(bam, want_n_pairs, print_qname=False):
    reads = []
    for count, r in enumerate( bam.fetch()):
        if not r.is_read1 or r.mapping_quality < 20 or not r.is_proper_pair:
            continue
        try:
            m8 = bam.mate(r)
        except ValueError:
            continue
        if m8.mapping_quality < 20 or abs(m8.reference_start - r.reference_start) > 3000:
            continue
        if print_qname:
            print("\n", r.query_name)
        reads.append((r, m8))
        if len(reads) == want_n_pairs:
            break
    return reads

In [3]:
sample = "NEB-100ng-1"
tags = ['arabidopsis', 'human']
bams = [pysam.AlignmentFile(f"data/{sample}-{tag}-qsort.bam") for tag in tags]

In [4]:
def next_valid_read(it):
    for r in it:
        if r.is_read1 and r.is_proper_pair and r.mapping_quality > 20 and not r.is_secondary:
            return r
    return None

In [5]:
# 2-element lists for current iterators and associated data:
# The largest qname should always be in the second element. Will swap the elements if it's not.
iters = [bam.fetch(until_eof=True) for bam in bams]
current_read = [None, next_valid_read(iters[1])]
stats_object = [{'data': tag, 'unique': 0, 'both': 0, 'index':i} for i, tag in enumerate(tags)]
# index above is used to uniquely enter the numpy array for size correlations
MAX_LEN = 1000
size_corr = np.zeros((MAX_LEN, MAX_LEN))

while current_read[1]:
    current_read[0] = next_valid_read(iters[0])
    if not current_read[0]:
        break
    elif current_read[0].query_name > current_read[1].query_name:
        # We found a greater one in the current iterator (which was behind before).
        # Mark the read with qname less, in iterator index 1, as unique to that file.
        stats_object[1]['unique'] += 1
        # Then swap the order.
        stats_object = stats_object[1:] + stats_object[:1]
        iters = iters[1:] + iters[:1]
        current_read = current_read[1:] + current_read[:1]
    elif current_read[0].query_name == current_read[1].query_name:
        lengths = [min(9999, abs(current_read[stats_object[x]['index']].tlen)) for x in [0,1]]
        size_corr[lengths] += 1
        stats_object[0]['both'] += 1
        stats_object[1]['both'] += 1
        current_read[1] = next_valid_read(iters[1])
    else:
        # Our current iterator is still behind. The read we just processed was unique to our file.
        stats_object[0]['unique'] += 1    

In [6]:
stats_object

[{'data': 'human', 'unique': 38131, 'both': 0, 'index': 1},
 {'data': 'arabidopsis', 'unique': 55399, 'both': 0, 'index': 0}]