In [17]:
import itertools
import math
import subprocess
from tempfile import NamedTemporaryFile

import numpy as np
import pysam
# Significance level for alignments. readstats() flags a read as significant
# when the p-value derived from its E-value is below SIGNIFICANCE / total_reads
# (i.e. the level is divided by the number of reads, Bonferroni-style).
SIGNIFICANCE = 0.05

In [18]:
from multiprocessing import Pool
import pysam
import pandas
from IPython.display import display

In [19]:
def readstats(args):
    """Collect per-read alignment statistics for one chunk of an alignment file.

    Written as a worker for ``multiprocessing.Pool.map``: every call opens its
    own handle on the alignment file, skips the reads that belong to earlier
    chunks and processes at most ``chunk`` reads.

    Args:
        args: a single tuple ``(samfile_path, read_id, total_reads, ref_seq,
            chunk)`` (one argument, so the function can be used with ``map``):

            samfile_path: path handed to ``pysam.AlignmentFile``.
            read_id: number of reads yielded by ``fetch()`` to skip before
                the chunk starts.
            total_reads: number of reads the significance level is divided by.
            ref_seq: reference sequence, indexed by aligned reference position.
            chunk: maximum number of reads to process.

    Returns:
        list of dicts, one per processed read, with the keys ``read_len``,
        ``mapping_quality``, ``aln_len``, ``aln_score``, ``aln_evalue``,
        ``aln_editdistance``, ``mapped_nts``, ``ins``, ``del``, ``subst`` and
        ``is_significant``. The list is empty when ``read_id`` lies beyond the
        last read.

    Raises:
        AssertionError: if the read length tag (ZQ), the alignment length and
            the number of mapped nucleotides are inconsistent with each other.
    """
    samfile_path, read_id, total_reads, ref_seq, chunk = args

    out = []
    samfile = pysam.AlignmentFile(samfile_path)
    try:
        # islice both skips the reads of earlier chunks and stops after
        # `chunk` reads; it simply yields nothing if the file is shorter.
        for read in itertools.islice(samfile.fetch(), read_id, read_id + chunk):
            # Total length per CIGAR operation: 0 = match/mismatch,
            # 1 = insertion, 2 = deletion. Other operations are ignored.
            op_totals = {0: 0, 1: 0, 2: 0}
            for op, length in read.cigartuples:
                if op in op_totals:
                    op_totals[op] += length

            # Fetch once instead of once per aligned pair.
            query_seq = read.query_sequence
            evalue = read.get_tag("ZE")

            s = {
                "read_len": read.get_tag("ZQ"),
                "mapping_quality": read.mapping_quality,
                "aln_len": read.alen,
                "aln_score": read.get_tag("AS"),
                "aln_evalue": evalue,
                "aln_editdistance": read.get_tag("NM"),
                "mapped_nts": op_totals[0],
                "ins": op_totals[1],
                "del": op_totals[2],
                # aligned positions where the read base differs from the reference
                "subst": sum(1 for query, ref in read.get_aligned_pairs(matches_only=True)
                             if query_seq[query] != ref_seq[ref]),
                "is_significant": e2p(evalue) < SIGNIFICANCE / total_reads
            }
            assert s["read_len"] >= s["mapped_nts"]
            assert s["aln_len"] >= s["mapped_nts"]
            assert len(query_seq) == s["read_len"]
            out.append(s)
    finally:
        # Each worker opens its own handle; release it even on failure.
        samfile.close()
    return out

class samstats:
    """Generate statistics for an alignment (BAM) file.

    Attributes:
        ref_seq: the reference sequence passed to the constructor.
        file_stats: dict with the file-level counts ``total_reads``,
            ``mapped_reads`` and ``total_nts``.
        read_stats: flat list of the per-read dicts produced by ``readstats``.
    """
    def __init__(self, samfile_name, ref_seq, ncores=4):
        """
        Args:
            samfile_name: path to sorted BAM file
            ref_seq: (str) reference sequence
            ncores: number of worker processes used to compute the per-read
                statistics

        Raises:
            RuntimeError: if ``samtools view`` exits with a non-zero status.
            ValueError: if a record has no parsable 17th column.
        """
        self.ref_seq = ref_seq
        # samfile.unmapped reportedly does not work here, so the total number
        # of reads is counted from the `samtools view` output instead.
        total_reads, total_nts = self._count_reads_and_nts(samfile_name)

        samfile = pysam.AlignmentFile(samfile_name)
        try:
            mapped_reads = samfile.mapped
        finally:
            samfile.close()

        self.file_stats = {
            "total_reads": total_reads,
            "mapped_reads": mapped_reads,
            "total_nts": total_nts
        }

        read_stats = []
        if mapped_reads > 0:
            # Without this guard chunk would be 0 and range() would reject
            # the zero step.
            chunk = int(math.ceil(mapped_reads / float(ncores)))
            args = [(samfile_name, i, total_reads, ref_seq, chunk)
                    for i in range(0, mapped_reads, chunk)]
            p = Pool(ncores)
            try:
                read_stats = p.map(readstats, args)
            except BaseException:
                # Includes KeyboardInterrupt: stop the workers and let the
                # error propagate instead of continuing without results.
                p.terminate()
                raise
            else:
                p.close()
            finally:
                p.join()

        self.read_stats = list(itertools.chain(*read_stats))

    @staticmethod
    def _count_reads_and_nts(samfile_name):
        """Count the records and sum the read lengths reported by `samtools view`.

        Streams the output of ``samtools view <samfile_name>`` once, without a
        shell. The read length is taken from the third ``:``-separated part of
        the 17th tab-separated column.
        NOTE(review): presumably that column is the ``ZQ:i:<len>`` tag also
        read in readstats() -- confirm that the tag order is stable.

        Returns:
            tuple ``(total_reads, total_nts)``.
        """
        proc = subprocess.Popen(["samtools", "view", samfile_name],
                                stdout=subprocess.PIPE)
        total_reads = 0
        total_nts = 0
        try:
            for line in proc.stdout:
                total_reads += 1
                fields = line.rstrip(b"\r\n").split(b"\t")
                try:
                    total_nts += int(fields[16].split(b":")[2])
                except (IndexError, ValueError):
                    raise ValueError(
                        "record %d of %s has no integer in column 17"
                        % (total_reads, samfile_name))
        finally:
            proc.stdout.close()
            returncode = proc.wait()
        if returncode != 0:
            raise RuntimeError("samtools view failed with exit status %d for %s"
                               % (returncode, samfile_name))
        return total_reads, total_nts

    def sumstat(self, stat):
        """sum of the stat <stat> for all reads."""
        return sum(r[stat] for r in self.read_stats)

    def print_summary(self):
        """Build the summary table (despite the name, nothing is printed).

        Returns:
            list of rows ``[label, numerator, denominator, percentage]`` with
            every cell converted to ``str``. The percentage is ``"nan%"`` when
            the denominator is zero.
        """
        lines = [
            ["mapped_reads/total_reads", self.file_stats["mapped_reads"], self.file_stats["total_reads"]],
            ["significant_reads/total_reads", self.sumstat("is_significant"), self.file_stats["total_reads"]],
            ["mapped_nts/total_nts", self.sumstat("mapped_nts"), self.file_stats["total_nts"]],
            ["editdistance/alignment_length", self.sumstat("aln_editdistance"), self.sumstat("aln_len")],
            ["alignment_score/alignment_length", self.sumstat("aln_score"), self.sumstat("aln_len")],
            ["SNPs/mapped_nts", self.sumstat("subst"), self.sumstat("mapped_nts")],
            ["ins/mapped_nts", self.sumstat("ins"), self.sumstat("mapped_nts")],
            ["del/mapped_nts", self.sumstat("del"), self.sumstat("mapped_nts")],
        ]
        def process_line(line):
            numerator, denominator = line[1], line[2]
            # An empty file or a file without mapped reads has zero denominators.
            ratio = float(numerator) / denominator if denominator else float("nan")
            return [str(x) for x in (line + ["{0:%}".format(ratio)])]
        return [process_line(line) for line in lines]

In [20]:
def mk_consensus(bam_file, ref_file): 
    """calculate the consensus sequence of a bam file"""
    cons_fq = !samtools mpileup -uf {ref_file} {bam_file} | \
        bcftools view -cg - | \
        /home/ibis/gregor.sturm/nanopore/tools/bcftools/vcfutils.pl vcf2fq
    i = [i for i, line in enumerate(cons_fq) if line[0] == "@"][0]
    return cons_fq, i

In [21]:
def needle(ref_seq, target_seq):    
    """needleman-wunsch global alignment of two sequences"""
    with NamedTemporaryFile('w') as ref_file:
        with NamedTemporaryFile('w') as target_file: 
            with NamedTemporaryFile('r+') as output_file: 
                ref_file.write(ref_seq)
                target_file.write(target_seq)
                target_file.flush()
                ref_file.flush()
                !needle -asequence {ref_file.name} -bsequence {target_file.name} -aformat score -outfile {output_file.name}
                out = output_file.readlines()
    
    return out

In [22]:
def e2p(e):
    """Convert Evalue to Pvalue (of Alignment).

    Computes ``p = 1 - exp(-e)``.

    Args:
        e: E-value (scalar or numpy array).

    Returns:
        The corresponding p-value(s) as returned by numpy.
    """
    # expm1 keeps precision for very small E-values, where 1 - exp(-e)
    # would cancel to exactly 0.
    return -np.expm1(-e)

In [23]:
# ref_file = "../../../david_eccles_bc_ideas/mouse_ref.fa"
# test = !cat {ref_file} | grep ">"
# print(test)
# ref = !cat {ref_file} | grep -v ">"
# ref = ref[0]
# print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [24]:
# sst = samstats("./david_calling.called.sorted.bam", ref)
# display(pandas.DataFrame(sst.print_summary()))

Unnamed: 0,0,1,2,3
0,mapped_reads/total_reads,72,81,88.888889%
1,significant_reads/total_reads,32,81,39.506173%
2,mapped_nts/total_nts,315365,443207,71.155239%
3,editdistance/alignment_length,191698,333197,57.532931%
4,alignment_score/alignment_length,1030,333197,0.309126%
5,SNPs/mapped_nts,83649,315365,26.524503%
6,ins/mapped_nts,90166,315365,28.590998%
7,del/mapped_nts,17832,315365,5.654400%
