In [30]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"

In [31]:
# Run parameters; later cells index this mapping by "events", "ref",
# "out_basename" and "ncores". Presumably filled in by the nbwrapper runner --
# TODO confirm what getargs() returns when the notebook is run interactively.
args = getargs()



In [32]:
## for testing only
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
#     "ncores": 24
# }

# NOTE(review): this assignment is NOT commented out, so it unconditionally
# replaces the `args` obtained from getargs() with the hard-coded, machine-specific
# paths below. Disable it before running the notebook with real arguments.
args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.2D.62pA.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.62pA",
    "ncores": 62
}

In [33]:
# Length of the k-mers handled below: event_indexes() collects NMER aligned
# positions and is_correct_kmer() asserts that its index list has this length.
NMER = 5

In [34]:
# Show the kernel's current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [35]:
# Fail early, naming the offending value, when the run parameters are unusable.
assert os.path.isfile(args["events"]), \
    "events file not found: {0}".format(args["events"])
assert os.path.isfile(args["ref"]), \
    "reference file not found: {0}".format(args["ref"])
assert args["ncores"], "ncores must be a non-zero number of worker processes"

In [36]:
# NOTE(review): unpickling can execute code embedded in the file; only load
# event pickles that were produced by a trusted pipeline step.
# The with-block closes the handle as soon as the data is loaded.
with open(args["events"], 'rb') as events_handle:
    file_data = pickle.load(events_handle)
# Drop the None placeholders so later cells only see usable entries.
file_data = [f for f in file_data if f is not None]

In [37]:
ref_file = args["ref"]
# Read the FASTA in Python instead of `cat | grep`, so the path is never
# interpreted by a shell.
with open(ref_file) as fasta_handle:
    fasta_lines = [line.rstrip("\r\n") for line in fasta_handle]
# Header lines (same selection as `grep ">"`).
test = [line for line in fasta_lines if ">" in line]
print(test)
# Join every sequence line: a line-wrapped FASTA spreads the sequence over many
# lines, and keeping only the first one would truncate the reference.
# NOTE(review): with more than one record this concatenates all of them, so the
# reference positions used below are only meaningful for a single-record file --
# confirm the inputs.
ref = "".join(line for line in fasta_lines if ">" not in line)
print(ref[:100])

['>burn-in lambda_ref']
GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC


In [38]:
# Write the loaded reads to "<out_basename>.fastq" for the aligner.
# mk_fastq() is not defined in this notebook -- presumably it comes from
# alignment_lib.ipynb (loaded via %run at the top); TODO confirm its output format.
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [39]:
# Align the FASTQ reads against the reference into "<out_basename>.sam", passing
# the configured core count. graphmap() is not defined in this notebook --
# presumably a wrapper around the aligner from alignment_lib.ipynb; TODO confirm.
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(ref_file, fastq_file, sam_file, args["ncores"])

[Index 16:15:19] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 16:15:19] Index already exists. Loading from file.
[Index 16:15:19] Secondary index already exists. Loading from file.
[Index 16:15:19] Index loaded in 0.45 sec.
[Index 16:15:19] Memory consumption: [currentRSS = 516 MB, peakRSS = 9592 MB]

[Run 16:15:19] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 16:15:19] Reference genome is assumed to be linear.
[Run 16:15:19] Only one alignment will be reported per mapped read.
[ProcessReads 16:15:19] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 16:15:19] Batch of 746 reads (8 MiB) loaded in 0.06 sec. (21038248 bases)
[ProcessReads 16:15:19] Memory consumption: [currentRSS = 525 MB, peakRSS = 9592 MB]
[ProcessReads 16:15:19] Using 62 threads.
[ProcessReads 16:15:25] [CPU time: 190.97 sec, RSS: 585 MB] Read: 746/746 (100.00%) [m: 740, u: 6]            

In [40]:
# Post-process the alignment output; the value displayed below is the path of
# "<out_basename>.sorted.bam", which the next cell opens.
# prepare_sam() is not defined in this notebook -- presumably from alignment_lib.ipynb.
prepare_sam(args["out_basename"])

'/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.62pA.sorted.bam'

In [41]:
# Open the sorted BAM written by the previous step.
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): samreads is not used anywhere in the visible part of this notebook.
samreads = {}

In [42]:
# Number of usable (non-None) entries loaded from the events pickle.
print(len(file_data))

746


In [43]:
# prepare_filemap() is not defined in this notebook (presumably from
# alignment_lib.ipynb); presumably it builds the lookup that get_file() relies on
# in the submission loop further down -- TODO confirm.
prepare_filemap(file_data)

In [44]:
# Materialise every record returned by fetch() so the reads can be counted and
# iterated again later.
reads = list(samfile.fetch())
len(reads)

740

In [45]:
class AlignmentEndException(Exception):
    """Raised when the alignment ends before a complete k-mer could be collected."""


def event_indexes(pairing_seq, offset):
    """Return the indexes of the next NMER non-gap entries of the pairing array.

    Scanning starts at index `offset`; entries that are None (gaps) are skipped.

    Raises:
        AlignmentEndException: fewer than NMER non-gap entries remain.
    """
    positions = []
    cursor = offset
    end = len(pairing_seq)
    # Walk forward until the k-mer is complete or the pairing array is exhausted.
    while cursor < end and len(positions) < NMER:
        if pairing_seq[cursor] is not None:
            positions.append(cursor)
        cursor += 1
    if len(positions) != NMER:
        raise AlignmentEndException
    return positions
    
    

In [46]:
def gapmove(to_move, seq, offset):
    """Convert a move counted in bases into a move counted in alignment columns.

    Starting at `seq[offset]`, consume `to_move` non-gap entries (the 'move'
    reported by metrichor) and add one extra column for every gap (None) passed
    on the way.

    Args:
        to_move: number of non-gap entries to advance by.
        seq: pairing array of the read; None marks a gap.
        offset: column at which the move starts.

    Returns:
        The number of columns to add to `offset`. If `seq` ends before the move
        is complete, the result still is an int, and `offset` plus that value
        lies at or beyond `len(seq)`, so a following k-mer lookup finds too few
        entries instead of the caller failing on a None return value.
    """
    columns = to_move
    remaining = to_move
    for entry in seq[offset:]:
        if entry is None:
            # a gap costs a column without consuming a base
            columns += 1
        else:
            remaining -= 1
            if remaining <= 0:
                return columns
    # Alignment exhausted: report the accumulated (out-of-range) move rather
    # than falling through with an implicit None.
    return columns

In [47]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment column indexes into the corresponding nucleotides.

    Args:
        index: alignment column indexes to look up.
        pairs: per-column position in `seq`, or None where the column is a gap.
        seq: the nucleotide sequence.

    Returns:
        The nucleotides at the selected columns joined into one string. Gap
        columns contribute '' (they used to raise a TypeError even though the
        contract promised an empty string).
    """
    positions = (pairs[column] for column in index)
    return "".join(seq[position] for position in positions if position is not None)


In [48]:
def is_consecutive_seq(seq):
    """Tell whether `seq` is a run of numbers each exceeding its predecessor by one.

    Empty and single-element sequences count as consecutive.
    """
    # In a consecutive run, value minus position is the same for every element.
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [49]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer agrees completely with the reference.

    That is the case when
        * the read positions are consecutive (no indel on the read side),
        * the reference positions are consecutive (no indel on the reference side),
        * the nucleotides are identical (no substitution).

    Args:
        ev_index: the NMER alignment columns making up the k-mer.
        pairs: pairs[0] holds the read position and pairs[1] the reference
            position of every alignment column (None marks a gap).
        read: the read sequence.
        ref: the reference sequence.
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_index = [pairs[0][column] for column in ev_index]
    ref_index = [pairs[1][column] for column in ev_index]

    # A gap or a jump on either side means there is an indel inside the k-mer;
    # the read side is checked first, then the reference side.
    for positions in (read_index, ref_index):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # No indels left: a full match unless some base was substituted.
    read_bases = [read[position] for position in read_index]
    ref_bases = [ref[position] for position in ref_index]
    return read_bases == ref_bases
    

In [50]:
def process_events(pairs, file_obj):
    """Collect the events of one read whose k-mer matches the reference exactly.

    Args:
        pairs: two parallel lists, pairs[0] with the read position and pairs[1]
            with the reference position of every alignment column (None = gap).
        file_obj: mapping with the keys "events" (a table offering
            to_dict("records") -- presumably a pandas DataFrame -- whose rows
            carry at least "kmer" and "move"), "fastq" (FASTQ record whose
            second line is the called sequence), "channel" and "file_id".

    Returns:
        A list of event dicts, each extended with "channel", "file_id" and
        "ref_position". Processing stops at the first event that lies beyond
        the end of the alignment.

    Reads the module-level reference sequence `ref`.

    NOTE(review): BAM records of reverse-strand alignments store the
    reverse-complemented read, so their query positions would not index the
    FASTQ sequence in its original orientation -- confirm that such reads are
    excluded or handled upstream.
    """
    event_list = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev["file_id"] = file_obj["file_id"]
        ev_kmer = ev["kmer"]

        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # gapmove ran off the end of the alignment: not the whole read is
            # aligned, so stop instead of failing on `i_seq += None`.
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        # The k-mer recovered through the alignment must equal the event's own k-mer.
        assert read_kmer == ev_kmer, (ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            # reference position of the k-mer's middle base
            ev["ref_position"] = pairs[1][ev_index[NMER // 2]]
            correct.append(ev)
    return correct

In [51]:
total_events = 0

# Worker pool that runs process_events() for the individual reads.
p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # Transpose the (read position, reference position) tuples into two
        # parallel lists: pairs[0] = read side, pairs[1] = reference side.
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)

        # Submit asynchronously; the results are collected in a later cell.
        result.append(p.apply_async(process_events,
                                         [pairs, file_obj]))

except KeyboardInterrupt:
    # Stops the workers immediately; tasks that are still pending will not
    # complete, so their results must not be waited for.
    p.terminate()
else:
    # Everything is submitted: refuse further tasks so the worker processes exit
    # once the queued work is done instead of lingering for the kernel's lifetime.
    # Results already submitted can still be fetched afterwards.
    p.close()
    

0 9 180 ch180_file9_read
1 15 179 ch179_file15_read
2 22 19 ch19_file22_read
3 19 266 ch266_file19_read
4 0 237 ch237_file0_read
5 15 355 ch355_file15_read
6 2 358 ch358_file2_read
7 4 395 ch395_file4_read
8 0 36 ch36_file0_read
9 19 413 ch413_file19_read
10 31 452 ch452_file31_read
11 22 5 ch5_file22_read
12 22 283 ch283_file22_read
13 19 143 ch143_file19_read
14 4 195 ch195_file4_read
15 9 18 ch18_file9_read
16 3 18 ch18_file3_read
17 17 34 ch34_file17_read
18 2 447 ch447_file2_read
19 0 508 ch508_file0_read
20 21 53 ch53_file21_read
21 24 106 ch106_file24_read
22 8 173 ch173_file8_read
23 10 195 ch195_file10_read
24 17 266 ch266_file17_read
25 36 436 ch436_file36_read
26 14 182 ch182_file14_read
27 13 283 ch283_file13_read
28 3 422 ch422_file3_read
29 26 13 ch13_file26_read
30 12 179 ch179_file12_read
31 30 441 ch441_file30_read
32 10 296 ch296_file10_read
33 3 430 ch430_file3_read
34 19 428 ch428_file19_read
35 20 310 ch310_file20_read
36 0 376 ch376_file0_read
37 14 447 ch447_file

In [52]:
# Wait for every submitted task and flatten the per-read lists into one list.
true_events = [event for task in result for event in task.get()]

In [53]:
# Number of events whose k-mer matched the reference exactly.
len(true_events)

1336755

In [54]:
# Number of events summed over all aligned reads in the submission loop.
print(total_events)

6321920


In [55]:
# Fraction of all events that matched the reference exactly.
# NOTE(review): raises ZeroDivisionError when no events were counted (total_events == 0).
print(len(true_events)/total_events)

0.2114476298339745


In [56]:
# Persist the matched events next to the other outputs; the with-block makes sure
# the file is flushed and closed as soon as the dump is complete.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_handle:
    pickle.dump(true_events, out_handle)