In [2]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"

In [3]:
# Load the run parameters through nbwrapper's getargs(). Later cells read the
# keys "events", "ref", "out_basename" and "ncores" from this mapping.
args = getargs()



In [4]:
## for testing only
# NOTE(review): the active `args = {...}` assignment below unconditionally
# replaces the value obtained from getargs() in the previous cell, so the
# notebook always runs on these hard-coded lambda paths. Comment it out (like
# the first dict) to honour externally supplied arguments.
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
#     "ncores": 24
# }

args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.template.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment_template",
    "ncores": 62
}

In [5]:
# Length k of the k-mers used when mapping events onto the alignment; read by
# event_indexes, is_correct_kmer and process_events.
NMER = 5

In [6]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [7]:
# Fail fast, and say which input is wrong, before any expensive work starts.
assert os.path.isfile(args["events"]), "events pickle not found: {0}".format(args["events"])
assert os.path.isfile(args["ref"]), "reference file not found: {0}".format(args["ref"])
assert args["ncores"], "ncores must be set to a non-zero number of workers"

In [8]:
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load event pickles that were produced by this pipeline.
# The context manager guarantees the handle is closed even if loading fails.
with open(args["events"], 'rb') as events_handle:
    file_data = pickle.load(events_handle)
# Drop the None entries so that every remaining element is a usable record.
file_data = [f for f in file_data if f is not None]

In [9]:
ref_file = args["ref"]
# Parse the FASTA in pure Python instead of shelling out to cat/grep.
# The previous version kept only the FIRST sequence line (`ref = ref[0]`),
# which silently truncates the reference whenever the FASTA is line-wrapped.
test = []       # header lines (variable name kept from the original cell)
ref_lines = []  # sequence lines belonging to the first record
with open(ref_file) as fasta_handle:
    for fasta_line in fasta_handle:
        fasta_line = fasta_line.rstrip()
        if fasta_line.startswith(">"):
            test.append(fasta_line)
        elif fasta_line and len(test) <= 1:
            # Only the first record is used as reference; later records are
            # ignored, as before.
            ref_lines.append(fasta_line)
print(test)
ref = "".join(ref_lines)
assert ref, "no sequence found in reference file: {0}".format(ref_file)
print(ref[:100])

['>burn-in lambda_ref']
GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC


In [10]:
# Path of the FASTQ file derived from the output basename.
fastq_file = "{0}.fastq".format(args["out_basename"])
# mk_fastq is not defined in this notebook (presumably it comes from the
# alignment_lib notebook loaded via %run); presumably it writes the reads
# contained in file_data to fastq_file -- TODO confirm.
mk_fastq(fastq_file, file_data)

In [11]:
# Path of the SAM file derived from the output basename.
sam_file = "{0}.sam".format(args["out_basename"])
# graphmap is not defined in this notebook (presumably provided by
# alignment_lib); it is called with the reference, the FASTQ, the SAM path
# and the number of cores -- presumably aligning the reads and writing
# sam_file; TODO confirm.
graphmap(ref_file, fastq_file, sam_file, args["ncores"])

[Index 10:24:18] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 10:24:18] Index already exists. Loading from file.
[Index 10:24:19] Secondary index already exists. Loading from file.
[Index 10:24:19] Index loaded in 0.56 sec.
[Index 10:24:19] Memory consumption: [currentRSS = 516 MB, peakRSS = 6234 MB]

[Run 10:24:19] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 10:24:19] Reference genome is assumed to be linear.
[Run 10:24:19] Only one alignment will be reported per mapped read.
[ProcessReads 10:24:19] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 10:24:19] Batch of 6123 reads (64 MiB) loaded in 0.47 sec. (15959224 bases)
[ProcessReads 10:24:19] Memory consumption: [currentRSS = 582 MB, peakRSS = 6234 MB]
[ProcessReads 10:24:19] Using 62 threads.
[ProcessReads 10:25:17] [CPU time: 1953.24 sec, RSS: 642 MB] Read: 6123/6123 (100.00%) [m: 5709, u: 414]    

In [12]:
# prepare_sam is not defined in this notebook (presumably provided by
# alignment_lib). The recorded output shows it returning the path of
# "<out_basename>.sorted.bam", which the next cell opens with pysam.
prepare_sam(args["out_basename"])

'/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment_template.sorted.bam'

In [13]:
# Open "<out_basename>.sorted.bam" with pysam; the alignments are read from
# this handle further down via samfile.fetch().
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): samreads is not referenced again in the visible part of this
# notebook -- possibly a leftover; confirm before removing.
samreads = {}

In [14]:
print(len(file_data))

6123


In [15]:
# prepare_filemap is not defined in this notebook (presumably provided by
# alignment_lib); presumably it builds the lookup that get_file() uses later
# to find a record by channel and file id -- TODO confirm.
prepare_filemap(file_data)

In [16]:
# Materialise every record yielded by samfile.fetch() into a list so it can be
# counted here and enumerated later.
reads = list(samfile.fetch())
len(reads)

5709

In [17]:
class AlignmentEndException(Exception):
    """Raised when fewer non-gap positions than requested remain in the
    pairing sequence."""
    pass


def event_indexes(pairing_seq, offset, nmer=None):
    """Return the indexes of the next `nmer` non-gap entries of `pairing_seq`.

    Scanning starts at index `offset`; entries that are None (gaps) are
    skipped and do not count towards the k-mer.

    Parameters
    ----------
    pairing_seq : sequence
        One column of the aligned pairs; None marks a gap.
    offset : int
        Index in `pairing_seq` at which the scan starts.
    nmer : int, optional
        Number of non-gap entries to collect. Defaults to the module-level
        NMER, which preserves the behaviour of existing two-argument calls.

    Returns
    -------
    list of int
        Indexes into `pairing_seq`, in ascending order, of the collected
        non-gap entries.

    Raises
    ------
    AlignmentEndException
        If the sequence ends before `nmer` non-gap entries were found.
    """
    if nmer is None:
        nmer = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # len(kmer) replaces the former separate counter, whose name `count`
        # shadowed the imported itertools.count.
        if len(kmer) == nmer:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != nmer:
        raise AlignmentEndException
    return kmer
    
    

In [18]:
def gapmove(to_move, seq, offset):
    """Translate a move (from metrichor) into a step in the aligned sequence.

    Walks `seq` from `offset`, counting every entry passed. Only non-gap
    entries (not None) consume the requested move, so the returned step is
    `to_move` plus the number of gaps that had to be skipped. A move of 0
    still advances past leading gaps to the next non-gap entry.

    Parameters
    ----------
    to_move : int
        Number of non-gap positions to advance.
    seq : list
        One column of the aligned pairs; None marks a gap.
    offset : int
        Current index in `seq`.

    Returns
    -------
    int
        The amount to add to `offset`. If `seq` ends before the move could be
        completed, the distance to the end of `seq` is returned (never less
        than 0), so that the updated offset points past the last entry.
        Previously the function fell through and returned None in that case,
        which made the caller's `offset += gapmove(...)` fail with TypeError.
    """
    move = to_move
    for i in seq[offset:]:
        if i is None:
            # a gap does not consume the move, but the index still advances
            move += 1
        else:
            to_move -= 1
            if to_move <= 0:
                return move
    # ran off the end of the alignment before the move was completed
    return max(len(seq) - offset, 0)

In [19]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Parameters
    ----------
    index : iterable of int
        Indexes into `pairs`.
    pairs : sequence
        Maps an alignment index to a position in `seq`, or to None for a gap.
    seq : str
        The nucleotide sequence.

    Returns
    -------
    str
        The concatenated nucleotides. Gaps are converted into '' as
        documented; previously a gap raised TypeError because `seq[None]`
        was evaluated.
    """
    seq_index = (pairs[x] for x in index)
    return "".join(seq[pos] for pos in seq_index if pos is not None)


In [20]:
def is_consecutive_seq(seq):
    """Tell whether `seq` is a run of consecutive, ascending numbers.

    Each value is compared with its position: in a consecutive run the
    difference value - position is the same everywhere. An empty or
    single-element sequence counts as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [21]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer corresponds completely with the reference.

    This is the case if:
        * the read positions are consecutive (no indels)
        * the ref positions are consecutive (no indels)
        * the nucleotides are identical (no substitutions)

    `ev_index` holds NMER indexes into the two columns of `pairs`
    (pairs[0]: read positions, pairs[1]: reference positions; None marks a
    gap). `read` and `ref` are the sequences those positions index into.
    Returns True for a full match, False otherwise.
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][i] for i in ev_index]
    ref_positions = [pairs[1][i] for i in ev_index]

    # A gap or a jump on either side means an indel: first the read, then
    # the reference.
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # No indels: it is a full match unless a base was substituted.
    read_bases = [read[pos] for pos in read_positions]
    ref_bases = [ref[pos] for pos in ref_positions]
    return read_bases == ref_bases
    

In [22]:
def process_events(pairs, file_obj):
    """Collect the events of one read whose k-mer matches the reference exactly.

    Parameters
    ----------
    pairs : list
        Two parallel lists: pairs[0] holds read positions and pairs[1]
        reference positions of the alignment; None marks a gap.
    file_obj : mapping
        Record of the read. Uses the keys "events" (an object providing
        to_dict("records"), presumably a pandas DataFrame), "fastq" (FASTQ
        text whose second line is the called sequence) and "channel".
        Every event record must provide "kmer" and "move".

    Returns
    -------
    list of dict
        The event records for which is_correct_kmer() holds, each extended
        by "channel" and "ref_position".

    Notes
    -----
    Reads the module-level `ref` (reference sequence) and `NMER`.
    Processing stops at the first event that lies beyond the aligned part
    of the read.
    """
    event_list = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev_kmer = ev["kmer"]
        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # gapmove could not complete the move because the alignment
            # ended; previously this crashed with `i_seq += None`.
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        # sanity check: the k-mer rebuilt from the alignment must equal the
        # k-mer stored with the event
        assert(read_kmer == ev_kmer), (ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            # reference position of the CENTRE base of the k-mer (not the
            # first one, as the old comment claimed)
            ev["ref_position"] = pairs[1][ev_index[NMER // 2]]
            correct.append(ev)
    return correct

In [23]:
# Running total of events over all aligned reads (denominator of the yield
# computed further down).
total_events = 0

# Worker pool; process_events is executed asynchronously, one task per read.
p = Pool(args["ncores"])

# AsyncResult handles, in the same order as `reads`; collected in a later cell.
result = []
try:
    for i, read in enumerate(reads):
        # get_file_and_channel / get_file are not defined in this notebook
        # (presumably provided by alignment_lib).
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose [(read_pos, ref_pos), ...] into [read_positions, ref_positions]
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)

        result.append(p.apply_async(process_events,
                                    [pairs, file_obj]))

except KeyboardInterrupt:
    p.terminate()
else:
    # No further tasks will be submitted: let the workers exit once the
    # queued tasks are done instead of leaving them running. Results already
    # submitted remain retrievable through result[i].get().
    p.close()
    

0 16 107 ch107_file16_read
1 24 106 ch106_file24_read
2 6 132 ch132_file6_read
3 1 132 ch132_file1_read
4 6 154 ch154_file6_read
5 15 179 ch179_file15_read
6 9 180 ch180_file9_read
7 18 182 ch182_file18_read
8 5 165 ch165_file5_read
9 20 180 ch180_file20_read
10 10 214 ch214_file10_read
11 7 23 ch23_file7_read
12 1 247 ch247_file1_read
13 6 254 ch254_file6_read
14 1 26 ch26_file1_read
15 17 27 ch27_file17_read
16 27 280 ch280_file27_read
17 1 282 ch282_file1_read
18 10 296 ch296_file10_read
19 10 29 ch29_file10_read
20 1 300 ch300_file1_read
21 20 29 ch29_file20_read
22 13 283 ch283_file13_read
23 4 30 ch30_file4_read
24 7 304 ch304_file7_read
25 22 283 ch283_file22_read
26 5 297 ch297_file5_read
27 0 323 ch323_file0_read
28 0 320 ch320_file0_read
29 17 31 ch31_file17_read
30 13 323 ch323_file13_read
31 24 333 ch333_file24_read
32 4 333 ch333_file4_read
33 8 33 ch33_file8_read
34 51 33 ch33_file51_read
35 23 355 ch355_file23_read
36 15 355 ch355_file15_read
37 3 356 ch356_file3_read
38

In [24]:
# Wait for every worker task and flatten the per-read event lists into one
# list, keeping the submission order.
per_read_events = [async_result.get() for async_result in result]
true_events = [event for read_events in per_read_events for event in read_events]

In [25]:
len(true_events)

2166596

In [26]:
print(total_events)

35868734


In [27]:
# Fraction of the events of all aligned reads that were kept as exact matches
# to the reference.
print(len(true_events)/total_events)

0.060403470052776326


In [28]:
# Write the matching events to "<out_basename>_true_events.pickle". The
# context manager flushes and closes the file deterministically; the previous
# version never closed the handle explicitly.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_handle:
    pickle.dump(true_events, out_handle)