In [71]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"

In [72]:
# Run parameters obtained through nbwrapper.getargs(); the cells below read the
# keys "events", "ref", "out_basename" and "ncores" from this mapping.
args = getargs()



In [73]:
## for testing only
# NOTE(review): this assignment unconditionally replaces the value returned by
# getargs() in the previous cell, so the hard-coded paths below are what every
# later cell uses. Disable this cell for parameterised runs.
args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
    "ncores": 24
}

# Alternative test configuration (lambda data set), kept for reference:
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment",
#     "ncores": 62
# }

In [74]:
# k-mer length: number of non-gap alignment positions collected per event
# (used by event_indexes, is_correct_kmer and process_events).
NMER = 5

In [75]:
# Print the kernel's current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [76]:
# Fail early, and say which argument is wrong, before any expensive step runs.
assert os.path.isfile(args["events"]), \
    "events file not found: {0}".format(args["events"])
assert os.path.isfile(args["ref"]), \
    "reference file not found: {0}".format(args["ref"])
assert args["ncores"], "ncores must be set to a non-zero number of worker processes"

In [77]:
# Load the pickled event records. The file is opened in a context manager so the
# handle is closed deterministically instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(args["events"], 'rb') as events_handle:
    file_data = pickle.load(events_handle)
# Drop placeholder entries (None) so later cells only see real records.
file_data = [f for f in file_data if f is not None]

In [78]:
ref_file = args["ref"]

# Read the FASTA in pure Python instead of piping through `cat | grep`, which
# interpolated the path unquoted into a shell command.
# `test` collects every line containing ">" (the header lines, same selection as
# `grep ">"`); `ref` is the sequence of the FIRST record.
# The previous `ref = ref[0]` kept only the first sequence line, which truncates
# the reference whenever the FASTA is line-wrapped. Joining all sequence lines of
# the first record gives the same result for single-line records and the full
# sequence for wrapped ones.
# NOTE(review): only the first record is used; reads aligned to any further
# record would be compared against the wrong sequence -- confirm the reference
# contains a single record.
test = []
first_record_lines = []
with open(ref_file) as fasta_handle:
    for fasta_line in fasta_handle:
        fasta_line = fasta_line.rstrip("\r\n")
        if ">" in fasta_line:
            test.append(fasta_line)
        elif len(test) <= 1:
            # still inside the first record (or before any header)
            first_record_lines.append(fasta_line.strip())
print(test)
ref = "".join(first_record_lines)
assert ref, "no sequence found in reference file {0}".format(ref_file)
print(ref[:100])

['>burn-in lambda_ref']
GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC


In [79]:
# Path of the FASTQ file derived from the output basename; mk_fastq (not defined
# in this notebook) is called with that path and the loaded event records.
fastq_file = "{base}.fastq".format(base=args["out_basename"])
mk_fastq(fastq_file, file_data)

In [80]:
# Path of the SAM file derived from the output basename; graphmap (not defined
# in this notebook) receives reference, reads, output path and core count.
sam_file = "{base}.sam".format(base=args["out_basename"])
graphmap(ref_file, fastq_file, sam_file, args["ncores"])

[Index 14:40:03] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 14:40:03] Index is not prebuilt. Generating index.
[LoadOrGenerate 14:40:03] Started generating new index from file '/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta'...
[LoadOrGenerate 14:40:03] Storing new index to file '/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta.gmidx'...
[LoadOrGenerate 14:40:05] New index stored.
[Index 14:40:05] Secondary index is not prebuilt. Generating index.
[LoadOrGenerate 14:40:05] Started generating new index from file '/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta'...
[LoadOrGenerate 14:40:05] Storing new index to file '/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta.gmidxsec'...
[LoadOrGenerate 14:40:07] New index stored.
[Index 14:40:07] Index loaded in 1.24 sec.
[Index 14:40:07] Memory consumption: [currentRSS = 484 MB, peak

In [81]:
# prepare_sam is not defined in this notebook (presumably it comes from
# alignment_lib.ipynb -- TODO confirm). The recorded cell output shows it
# returning the path of a "<out_basename>.sorted.bam" file, which the next cell opens.
prepare_sam(args["out_basename"])

'/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.sorted.bam'

In [82]:
# NOTE(review): samreads is not referenced anywhere else in the visible notebook.
samreads = {}
# Open the sorted BAM that belongs to the output basename.
samfile = pysam.AlignmentFile("{base}.sorted.bam".format(base=args["out_basename"]))

In [83]:
# Number of event records left after the None entries were filtered out.
print(len(file_data))

2872


In [84]:
# prepare_filemap is not defined in this notebook; presumably it builds the
# lookup that get_file() uses further down -- TODO confirm in alignment_lib.ipynb.
prepare_filemap(file_data)

In [85]:
# Materialise every record yielded by samfile.fetch() so the reads can be
# counted and iterated with enumerate() later on.
reads = list(samfile.fetch())
len(reads)

2846

In [86]:
class AlignmentEndException(Exception):
    """Raised when fewer than a full k-mer of aligned positions is left."""

def event_indexes(pairing_seq, offset, k=None):
    """Collect the indexes of the next ``k`` non-gap entries of ``pairing_seq``.

    Scanning starts at ``offset``; entries that are ``None`` (gaps) are skipped.

    Args:
        pairing_seq: indexable sequence whose gap entries are ``None``.
        offset: index at which scanning starts.
        k: number of non-gap entries to collect. Defaults to the module-level
            ``NMER`` so existing two-argument calls behave as before.

    Returns:
        List of ``k`` indexes into ``pairing_seq``, in increasing order.

    Raises:
        AlignmentEndException: fewer than ``k`` non-gap entries remain.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # stop as soon as the k-mer is complete
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [87]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in bases into a move counted in alignment columns.

    Walks ``seq`` from ``offset``. Every gap (``None``) met before the move is
    used up adds one to the result; every non-gap entry uses up one unit of
    ``to_move`` ('move' from metrichor).

    Args:
        to_move: number of non-gap entries to advance by.
        seq: aligned sequence in which gaps are ``None``.
        offset: non-negative index in ``seq`` at which the walk starts.

    Returns:
        ``to_move`` plus the number of gaps passed. If ``seq`` ends before the
        move is used up, the accumulated value is returned as well (previously
        the function fell through and returned ``None``, which broke
        ``i_seq += gapmove(...)`` in the caller). In that case
        ``offset + result`` is at or beyond ``len(seq)`` whenever ``to_move``
        is non-negative.
    """
    move = to_move
    # Index-based walk: avoids copying seq[offset:] on every call.
    for position in range(offset, len(seq)):
        if seq[position] is None:
            move += 1
        else:
            to_move -= 1
            if to_move <= 0:
                return move
    # Ran off the end of the alignment.
    return move

In [88]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Args:
        index: iterable of indexes into ``pairs``.
        pairs: sequence mapping alignment index -> position in ``seq``
            (``None`` for a gap).
        seq: nucleotide sequence the positions refer to.

    Returns:
        The concatenated nucleotides. Gaps are converted into '' as the
        original docstring promised; before this change a ``None`` position
        raised ``TypeError`` on ``seq[None]``.
    """
    seq_index = (pairs[x] for x in index)
    return "".join(seq[pos] for pos in seq_index if pos is not None)


In [89]:
def is_consecutive_seq(seq):
    """Return True when the numbers in 'seq' each exceed their predecessor by one.

    A run of consecutive numbers has the same (value - position) everywhere, so
    at most one distinct offset may occur. Empty and one-element inputs count
    as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [90]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Tell whether the k-mer selected by 'ev_index' agrees fully with the reference.

    'pairs[0]' holds read positions and 'pairs[1]' reference positions per
    alignment column (None marks a gap). The k-mer is accepted only if
        * its read positions are gap-free and consecutive (no indel in the read),
        * its reference positions are gap-free and consecutive (no indel in the ref),
        * the nucleotides of 'read' and 'ref' at those positions are identical
          (no substitution).

    Returns True for a full match, False otherwise.
    """
    assert len(ev_index) == NMER, "invalid event index"

    def has_indel(positions):
        # a gap (None) or a jump between neighbouring positions
        return None in positions or not is_consecutive_seq(positions)

    read_positions = [pairs[0][col] for col in ev_index]
    ref_positions = [pairs[1][col] for col in ev_index]

    # indel in read
    if has_indel(read_positions):
        return False

    # indel in ref
    if has_indel(ref_positions):
        return False

    called_bases = [read[pos] for pos in read_positions]
    ref_bases = [ref[pos] for pos in ref_positions]
    # equal -> full match, different -> substitution
    return called_bases == ref_bases
    

In [91]:
def process_events(pairs, file_obj):
    """Return the events of one read whose k-mer matches the reference exactly.

    Args:
        pairs: two parallel lists; pairs[0] holds read positions and pairs[1]
            reference positions per alignment column (None marks a gap).
        file_obj: mapping with the keys "events" (object offering
            .to_dict("records")), "fastq" (FASTQ text whose second line is the
            called sequence) and "channel".

    Returns:
        List of event dicts. Each has "channel" set and "ref_position" set to
        the reference position of the k-mer's middle base.

    Reads the module-level names ``ref`` and ``NMER``. Processing stops at the
    first event that lies beyond the aligned part of the read.
    """
    event_list = file_obj["events"].to_dict("records")
#     event_list = file_obj["events"]
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev_kmer = ev["kmer"]
        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # gapmove ran past the end of the alignment; adding None to i_seq
            # would raise TypeError outside the try block below.
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        assert(read_kmer == ev_kmer), (ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            # middle position of the k-mer in the reference (index NMER // 2)
            ev["ref_position"] = pairs[1][ev_index[NMER // 2]]
            correct.append(ev)
    return correct

In [92]:
# Submit one process_events job per aligned read to a worker pool. The
# AsyncResult handles are collected in `result` and resolved in the next cell.
total_events = 0

p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose [(read_pos, ref_pos), ...] into [read_positions, ref_positions]
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)
    #     total_events += len(file_obj["events"])


        result.append(p.apply_async(process_events,
                                         [pairs, file_obj]))

except KeyboardInterrupt:
    # NOTE(review): after terminate() the pending handles in `result` never
    # complete, so calling .get() on them will block.
    p.terminate()
else:
    # All jobs are queued: close the pool so the workers exit once the queue is
    # drained instead of lingering for the lifetime of the kernel. The results
    # stay retrievable through the handles in `result`.
    p.close()
    

0 5 108 ch108_file5_read
1 17 105 ch105_file17_read
2 6 132 ch132_file6_read
3 1 132 ch132_file1_read
4 15 179 ch179_file15_read
5 6 154 ch154_file6_read
6 9 180 ch180_file9_read
7 14 14 ch14_file14_read
8 6 192 ch192_file6_read
9 22 19 ch19_file22_read
10 5 165 ch165_file5_read
11 6 204 ch204_file6_read
12 28 20 ch20_file28_read
13 2 202 ch202_file2_read
14 10 214 ch214_file10_read
15 6 200 ch200_file6_read
16 13 21 ch21_file13_read
17 0 200 ch200_file0_read
18 18 21 ch21_file18_read
19 11 246 ch246_file11_read
20 24 231 ch231_file24_read
21 1 247 ch247_file1_read
22 0 237 ch237_file0_read
23 2 244 ch244_file2_read
24 6 254 ch254_file6_read
25 1 252 ch252_file1_read
26 19 266 ch266_file19_read
27 15 268 ch268_file15_read
28 1 282 ch282_file1_read
29 17 27 ch27_file17_read
30 1 300 ch300_file1_read
31 10 29 ch29_file10_read
32 7 304 ch304_file7_read
33 17 31 ch31_file17_read
34 13 323 ch323_file13_read
35 28 323 ch323_file28_read
36 8 33 ch33_file8_read
37 13 34 ch34_file13_read
38 15 

In [93]:
# Wait for every job and flatten the per-read lists into one list of events.
true_events = [event for job in result for event in job.get()]

In [94]:
# Number of events whose k-mer matched the reference exactly.
len(true_events)

4622762

In [95]:
# Total number of events over all reads, summed while the jobs were submitted.
print(total_events)

23422213


In [96]:
# Fraction of all events that were kept as exact reference matches.
# NOTE(review): raises ZeroDivisionError when total_events is 0 (no reads processed).
print(len(true_events)/total_events)

0.1973665767619823


In [97]:
# Persist the matched events next to the other outputs. The context manager
# guarantees the file is flushed and closed once the dump has finished.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_handle:
    pickle.dump(true_events, out_handle)