In [28]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"

In [29]:
# Fetch the run parameters through nbwrapper's getargs().
# NOTE(review): the return type is not visible here; later cells index `args`
# by the keys "events", "ref", "out_basename" and "ncores" — confirm that
# getargs() delivers a mapping with those keys.
args = getargs()



In [30]:
## for testing only
# NOTE(review): this assignment unconditionally replaces whatever `args` held
# before this cell ran, so the hard-coded paths below are always the ones used.
# Remove or guard this cell before running the notebook as a pipeline step.
args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
    "ncores": 24
}

# Alternative test configuration (lambda_* files), kept for reference:
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment",
#     "ncores": 24
# }

In [31]:
# k-mer length: event_indexes() collects NMER non-gap alignment positions and
# is_correct_kmer() asserts that its index list has exactly NMER entries.
NMER = 5

In [32]:
# Debugging aid: print the working directory the kernel is running in.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [33]:
# Fail early: both input files have to exist and a core count must be set.
# The generator stops at the first missing file, like consecutive asserts.
assert all(os.path.isfile(args[key]) for key in ("events", "ref"))
assert args["ncores"]

In [34]:
# Load the pickled event records and drop entries that are None.
# The context manager closes the file handle as soon as loading is done.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only feed it pickles produced by a trusted pipeline step.
with open(args["events"], 'rb') as events_handle:
    file_data = [f for f in pickle.load(events_handle) if f is not None]

In [35]:
# Read the reference FASTA: `test` holds the header line(s), `ref` the sequence.
# A line counts as a header when it contains ">" anywhere, which is the same
# criterion the former `grep ">"` / `grep -v ">"` pipeline applied.
ref_file = args["ref"]
with open(ref_file) as fasta_handle:
    fasta_lines = [line.rstrip() for line in fasta_handle]

test = [line for line in fasta_lines if ">" in line]
print(test)

# Join ALL sequence lines: taking only the first one silently truncates the
# reference whenever the FASTA is line-wrapped.
# NOTE(review): assumes the file holds a single record — with several records
# their sequences would be concatenated; confirm against the reference files.
ref = "".join(line for line in fasta_lines if ">" not in line)
assert ref, "no sequence found in reference FASTA"
print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [36]:
# Derive the FASTQ path from the output basename and pass it, together with the
# loaded event records, to mk_fastq().
# NOTE(review): mk_fastq is not defined in this notebook (presumably pulled in
# by the %run of alignment_lib.ipynb); presumably it writes the reads' FASTQ
# records to `fastq_file` — confirm there.
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [37]:
# Derive the SAM path from the output basename and call graphmap() with the
# reference, the FASTQ file, the SAM path and the configured core count.
# NOTE(review): graphmap is not defined in this notebook; presumably it runs
# the GraphMap aligner and writes its alignments to `sam_file` — confirm.
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(ref_file, fastq_file, sam_file, args["ncores"])

[Index 13:07:06] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 13:07:06] Index already exists. Loading from file.
[Index 13:07:07] Secondary index already exists. Loading from file.
[Index 13:07:07] Index loaded in 0.49 sec.
[Index 13:07:07] Memory consumption: [currentRSS = 515 MB, peakRSS = 515 MB]

[Run 13:07:07] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 13:07:07] Reference genome is assumed to be linear.
[Run 13:07:07] Only one alignment will be reported per mapped read.
[ProcessReads 13:07:07] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 13:07:07] Batch of 51 reads (0 MiB) loaded in 0.00 sec. (27526296 bases)
[ProcessReads 13:07:07] Memory consumption: [currentRSS = 516 MB, peakRSS = 516 MB]
[ProcessReads 13:07:07] Using 24 threads.
[ProcessReads 13:07:07] [CPU time: 4.71 sec, RSS: 520 MB] Read: 51/51 (100.00%) [m: 51, u: 0]                    

In [38]:
# Post-process the alignment that belongs to the output basename.
# NOTE(review): prepare_sam is defined outside this notebook; judging by the
# value this cell displays and by the file opened in the next cell it produces
# "<out_basename>.sorted.bam" — confirm in the helper.
prepare_sam(args["out_basename"])

[samopen] SAM header is present: 1 sequences.


'/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment.sorted.bam'

In [39]:
# Open "<out_basename>.sorted.bam" with pysam; later cells iterate over it via
# samfile.fetch().
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): samreads is not referenced again in the visible notebook.
samreads = {}

In [40]:
# Number of event records left after the None entries were filtered out.
print(len(file_data))

51


In [41]:
# NOTE(review): prepare_filemap is defined outside this notebook; presumably it
# builds the lookup that get_file() / get_file_and_channel() rely on further
# down — confirm in alignment_lib.ipynb.
prepare_filemap(file_data)

In [42]:
# Materialise every record yielded by samfile.fetch() and display the count.
reads = list(samfile.fetch())
len(reads)

51

In [43]:
class AlignmentEndException(Exception):
    """Raised when the pairing array holds fewer non-gap entries than requested."""


def event_indexes(pairing_seq, offset, nmer=None):
    """Collect the indexes of the next `nmer` non-gap entries of the pairing array.

    Parameters
    ----------
    pairing_seq : sequence
        One column of the aligned pairs; gap entries are None.
    offset : int
        Index in `pairing_seq` at which the scan starts.
    nmer : int, optional
        Number of non-gap entries to collect. Defaults to the module-level
        NMER, which keeps existing two-argument calls unchanged.

    Returns
    -------
    list of int
        Indexes into `pairing_seq` (not the values stored there), in
        ascending order.

    Raises
    ------
    AlignmentEndException
        If fewer than `nmer` non-gap entries remain from `offset` onwards.
    """
    if nmer is None:
        # resolved at call time so a changed NMER is picked up
        nmer = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        if len(kmer) == nmer:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != nmer:
        raise AlignmentEndException
    return kmer
    
    

In [44]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in non-gap entries into a move counted in
    alignment columns.

    Starting at `offset`, every gap (None) that is passed adds one to the
    distance; the scan stops at the non-gap entry that uses up `to_move`.

    Parameters
    ----------
    to_move : int
        Number of non-gap entries to advance by (the event's 'move' value).
    seq : sequence
        One column of the aligned pairs; gap entries are None.
    offset : int
        Current index in `seq`.

    Returns
    -------
    int
        Amount to add to `offset`. If `seq` ends before `to_move` is used up,
        the value still gets returned (instead of an implicit None); in that
        case `offset + result >= len(seq)`, i.e. the new position lies at or
        past the end of the alignment.
    """
    move = to_move
    # Slicing the range (not the list) keeps the semantics of seq[offset:]
    # without copying the rest of the sequence on every call.
    for pos in range(len(seq))[offset:]:
        if seq[pos] is None:
            move += 1
        else:
            to_move -= 1
            if to_move <= 0:
                return move
    # alignment exhausted: report the distance accumulated so far
    return move

In [45]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Parameters
    ----------
    index : iterable of int
        Indexes into `pairs`.
    pairs : sequence
        One column of the aligned pairs: positions in `seq`, None for gaps.
    seq : str
        Sequence the positions refer to.

    Returns
    -------
    str
        The nucleotides joined in the order of `index`; a gap (None)
        contributes '' instead of raising a TypeError.
    """
    seq_index = (pairs[x] for x in index)
    return "".join(seq[pos] if pos is not None else "" for pos in seq_index)


In [46]:
def is_consecutive_seq(seq):
    """Return True if the numbers in 'seq' increase in steps of exactly one.

    Each value is compared with its own position: for a consecutive run the
    difference is the same everywhere. Empty and one-element sequences count
    as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [47]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Tell whether a k-mer agrees completely with the reference.

    That is the case when
        * the read positions are consecutive (no indel on the read side),
        * the reference positions are consecutive (no indel on the ref side),
        * the nucleotides at those positions are identical (no substitution).

    ev_index -- NMER indexes into the aligned pairs
    pairs    -- pairs[0] holds read positions, pairs[1] reference positions;
                None marks a gap
    read     -- sequence indexed by the read positions
    ref      -- sequence indexed by the reference positions
    """
    assert(len(ev_index) == NMER), "invalid event index"
    read_positions = [pairs[0][col] for col in ev_index]
    ref_positions = [pairs[1][col] for col in ev_index]

    # read side is checked first, then the reference side
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            # a gap or a jump means an indel inside the k-mer
            return False

    # no indel left; a mismatch here is a substitution
    read_bases = [read[pos] for pos in read_positions]
    ref_bases = [ref[pos] for pos in ref_positions]
    return read_bases == ref_bases
    

In [48]:
def process_events(pairs, file_obj):
    """Walk through the events of one read and keep those whose k-mer matches
    the reference exactly.

    pairs    -- [read_positions, ref_positions] of the alignment, None = gap
    file_obj -- mapping with the keys "events" (object offering
                to_dict("records"); presumably a pandas DataFrame — confirm),
                "fastq" (FASTQ record, sequence on its second line) and
                "channel"

    Every processed event dict gets a "channel" entry. The module-level `ref`
    is used as the reference sequence. Returns the list of event dicts for
    which is_correct_kmer() is true; processing stops at the first event that
    reaches beyond the aligned part of the read.
    """
    records = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    position = 0
    matching_events = []
    for event in records:
        event["channel"] = file_obj["channel"]
        expected_kmer = event["kmer"]
        position += gapmove(event["move"], pairs[0], position)

        try:
            kmer_index = event_indexes(pairs[0], position)
        except AlignmentEndException:
            # the alignment does not cover the remainder of the read
            break

        read_kmer = get_nt_kmer(kmer_index, pairs[0], called_seq)
        # sanity check: the walk must reproduce the k-mer stored with the event
        assert(read_kmer == expected_kmer), (event, read_kmer, kmer_index)
        if is_correct_kmer(kmer_index, pairs, called_seq, ref):
            matching_events.append(event)
    return matching_events

In [49]:
# Submit one process_events() task per aligned read to a worker pool and count
# the events of all submitted reads. The results are collected in a later cell
# through the AsyncResult objects stored in `result`.
total_events = 0

p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose the (read_pos, ref_pos) tuples into
        # [read_positions, ref_positions]
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)

        result.append(p.apply_async(process_events,
                                    [pairs, file_obj]))
except KeyboardInterrupt:
    # Stop the workers and let the interrupt propagate: results of a
    # terminated pool never complete, so continuing would block on .get().
    p.terminate()
    raise
else:
    # No further submissions; already queued tasks still run to completion
    # and the worker processes exit afterwards instead of lingering.
    p.close()
    

0 6 156 ch156_file6_read
1 1 157 ch157_file1_read
2 19 135 ch135_file19_read
3 5 204 ch204_file5_read
4 0 135 ch135_file0_read
5 10 206 ch206_file10_read
6 14 209 ch209_file14_read
7 3 211 ch211_file3_read
8 27 215 ch215_file27_read
9 15 215 ch215_file15_read
10 9 215 ch215_file9_read
11 38 132 ch132_file38_read
12 8 141 ch141_file8_read
13 0 157 ch157_file0_read
14 1 201 ch201_file1_read
15 1 203 ch203_file1_read
16 28 211 ch211_file28_read
17 11 223 ch223_file11_read
18 5 227 ch227_file5_read
19 18 142 ch142_file18_read
20 35 132 ch132_file35_read
21 20 211 ch211_file20_read
22 26 132 ch132_file26_read
23 11 156 ch156_file11_read
24 11 133 ch133_file11_read
25 2 203 ch203_file2_read
26 10 132 ch132_file10_read
27 12 211 ch211_file12_read
28 27 212 ch212_file27_read
29 23 135 ch135_file23_read
30 17 132 ch132_file17_read
31 12 141 ch141_file12_read
32 20 206 ch206_file20_read
33 3 142 ch142_file3_read
34 36 132 ch132_file36_read
35 17 211 ch211_file17_read
36 44 132 ch132_file44_read


In [50]:
# Wait for each submitted task in turn and flatten the per-read lists of
# matching events into a single list.
true_events = [event for task in result for event in task.get()]

In [51]:
# Number of events that process_events() kept, i.e. whose k-mer passed
# is_correct_kmer().
len(true_events)

66466

In [52]:
# Number of events summed over all reads that were submitted for processing.
print(total_events)

405113


In [53]:
# Fraction of all counted events that were kept as exact reference matches.
# Raises ZeroDivisionError when total_events is 0 (no read was submitted).
print(len(true_events)/total_events)

0.16406780330426327


In [54]:
# Persist the matching events next to the other outputs. The context manager
# flushes and closes the file even if pickling fails part-way.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as true_events_handle:
    pickle.dump(true_events, true_events_handle)