In [13]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path

%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/alignment_lib.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/analysis_lib.ipynb"

In [4]:
# Fetch the run configuration through nbwrapper.getargs(). The cells below
# read the keys "events", "ref", "out_basename", "ncores" and "nmer".
args = getargs()



In [5]:
## for testing only
# NOTE(review): the uncommented dict in this cell unconditionally rebinds
# `args`, discarding whatever getargs() returned in the previous cell.
# Comment it out for real (non-test) runs.
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/david_eccles_bc_ideas/mouse_ref.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
#     "ncores": 24,
#     "nmer": 5
# }

# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.2D.62pA.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.62pA",
#     "ncores": 62
# }

args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.events.2D.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.alignment",
    "models": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1.model.pickle",
    "ncores": 62,
    "nmer": 6
}

# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.alignment",
#     "ncores": 62,
#     "nmer": 6
# }

In [6]:
# k-mer length. event_indexes() and is_correct_kmer() below read this
# module-level constant.
NMER = args["nmer"]

In [7]:
# Debugging aid: show the kernel's current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [8]:
# Fail early and name the offending input, instead of crashing somewhere
# deep inside the pipeline with an anonymous AssertionError.
assert os.path.isfile(args["events"]), "events pickle not found: {0}".format(args["events"])
assert os.path.isfile(args["ref"]), "reference fasta not found: {0}".format(args["ref"])
assert args["ncores"], "'ncores' must be set to a non-zero number of worker processes"

### Prepare File Data

In [9]:
def prepare_file_data(file_obj):
    """Normalize the events of one record and replace its called sequence.

    The record is passed through correct_read(); afterwards the sequence
    derived from its events by events2seq() is stored in "fastq" next to the
    original header line (instead of the sequence called by metrichor).
    Returns the record handed back by correct_read().
    """
    corrected = correct_read(file_obj)
    event_records = corrected["events"].to_dict("records")
    called_seq = events2seq(event_records)
    # keep only the first (header) line of the original fastq entry
    header_line = corrected["fastq"].split("\n")[0]
    corrected["fastq"] = "\n".join([header_line, called_seq])
    return corrected

In [10]:
# Load the pickled list of per-file records. The context manager makes sure
# the file handle is closed again right after loading.
# NOTE: pickle.load can execute arbitrary code -- only feed it trusted files.
with open(args["events"], 'rb') as events_handle:
    file_data = pickle.load(events_handle)
# Drop the None entries of the loaded list.
file_data = [f for f in file_data if f is not None]

In [11]:
p = Pool(args["ncores"])
print("Prepare file data: ")
results = []
n_files = float(len(file_data))
try:
    # imap_unordered yields each record as soon as a worker has finished it,
    # so the order of `results` may differ from the order of `file_data`.
    for i, res in enumerate(p.imap_unordered(prepare_file_data, file_data), 1):
        results.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i / n_files))
    p.close()
except KeyboardInterrupt:
    # Interactive abort: stop the workers and keep the partial results.
    p.terminate()
except BaseException:
    # Any other failure: do not leave worker processes behind, then re-raise.
    p.terminate()
    raise
finally:
    # The pool is closed or terminated at this point, so join() returns once
    # all worker processes are gone.
    p.join()
# NOTE: this rebinds `file_data` to the prepared records, so re-running this
# cell would push already-prepared records through prepare_file_data again.
file_data = results

Prepare file data: 
done 100.000000%

### Align to reference

In [15]:
# Load the reference via load_ref(). process_events() below reads this
# module-level `ref` when comparing k-mers against the reference.
ref = load_ref(args["ref"])

['>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome']
AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAAT


In [16]:
# Hand the prepared records to mk_fastq() together with the target path
# <out_basename>.fastq (mk_fastq is presumably defined in the %run library
# notebooks -- TODO confirm).
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [17]:
# Run graphmap() with the reference, the fastq file, the target path
# <out_basename>.sam and the number of cores.
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(args["ref"], fastq_file, sam_file, args["ncores"])

[Index 14:31:57] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 14:31:57] Index already exists. Loading from file.
[Index 14:31:58] Secondary index already exists. Loading from file.
[Index 14:31:58] Index loaded in 0.82 sec.
[Index 14:31:58] Memory consumption: [currentRSS = 674 MB, peakRSS = 674 MB]

[Run 14:31:58] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 14:31:58] Reference genome is assumed to be linear.
[Run 14:31:58] Only one alignment will be reported per mapped read.
[ProcessReads 14:31:58] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 14:31:58] Batch of 100 reads (0 MiB) loaded in 0.01 sec. (18380008 bases)
[ProcessReads 14:31:58] Memory consumption: [currentRSS = 675 MB, peakRSS = 675 MB]
[ProcessReads 14:31:58] Using 62 threads.
[ProcessReads 14:32:01] [CPU time: 34.14 sec, RSS: 690 MB] Read: 100/100 (100.00%) [m: 100, u: 0]               

In [18]:
# prepare_sam() is expected to leave <out_basename>.sorted.bam behind, which
# is the file the next cell opens.
prepare_sam(args["out_basename"])

'/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.alignment.sorted.bam'

In [19]:
# Open the sorted BAM file of the alignment.
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): `samreads` is not used in any of the visible cells -- confirm
# whether the %run libraries rely on it before removing it.
samreads = {}

In [20]:
# Sanity check: number of prepared records.
print(len(file_data))

100


In [21]:
# prepare_filemap() is not defined in this notebook; presumably it builds the
# lookup that get_file() uses further down -- TODO confirm.
prepare_filemap(file_data)

In [22]:
# Materialise every record yielded by samfile.fetch() into a list.
reads = list(samfile.fetch())
len(reads)

100

In [23]:
class AlignmentEndException(Exception):
    """Raised when the pairing ends before a complete k-mer could be collected."""


def event_indexes(pairing_seq, offset, k=None):
    """Collect the indexes of the next k non-gap entries of the pairing array.

    Parameters
    ----------
    pairing_seq : sequence
        Pairing array in which gaps are represented by None.
    offset : int
        Index in pairing_seq at which the scan starts.
    k : int, optional
        Number of non-gap entries to collect. Defaults to the module-level
        NMER, so existing two-argument calls behave as before.

    Returns
    -------
    list of int
        Ascending indexes into pairing_seq of the k collected entries.

    Raises
    ------
    AlignmentEndException
        If fewer than k non-gap entries exist from offset onwards.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # len(kmer) doubles as the counter of non-gap entries seen so far
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [24]:
def gapmove(to_move, seq, offset):
    """Translate a move (from metrichor) into a step in the aligned sequence.

    Starting at seq[offset], the step is increased by one for every gap (None)
    encountered until 'to_move' non-gap entries have been passed.

    Parameters
    ----------
    to_move : int
        Number of non-gap entries to advance by.
    seq : sequence
        Aligned sequence in which gaps are represented by None.
    offset : int
        Index in seq at which the move starts.

    Returns
    -------
    int
        Number of entries of seq to advance by. If seq ends before the move is
        completed, the step accumulated so far is returned; offset plus that
        step is then at or past the end of seq. (Falling off the end used to
        return None, which made ``i_seq += gapmove(...)`` raise TypeError.)
    """
    move = to_move
    remaining = to_move
    for entry in seq[offset:]:
        if entry is None:
            # gap: costs one extra position without consuming the move
            move += 1
        else:
            remaining -= 1
            if remaining <= 0:
                return move
    # seq exhausted before the move was completed
    return move

In [25]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Parameters
    ----------
    index : iterable of int
        Indexes into 'pairs'.
    pairs : sequence
        Maps an alignment index to a position in 'seq', or to None for a gap.
    seq : str
        Sequence the positions refer to.

    Returns
    -------
    str
        The concatenated nucleotides; gaps are converted into ''.
    """
    seq_index = [pairs[x] for x in index]
    # a gap (None) contributes nothing instead of failing on seq[None]
    return "".join("" if pos is None else seq[pos] for pos in seq_index)


In [26]:
def is_consecutive_seq(seq):
    """Tell whether 'seq' is a run of numbers that each exceed their predecessor by one.

    An empty or single-element sequence counts as consecutive.
    """
    # value minus position is the same for every element exactly when each
    # step in the sequence is +1
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [27]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Decide whether a kmer agrees completely with the reference.

    All of the following must hold:
        * the read positions are consecutive (no indels)
        * the ref positions are consecutive (no indels)
        * the nucleotides are identical (no substitutions)
    """
    assert(len(ev_index) == NMER), "invalid event index"
    read_index = [pairs[0][col] for col in ev_index]
    ref_index = [pairs[1][col] for col in ev_index]

    # An indel shows up as a gap (None) or a jump in the positions; the read
    # side is checked first, then the reference side.
    for positions in (read_index, ref_index):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # full match -> True, any substitution -> False
    read_seq = [read[pos] for pos in read_index]
    ref_seq = [ref[pos] for pos in ref_index]
    return read_seq == ref_seq
    

In [28]:
def process_events(pairs, file_obj):
    """Collect the events of one read whose kmer matches the reference exactly.

    Parameters
    ----------
    pairs : list
        pairs[0] holds the read position and pairs[1] the reference position
        of every alignment column; gaps are None.
    file_obj : dict
        Record of the read. Uses "events" (object offering
        to_dict("records"); every record needs "kmer" and "move"), "fastq"
        (called sequence on its second line), "channel" and "file_id".

    Returns
    -------
    list of dict
        The event records for which is_correct_kmer() holds, annotated with
        "channel", "file_id" and "ref_position".

    Notes
    -----
    Reads the module-level `ref`. Processing stops at the first event that
    lies beyond the aligned part of the read.
    """
    event_list = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev["file_id"] = file_obj["file_id"]
        ev_kmer = ev["kmer"]
        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # gapmove ran out of aligned positions; treat it like the end of
            # the alignment instead of failing on `i_seq += None`
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        assert(read_kmer == ev_kmer), (i_seq, ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            ev["ref_position"] = pairs[1][ev_index[0]] #first position of kmer in reference
            correct.append(ev)
    return correct

In [29]:
total_events = 0

p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose [(read_pos, ref_pos), ...] into [read_positions, ref_positions]
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)

        # Asynchronous submission; the AsyncResult objects are resolved with
        # .get() in the next cell. For serial debugging call
        # process_events(pairs, file_obj) directly instead.
        result.append(p.apply_async(process_events,[pairs, file_obj]))

except KeyboardInterrupt:
    p.terminate()
else:
    # Everything is submitted: close the pool so the workers exit once the
    # queued tasks are done instead of lingering for the kernel's lifetime.
    # Already submitted tasks still run to completion.
    p.close()
    

0 62 101 ch101_file62_read
1 57 101 ch101_file57_read
2 13 100 ch100_file13_read
3 103 102 ch102_file103_read
4 29 100 ch100_file29_read
5 49 101 ch101_file49_read
6 47 100 ch100_file47_read
7 38 100 ch100_file38_read
8 53 100 ch100_file53_read
9 34 100 ch100_file34_read
10 13 101 ch101_file13_read
11 17 101 ch101_file17_read
12 24 100 ch100_file24_read
13 44 101 ch101_file44_read
14 33 100 ch100_file33_read
15 10 101 ch101_file10_read
16 32 101 ch101_file32_read
17 24 101 ch101_file24_read
18 31 100 ch100_file31_read
19 66 100 ch100_file66_read
20 19 101 ch101_file19_read
21 58 101 ch101_file58_read
22 36 101 ch101_file36_read
23 72 100 ch100_file72_read
24 6 101 ch101_file6_read
25 102 102 ch102_file102_read
26 8 101 ch101_file8_read
27 11 101 ch101_file11_read
28 69 100 ch100_file69_read
29 62 100 ch100_file62_read
30 42 101 ch101_file42_read
31 116 102 ch102_file116_read
32 6 100 ch100_file6_read
33 25 101 ch101_file25_read
34 9 100 ch100_file9_read
35 111 102 ch102_file111_read
36

In [30]:
# Wait for every submitted task and flatten the per-read lists into one list.
true_events = [ev for async_res in result for ev in async_res.get()]

In [31]:
# Number of events that process_events() kept.
len(true_events)

259289

In [32]:
# Number of events over all reads that were submitted for processing.
print(total_events)

1176881


In [33]:
# Fraction of all events that process_events() kept.
print(len(true_events)/total_events)

0.22031879178948424


In [34]:
# Write the kept events to <out_basename>_true_events.pickle. The context
# manager flushes and closes the file deterministically, which a bare open()
# inside the call does not guarantee.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_handle:
    pickle.dump(true_events, out_handle)