In [10]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/alignment_lib.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/analysis_lib.ipynb"

In [11]:
# Read the notebook parameters via nbwrapper's getargs(). The keys used by the
# cells below are "events", "ref", "out_basename", "ncores" and "nmer".
# NOTE(review): a later cell assigns `args` again unconditionally, so the value
# obtained here is discarded — confirm that this is intended.
args = getargs()



In [12]:
## for testing only
# NOTE: the single active assignment at the bottom of this cell replaces the
# `args` obtained from getargs(). Keep exactly one configuration active; the
# alternatives stay commented out so that no assignment is silently overwritten.
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/david_eccles_bc_ideas/mouse_ref.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
#     "ncores": 24,
#     "nmer": 5
# }

# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.2D.62pA.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.62pA",
#     "ncores": 62
# }

# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.events.template.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.alignment",
#     "ncores": 62,
#     "nmer": 6
# }

args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.events.template.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.alignment",
    "ncores": 62,
    "nmer": 6
}

In [13]:
# k-mer length taken from the "nmer" argument. It is read as a module-level
# constant by event_indexes() and is_correct_kmer() further down.
NMER = args["nmer"]

In [14]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [15]:
# Fail early, naming the offending argument, when the inputs are unusable.
assert os.path.isfile(args["events"]), \
    "events file not found: {0}".format(args["events"])
assert os.path.isfile(args["ref"]), \
    "reference file not found: {0}".format(args["ref"])
assert args["ncores"], "ncores must be a non-zero number of worker processes"

### Prepare File Data

In [16]:
def prepare_file_data(file_obj):
    """Normalize a read and replace its called sequence with one rebuilt from the events.

    The read is passed through normalize_read(); the sequence is then generated
    by events2seq() from the event records (instead of relying on the sequence
    delivered by metrichor). Only the first line of the stored "fastq" entry is
    kept and the regenerated sequence is put on the second line.

    Returns the object returned by normalize_read(), with "fastq" updated.
    """
    normalized = normalize_read(file_obj)
    event_records = normalized["events"].to_dict("records")
    regenerated_seq = events2seq(event_records)
    header_line = normalized["fastq"].split("\n")[0]
    normalized["fastq"] = "\n".join((header_line, regenerated_seq))
    return normalized

In [17]:
# NOTE(review): unpickling can execute arbitrary code — only load event files
# from a trusted source.
# The context manager closes the file handle as soon as loading is finished.
with open(args["events"], 'rb') as events_fh:
    file_data = pickle.load(events_fh)
# Drop entries that are None.
file_data = [f for f in file_data if f is not None]

In [18]:
# Run prepare_file_data over all reads in a worker pool, showing progress.
p = Pool(args["ncores"])
print("Prepare file data: ")
results = []
try:
    # Results arrive in completion order, not in input order.
    for i, res in enumerate(p.imap_unordered(prepare_file_data, file_data), 1):
        results.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i/float(len(file_data))))
    p.close()
    p.join()
except BaseException as exc:
    # Never leave worker processes behind: stop them on any failure and wait
    # until they are gone.
    p.terminate()
    p.join()
    # A manual interrupt keeps the partial results (as before); every other
    # error is reported to the caller.
    if not isinstance(exc, KeyboardInterrupt):
        raise
file_data = results

Prepare file data: 
done 100.000000%

### Align to reference

In [19]:
# Load the reference given by args["ref"] (load_ref comes from the %run libs).
# process_events() later hands this module-level `ref` to is_correct_kmer(),
# which indexes it by reference position.
ref = load_ref(args["ref"])

['>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome']
AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAAT


In [20]:
# FASTQ path derived from the output basename.
# presumably mk_fastq writes the reads in file_data to that path — it is defined
# in the %run libs, verify there.
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [21]:
# SAM path derived from the output basename. graphmap() (from the %run libs) is
# given the reference path, the FASTQ file, the SAM path and the core count.
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(args["ref"], fastq_file, sam_file, args["ncores"])

[Index 16:03:12] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 16:03:12] Index already exists. Loading from file.
[Index 16:03:13] Secondary index already exists. Loading from file.
[Index 16:03:14] Index loaded in 1.05 sec.
[Index 16:03:14] Memory consumption: [currentRSS = 674 MB, peakRSS = 20334 MB]

[Run 16:03:14] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 16:03:14] Reference genome is assumed to be linear.
[Run 16:03:14] Only one alignment will be reported per mapped read.
[ProcessReads 16:03:14] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 16:03:14] Batch of 5000 reads (46 MiB) loaded in 0.49 sec. (32134376 bases)
[ProcessReads 16:03:14] Memory consumption: [currentRSS = 721 MB, peakRSS = 20334 MB]
[ProcessReads 16:03:14] Using 62 threads.
[ProcessReads 16:04:40] [CPU time: 1788.88 sec, RSS: 793 MB] Read: 5000/5000 (100.00%) [m: 4967, u: 33]   

In [22]:
# prepare_sam comes from the %run libs. The value echoed below this cell is the
# path "<out_basename>.sorted.bam", which the next cell opens.
prepare_sam(args["out_basename"])

'/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.alignment.sorted.bam'

In [23]:
# Open the sorted BAM file that belongs to the output basename.
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): samreads is not used in any of the visible cells — possibly a
# leftover; confirm before removing.
samreads = {}

In [24]:
# Number of reads that came out of the preparation step.
print(len(file_data))

5000


In [25]:
# Helper from the %run libs; presumably builds the lookup that get_file() uses
# in the submission loop below — TODO confirm.
prepare_filemap(file_data)

In [26]:
# Materialise everything samfile.fetch() yields so it can be counted and
# iterated again later.
reads = list(samfile.fetch())
len(reads)

4967

In [27]:
class AlignmentEndException(Exception):
    """raised when the pairing array ends before a full kmer could be collected"""
    pass

def event_indexes(pairing_seq, offset, nmer=None):
    """get the next entries from the pairing array
    such that k non-gap characters are contained

    pairing_seq: aligned positions, gaps are None
    offset: index in pairing_seq at which the search starts
    nmer: number of non-gap entries to collect; defaults to the
        module-level NMER when omitted

    returns the list of indexes (into pairing_seq) of the first `nmer`
    non-gap entries at or after `offset`.
    raises AlignmentEndException if fewer than `nmer` non-gap entries remain.
    """
    if nmer is None:
        nmer = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # stop as soon as the kmer is complete
        if len(kmer) == nmer: break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != nmer:
        raise AlignmentEndException
    return kmer

In [28]:
def gapmove(to_move, seq, offset):
    """move by 'to_move' (the move reported by metrichor) in the aligned sequence.
    additionally increase the step to compensate for each gap

    to_move: number of non-gap positions to advance
    seq: aligned sequence in which gaps are None
    offset: index in seq at which to start (same semantics as seq[offset:])

    returns to_move plus the number of gaps that were passed, as soon as
    to_move non-gap entries (at least one) have been seen.
    returns None if seq ends before that happens.
    """
    step = to_move
    remaining = to_move
    # Walk by index instead of slicing: seq[offset:] would copy the whole tail
    # of the sequence on every call. slice.indices() reproduces the slice
    # bounds exactly (including negative offsets).
    start, stop, stride = slice(offset, None).indices(len(seq))
    for pos in range(start, stop, stride):
        if seq[pos] is None:
            step += 1
            continue
        remaining -= 1
        if remaining <= 0:
            return step
    return None

In [29]:
def get_nt_kmer(index, pairs, seq):
    """convert sequence indexes into the corresponding nucleotides.
    gaps are converted into ''

    index: positions in the pairing array
    pairs: pairing array mapping those positions to indexes in seq (None = gap)
    seq: sequence to take the nucleotides from

    returns the nucleotides joined into one string.
    """
    seq_index = (pairs[x] for x in index)
    # A gap has no nucleotide; skipping it is the same as joining in ''.
    # Without this check seq[None] would raise a TypeError.
    return "".join(seq[x] for x in seq_index if x is not None)

In [30]:
def is_consecutive_seq(seq):
    """tell whether 'seq' is a run of consecutive, increasing numbers.

    every element minus its own position must give the same value;
    an empty or one-element sequence therefore counts as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [31]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """check if a kmer corresponds completely with the reference.
    This is the case if:
        * the read positions are consecutive (no indels)
        * the ref positions are consecutive (no indels)
        * the nucleotides are identical (no substitutions)

    ev_index: NMER positions in the pairing arrays
    pairs: pairs[0] holds read positions, pairs[1] reference positions (None = gap)
    read, ref: sequences indexed by those positions
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions, ref_positions = pairs[0], pairs[1]
    read_index = [read_positions[col] for col in ev_index]
    ref_index = [ref_positions[col] for col in ev_index]

    # a gap or a jump on either side means an indel (read first, then ref)
    for positions in (read_index, ref_index):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # full match if equal, substitution otherwise
    read_nts = [read[pos] for pos in read_index]
    ref_nts = [ref[pos] for pos in ref_index]
    return read_nts == ref_nts

In [32]:
def process_events(pairs, file_obj):
    """collect the events of one read whose kmer matches the reference exactly.

    pairs: [read_positions, ref_positions] of the alignment, gaps are None
    file_obj: read data; uses "events" (DataFrame with at least the columns
        "kmer" and "move"), "fastq", "channel" and "file_id"

    Walks through the events in order, following each event's "move" along the
    aligned read positions. Stops at the first event that can no longer be
    placed on the alignment.

    returns a list of event dicts for which is_correct_kmer() holds; each gets
    the additional keys "channel", "file_id" and "ref_position".
    """
    event_list = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev["file_id"] = file_obj["file_id"]
        ev_kmer = ev["kmer"]
        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # gapmove ran off the end of the alignment; adding None to i_seq
            # would raise a TypeError, so treat it like any other alignment end
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        # the kmer rebuilt from the alignment must be the one stored with the event
        assert(read_kmer == ev_kmer), (i_seq, ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            ev["ref_position"] = pairs[1][ev_index[0]] #first position of kmer in reference
            correct.append(ev)
    return correct

In [33]:
# Submit one process_events task per aligned read; the AsyncResult objects are
# collected in `result` and resolved in the next cell.
total_events = 0

p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose the (read_pos, ref_pos) tuples into two parallel lists
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)

        result.append(p.apply_async(process_events,[pairs, file_obj]))
    # No further tasks will be submitted: closing lets the workers exit once
    # the queued tasks are finished instead of lingering forever.
    p.close()
except KeyboardInterrupt:
    p.terminate()
except BaseException:
    # do not leave worker processes behind when submission fails
    p.terminate()
    raise

0 93 110 ch110_file93_read
1 38 142 ch142_file38_read
2 178 186 ch186_file178_read
3 23 147 ch147_file23_read
4 28 183 ch183_file28_read
5 107 174 ch174_file107_read
6 29 174 ch174_file29_read
7 10 15 ch15_file10_read
8 49 173 ch173_file49_read
9 63 110 ch110_file63_read
10 38 174 ch174_file38_read
11 48 143 ch143_file48_read
12 45 113 ch113_file45_read
13 21 143 ch143_file21_read
14 11 190 ch190_file11_read
15 9 174 ch174_file9_read
16 51 110 ch110_file51_read
17 106 174 ch174_file106_read
18 128 110 ch110_file128_read
19 4 160 ch160_file4_read
20 15 147 ch147_file15_read
21 18 143 ch143_file18_read
22 12 110 ch110_file12_read
23 45 11 ch11_file45_read
24 21 174 ch174_file21_read
25 77 110 ch110_file77_read
26 7 110 ch110_file7_read
27 102 174 ch174_file102_read
28 61 165 ch165_file61_read
29 80 110 ch110_file80_read
30 24 183 ch183_file24_read
31 39 123 ch123_file39_read
32 23 110 ch110_file23_read
33 1 117 ch117_file1_read
34 93 183 ch183_file93_read
35 1 128 ch128_file1_read
36 40 

In [34]:
# Wait for every submitted task in turn and flatten the per-read event lists
# into one list.
true_events = [event for task in result for event in task.get()]

In [35]:
# Number of events that process_events() kept, i.e. that passed is_correct_kmer().
len(true_events)

5454308

In [36]:
# Number of events of all aligned reads, summed up in the submission loop.
print(total_events)

49467393


In [37]:
# Fraction of the events of aligned reads that were kept as exact matches.
print(len(true_events)/total_events)

0.11026067211587237


In [38]:
# Store the kept events next to the other outputs. The context manager makes
# sure the file is flushed and closed once the dump is complete.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_fh:
    pickle.dump(true_events, out_fh)