In [None]:
from nbwrapper import getargs
from pprint import pprint
from multiprocessing import Pool, Value

import re
import numpy as np
import pickle
from itertools import repeat, count, product, chain
import sys
import pysam
from math import floor
import os.path
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/alignment_lib.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/analysis_lib.ipynb"

In [None]:
# Run-time parameters obtained through nbwrapper. The keys read in this
# notebook are "events", "ref", "out_basename", "ncores" and "nmer".
args = getargs()

In [None]:
## for testing only
# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.2D.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/david_eccles_bc_ideas/mouse_ref.fa",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment",
#     "ncores": 24,
#     "nmer": 5
# }

# args = {
#     "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.2D.62pA.pickle",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_alignment.62pA",
#     "ncores": 62
# }


# NOTE(review): unlike the two presets above, this assignment is live, so it
# unconditionally replaces the value returned by getargs(). Confirm that this
# is intended outside of testing; otherwise comment it out like the others.
args = {
    "events": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.events.2D.pickle",
    "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
    "out_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.alignment",
    "ncores": 62,
    "nmer": 6
}

In [None]:
# k-mer length; read as a module-level constant by the helper functions
# defined further down in this notebook.
NMER = args["nmer"]

In [None]:
# Shell escape: show the working directory of the kernel.
!pwd

In [None]:
# Fail early when an input file is missing or no (non-zero) core count is set.
assert os.path.isfile(args["events"])
assert os.path.isfile(args["ref"])
assert args["ncores"]

In [None]:
# NOTE(security): unpickling can execute arbitrary code; only load event files
# that were produced by a trusted step of this pipeline.
# The context manager closes the file handle as soon as loading is done.
with open(args["events"], 'rb') as events_handle:
    file_data = pickle.load(events_handle)
# Drop the entries that are None so later cells only see real records.
file_data = [f for f in file_data if f is not None]

In [None]:
# Reference loaded via load_ref (not defined in this notebook; presumably
# provided by one of the %run libraries). `ref` is read as a global by
# process_events further down.
ref = load_ref(args["ref"])

In [None]:
# Path of the FASTQ file derived from the output basename; mk_fastq is handed
# that path together with the loaded records (presumably writes the reads
# there -- helper not visible here).
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [None]:
# graphmap receives the reference, the FASTQ path, the SAM path and the number
# of cores (presumably runs the aligner and writes sam_file -- TODO confirm).
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(args["ref"], fastq_file, sam_file, args["ncores"])

In [None]:
# presumably converts/sorts/indexes the SAM output; the next cell expects
# "<out_basename>.sorted.bam" to exist afterwards.
prepare_sam(args["out_basename"])

In [None]:
samfile = pysam.AlignmentFile("{0}.sorted.bam".format(args["out_basename"]))
# NOTE(review): samreads is not referenced again in the visible cells; confirm
# whether the %run libraries rely on it before removing.
samreads = {}

In [None]:
# Number of records that survived the None filter.
print(len(file_data))

In [None]:
# presumably builds the lookup that get_file(channel_id, file_id) queries
# later -- helper not visible here.
prepare_filemap(file_data)

In [None]:
# Materialise every record yielded by samfile.fetch() and show how many there are.
reads  = [x for x in samfile.fetch()]
len(reads)

In [None]:
class AlignmentEndException(Exception):
    """Raised when the pairing array holds fewer non-gap entries than requested."""

def event_indexes(pairing_seq, offset, k=None):
    """Return the indexes of the next k non-gap entries of a pairing array.

    Args:
        pairing_seq: sequence in which None marks a gap.
        offset: index in `pairing_seq` at which the scan starts.
        k: number of non-gap entries to collect. Defaults to the
            module-level NMER, which is what existing callers rely on.

    Returns:
        List of k indexes into `pairing_seq`, in ascending order, none of
        which points at a gap.

    Raises:
        AlignmentEndException: if fewer than k non-gap entries remain from
            `offset` onwards.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # stop as soon as enough non-gap positions were gathered
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [None]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in non-gap positions into an index step.

    Walks `seq` starting at `offset`. Every gap (None) widens the step by
    one; every non-gap entry consumes one unit of `to_move`. The walk stops
    once the move is used up (a move of 0 still skips leading gaps and stops
    at the first non-gap entry).

    Args:
        to_move: 'move' of the event (from metrichor), in non-gap positions.
        seq: aligned sequence in which None marks a gap.
        offset: index in `seq` the move starts from.

    Returns:
        The number of indexes to advance, i.e. `to_move` plus the number of
        gaps passed. If `seq` ends before the move is consumed, the step
        accumulated so far is returned, so that `offset + step` lies at or
        beyond the end of `seq`. Previously this case fell through and
        returned None, which made `offset += gapmove(...)` raise TypeError.
    """
    step = to_move
    remaining = to_move
    for entry in seq[offset:]:
        if entry is None:
            # gaps do not count towards the move, but the index must skip them
            step += 1
        else:
            remaining -= 1
            if remaining <= 0:
                break
    return step

In [None]:
def get_nt_kmer(index, pairs, seq):
    """Convert sequence indexes into the corresponding nucleotides.

    Args:
        index: indexes into `pairs`.
        pairs: sequence of positions into `seq`; None marks a gap.
        seq: nucleotide sequence the positions refer to.

    Returns:
        The nucleotides joined into one string. Gaps are converted into ''
        (the original looked up seq[None] and raised TypeError instead).
    """
    positions = (pairs[i] for i in index)
    # compare against None explicitly: position 0 is a valid, falsy index
    return "".join(seq[pos] if pos is not None else "" for pos in positions)


In [None]:
def is_consecutive_seq(seq):
    """Tell whether the numbers in 'seq' increase by exactly one per step.

    Subtracting each element's position from its value yields a single
    distinct offset for a consecutive run; an empty or one-element
    sequence trivially qualifies.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [None]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Tell whether a kmer agrees completely with the reference.

    A kmer counts as correct only when:
        * its read positions are gap-free and consecutive (no indel in the read)
        * its ref positions are gap-free and consecutive (no indel in the ref)
        * the nucleotides at those positions are identical (no substitution)

    ev_index holds NMER indexes into the two position lists in 'pairs'
    (pairs[0]: read positions, pairs[1]: ref positions).
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][i] for i in ev_index]
    ref_positions = [pairs[1][i] for i in ev_index]

    # an indel shows up as a gap (None) or as a jump in the positions;
    # the read side is checked first, then the reference side
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # without indels only a substitution can still make the kmers differ
    read_nts = [read[pos] for pos in read_positions]
    ref_nts = [ref[pos] for pos in ref_positions]
    return read_nts == ref_nts
    

In [None]:
def process_events(pairs, file_obj):
    """Collect the events of one read whose kmer matches the reference exactly.

    Args:
        pairs: two parallel lists, pairs[0] holding read positions and
            pairs[1] holding reference positions; None marks a gap.
        file_obj: mapping with the keys "events" (an object offering
            to_dict("records"), presumably a pandas DataFrame), "fastq"
            (FASTQ record whose second line is the called sequence),
            "channel" and "file_id".

    Returns:
        List of event dicts. Each one is annotated with "channel" and
        "file_id" of the read and with "ref_position".

    Reads the module-level names `ref` and `NMER`.
    """
    event_list = file_obj["events"].to_dict("records")
    called_seq = file_obj["fastq"].split("\n")[1]
    i_seq = 0
    correct = []
    for ev in event_list:
        ev["channel"] = file_obj["channel"]
        ev["file_id"] = file_obj["file_id"]

        step = gapmove(ev["move"], pairs[0], i_seq)
        if step is None:
            # No usable step: the alignment ended before this event's move
            # could be consumed. Treat it like any other alignment end
            # instead of failing on `i_seq += None`.
            break
        i_seq += step

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # not the whole read is aligned
            break
        # Optional sanity check (disabled in the original as well): the kmer
        # read from called_seq via get_nt_kmer(ev_index, pairs[0], called_seq)
        # should equal ev["kmer"].
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            # Reference coordinate of the kmer entry at index floor(NMER/2),
            # i.e. the middle of the kmer.
            # NOTE(review): the previous comment called this the "first
            # position of kmer in reference", which the index does not match.
            ev["ref_position"] = pairs[1][ev_index[floor(NMER/2)]]
            correct.append(ev)
    return correct

In [None]:
# Fan the per-read work out to a worker pool. `result` collects one
# AsyncResult per read; the results are fetched in a later cell.
total_events = 0

p = Pool(args["ncores"])

result = []
try:
    for i, read in enumerate(reads):
        file_id, channel_id = get_file_and_channel(read.query_name)
        print(i, file_id, channel_id, read.query_name)
        # transpose [(read_pos, ref_pos), ...] into [read positions, ref positions]
        pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
        file_obj = get_file(channel_id, file_id)
        assert(pairs[0][0] == 0), "alignment is not null-indexed."

        ## map read to events
        total_events += len(file_obj["events"].index)
    #     total_events += len(file_obj["events"])


        result.append(p.apply_async(process_events,[pairs, file_obj]))
#         result.append(process_events(pairs, file_obj))

    # Nothing else is submitted after this loop: let the workers exit once the
    # queued tasks are done. Already submitted results stay retrievable.
    p.close()
except KeyboardInterrupt:
    p.terminate()
    p.join()
    # Re-raise so that execution stops here: tasks killed by terminate() never
    # complete, and waiting on their results afterwards would block forever.
    raise
    

In [None]:
true_events = list(chain.from_iterable([r.get() for r in result]))

In [None]:
len(true_events)

In [None]:
print(total_events)

In [None]:
print(len(true_events)/total_events)

In [None]:
# Write the matched events next to the other outputs. The context manager
# makes sure the file is flushed and closed once the dump is complete.
with open("{0}_true_events.pickle".format(args["out_basename"]), 'wb') as out_handle:
    pickle.dump(true_events, out_handle)