## Make Events
Load events from a folder of fast5 files and store them in one pickle file per read type (template, complement, 2D) for further processing.

In [24]:
from nbwrapper import getargs
from multiprocessing import Pool
import pickle
import marshal
import h5py
import pandas
from pprint import pprint
import re
import os.path
import sys
from itertools import repeat
import numpy as np

In [25]:
# Obtain the run parameters via nbwrapper's getargs() (presumably the
# parameters passed to the notebook from outside — TODO confirm).
# NOTE(review): a later cell assigns a hard-coded dict to `args`, which
# replaces whatever is returned here.
args = getargs()



In [26]:
# IPython shell escape: print the current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [27]:
# Hard-coded parameter sets; exactly one is active, the others are kept
# commented out as alternatives.
# NOTE(review): this assignment replaces the `args` obtained from getargs()
# in the earlier cell — confirm that is intended.
# Keys used by the visible code:
#   f5_path          folder that is searched for fast5 files
#   output_basename  prefix of the pickle files written at the end
#   ncores           number of worker processes in the Pool
#   nmers            stored in NMERS
#   new_file_format  if truthy, events are read from Basecall_1D_000
#                    instead of Basecall_2D_000
# args = {
#     "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/MAP006-1_100/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.events", 
#     "ncores": 60,
#     "nmers": 6
# }
args = {
    "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/wouter_de_coster_map-006_lambda/Lambda006_20151117/fast5_100/", ## path to processed f5-files
    "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/wouter_lambda006_100.events", 
    "ncores": 60,
    "nmers": 6,
    "new_file_format": True
}
# args = {
#     "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/MAP006-1_5000/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.events", 
#     "ncores": 80,
#     "nmers": 6
# }
# args = {
#     "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/MAP006-1/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1.events", 
#     "ncores": 80,
#     "nmers": 6
# }

In [28]:
# Validate the parameters with explicit exceptions: `assert` statements are
# stripped when Python runs with -O and fail without a helpful message.
if not os.path.isdir(args["f5_path"]):
    raise NotADirectoryError(
        "f5_path is not a directory: {0!r}".format(args["f5_path"]))
# ncores may arrive as a string; the Pool needs a positive int.
args["ncores"] = int(args["ncores"])
if args["ncores"] < 1:
    raise ValueError(
        "ncores must be a positive integer, got {0}".format(args["ncores"]))

In [29]:
# NOTE(review): presumably the sampling rate in Hz; not used in the visible code.
SRATE = 5000
# Taken from the "nmers" parameter (presumably the k-mer length); not used
# in the visible code.
NMERS = args["nmers"]
# Read types for which fastq records and events are extracted and written.
TYPES = ["template", "complement", "2D"]



In [30]:
# Collect every *.fast5 file below f5_path (recursively). Done in pure
# Python instead of `!find ... | grep "\.fast5"` so that paths containing
# spaces work, only the file-name suffix is matched (not ".fast5" anywhere
# in the path), and the cell does not depend on a shell.
files = [
    os.path.join(dirpath, fname)
    for dirpath, _dirnames, fnames in os.walk(args["f5_path"])
    for fname in fnames
    if fname.endswith(".fast5")
]
print(len(files))

100


In [31]:
# Put the paths into lexicographic order; sorted() returns a new plain list.
files = sorted(files)

In [42]:
def get_file_and_channel(filename):
    """Extract the file id and the channel id encoded in a fast5 file name.

    The name (or path) must contain a fragment of the form
    ``ch<digits>_file<digits>_``.

    Args:
        filename: file name or path to parse.

    Returns:
        Tuple ``(file_id, channel_id)`` of ints.

    Raises:
        ValueError: if the name does not contain such a fragment.
    """
    match = re.search(r'ch(\d+)_file(\d+)_', filename)
    if match is None:
        # re.search returns None when nothing matches; fail with a message
        # that names the offending file instead of an AttributeError on None.
        raise ValueError(
            "Cannot parse channel and file id from file name: {0!r}".format(filename))
    # group 1 is the channel, group 2 the file number
    channel_id, file_id = (int(group) for group in match.groups())
    return file_id, channel_id

In [43]:
def kmer2move(prev_kmer, curr_kmer):
    """calculates the shift between two kmers.
    If multiple shifts are possible (repeats),
    the minimal possible shift is assumed.

    Args:
        prev_kmer: previous kmer, or None for the first position.
        curr_kmer: current kmer; must have the same length as prev_kmer.

    Returns:
        The smallest i such that the last ``len - i`` symbols of prev_kmer
        equal the first ``len - i`` symbols of curr_kmer. This is 0 for the
        first position and ``len(prev_kmer)`` if the kmers do not overlap
        at all.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    l = len(prev_kmer)
    for i in range(l):
        if prev_kmer[i:] == curr_kmer[:l - i]:
            return i
    # No suffix/prefix overlap: the kmer was shifted by its full length.
    # (Falling off the loop used to return None implicitly.)
    return l

In [44]:
def get_models(f5):
    """Look up the model file names mentioned in the 2D basecaller log.

    Reads '/Analyses/Basecall_2D_000/Log' from f5, takes the double-quoted
    part of every log line that contains ".model" and, for each of
    "template" and "complement", picks the first such name that contains
    that word.

    Returns:
        dict with the keys "template" and "complement"; a value is None if
        no matching model name was found in the log.
    """
    raw_log = f5['/Analyses/Basecall_2D_000/Log']
    log_lines = bytes(raw_log[...]).decode("utf-8").split("\n")
    # quoted part of each line that mentions a model file
    model_names = [
        re.search(r"\"(.*)\"", line).group(1)
        for line in log_lines
        if ".model" in line
    ]
    return {
        strand: next((name for name in model_names if strand in name), None)
        for strand in ("template", "complement")
    }

In [45]:
# Template of the HDF5 group that holds the basecalls of one read type
# ({0} is replaced by the type): files in the new format keep them under
# Basecall_1D_000, all others under Basecall_2D_000.
f5_event_path = (
    '/Analyses/Basecall_1D_000/BaseCalled_{0}'
    if ("new_file_format" in args and args["new_file_format"])
    else '/Analyses/Basecall_2D_000/BaseCalled_{0}'
)

In [55]:
def process_metrichor_file(file):
    """
    Read one basecalled fast5 file with h5py and extract its events
    and metadata.

    Args:
        file: path of the fast5 file. The name must contain the channel and
            file number in the form expected by get_file_and_channel.

    Returns:
        None if the file has no template "Model" group, otherwise a dict
        with the keys
          "channel", "file_id": ids parsed from the file name,
          "events": dict mapping each read type ("template", "complement",
              "2D") to a pandas.DataFrame of events (empty if the file has
              no events of that type),
          "models": result of get_models,
          "median": median of the event means per strand (only for strands
              that have events),
          "shift_template", "scale_template", "drift_template": attributes
              of the template model group,
          "fastq": dict mapping each read type to its fastq record as str,
              or None if the file has none.

    Raises:
        Exception: if the file cannot be opened.
    """
    file_id, ch_id = get_file_and_channel(file)

    try:
        f5 = h5py.File(file, 'r')
    except OSError as err:
        raise Exception("Unable to open file: {0}".format(file)) from err

    def process_1d(t):
        """Return the events of strand t as a list of dicts, or None if absent."""
        try:
            raw_events = f5[(f5_event_path + '/Events').format(t)]
        except KeyError:
            return None
        events = []
        for raw_ev in raw_events:
            # fields are addressed by position; position 5 is not used
            ev = {}
            ev["mean"] = raw_ev[0]
            ev["start"] = float(raw_ev[1])
            ev["stdv"] = raw_ev[2]
            ev["length"] = float(raw_ev[3])
            ev["kmer"] = bytes(raw_ev[4]).decode('utf-8')
            ev["move"] = int(raw_ev[6])
            events.append(ev)
        return events

    def process_2d(all_events):
        """generate 2d read from alignment and the 1d reads"""
        try:
            aln = f5['/Analyses/Basecall_2D_000/BaseCalled_2D/Alignment']
        except KeyError:
            return None
        events = []
        prev_kmer = None
        for pos in aln:
            # each alignment row: template event index, complement event
            # index, kmer
            ids = {"template": pos[0], "complement": pos[1]}
            kmer = pos[2]
            move = kmer2move(prev_kmer, kmer)
            prev_kmer = kmer
            ev = {}
            ev["move"] = move
            ev["kmer"] = bytes(kmer).decode('utf-8')
            for t, tmp_id in ids.items():
                for f in ["mean", "start", "stdv", "length"]:
                    # a negative index yields None for that strand's fields
                    ev["{0}.{1}".format(t, f)] = None if tmp_id < 0 else all_events[t][tmp_id][f]
            events.append(ev)
        return events

    # The context manager closes the HDF5 handle on every exit path (normal
    # return, early `return None`, exceptions), so a long-running worker
    # does not accumulate open files.
    with f5:
        ## extract fastq
        fastq = {}
        for t in TYPES:
            try:
                fastq[t] = bytes(f5[(f5_event_path + '/Fastq').format(t)][...]).decode('utf-8')
            except KeyError:
                fastq[t] = None

        all_events = {t: process_1d(t) for t in ["template", "complement"]}
        all_events["2D"] = process_2d(all_events)
        # missing event tables (None) become empty DataFrames here
        all_events = {t: pandas.DataFrame(events) for t, events in all_events.items()}

        models = get_models(f5)
        median = {t: all_events[t]["mean"].median() for t in ["template", "complement"] if not all_events[t].empty}

        try:
            template_attrs = f5[f5_event_path.format("template") + "/Model"].attrs
        except KeyError:
            return None

        # attribute values are read while the file is still open
        f_obj = {
            "channel": ch_id,
            "file_id": file_id,
            "events": all_events,
            "models": models,
            "median": median,
            "shift_template": template_attrs["shift"],
            "scale_template": template_attrs["scale"],
            "drift_template": template_attrs["drift"],
            "fastq": fastq
        }
        return f_obj
        


In [56]:
# Interactive inspection of the first input path and the event-group
# template; the recorded output below shows only the value of the last
# expression.
files[0]
f5_event_path

'/Analyses/Basecall_1D_000/BaseCalled_{0}'

In [57]:
# Open the first fast5 file read-only (presumably for interactive inspection).
# NOTE(review): this handle is never closed in the visible code, and it is
# not used by the processing functions, which receive or open their own
# `f5` — consider closing or removing it.
f5 = h5py.File(files[0], 'r')

In [58]:
# Pool of args["ncores"] worker processes; used below to process the
# fast5 files in parallel.
p = Pool(args["ncores"])

In [59]:
# Process all files in the worker pool and collect the per-file results
# (in completion order, not input order) while printing the progress.
print("Make Events: ")
file_data = []
n_files = float(len(files))
try:
    for i, res in enumerate(p.imap_unordered(process_metrichor_file, files), 1):
        file_data.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i / n_files))
    p.close()
except KeyboardInterrupt:
    # manual abort: stop the workers and keep the results gathered so far
    p.terminate()
except Exception:
    # a worker failed: do not leave the remaining workers running
    p.terminate()
    p.join()
    raise
# wait for the worker processes to exit (valid after close() or terminate())
p.join()

Make Events: 
done 100.000000%

In [61]:
# Write one pickle file per read type: "<output_basename>.<type>.pickle"
# holds a list with one dict per input file that has both events and a
# fastq record of that type.
for t in TYPES:
    tmp_file_data = []
    for f_obj in file_data:
        # files without a template model are reported as None
        if f_obj is None:
            continue
        events = f_obj["events"][t]
        fastq = f_obj["fastq"][t]
        # Events are pandas DataFrames; a missing event table is an EMPTY
        # frame, not None, so emptiness has to be checked as well for the
        # "no events of this type" case to be skipped.
        if events is None or events.empty or fastq is None:
            continue
        tmp_f_obj = {
            "models": f_obj["models"],
            "median": f_obj["median"],
            "channel": f_obj["channel"],
            "file_id": f_obj["file_id"],
            "events": events,
            "fastq": fastq,
            "shift_template": f_obj["shift_template"],
            "scale_template": f_obj["scale_template"],
            "drift_template": f_obj["drift_template"],
        }
        tmp_file_data.append(tmp_f_obj)
    filename = "{0}.{1}.pickle".format(args["output_basename"], t)
    # the context manager flushes and closes the file even if dump() fails
    with open(filename, 'wb') as out_file:
        pickle.dump(tmp_file_data, out_file, protocol=2)

In [None]:
# for t in TYPES:
#     tmp_file_data = []
#     for f_obj in file_data:
#         if f_obj["events"][t] is not None and f_obj["fastq"][t] is not None:
#             tmp_f_obj = {
#                 "models": f_obj["models"],
#                 "median": f_obj["median"],
#                 "channel": f_obj["channel"],
#                 "file_id": f_obj["file_id"],
#                 "events": f_obj["events"][t],
#                 "fastq": f_obj["fastq"][t]
#             }
#             tmp_file_data.append([(k, v) for k, v in tmp_f_obj.items()])       
#     filename = "{0}.{1}.marshal".format(args["output_basename"], t)
#     marshal.dump(tuple(tmp_file_data), open(filename, 'wb'))