In [1]:
from nbwrapper import getargs
from multiprocessing import Pool
import pickle
import h5py
import pandas
from pprint import pprint
import re
import os.path
import sys
from itertools import repeat

In [2]:
# Fetch the notebook arguments through nbwrapper; the result is indexed with
# string keys ("f5_path", "ncores", ...) further down.
args = getargs()



In [3]:
# Show the working directory the notebook is executed from.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [4]:
### for testing only
# NOTE: the single active assignment at the bottom of this cell replaces the
# value obtained from getargs(); comment it out to run with real arguments.
# Exactly one preset should be active at a time.
# args = {
#     "f5_path": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/processed/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events", 
#     "ncores": 24,
# }
# args = {
#     "f5_path": "/storageNGS/ngs1/projects/other/NanoporeData/burnin_downloads/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events", 
#     "ncores": 60,
# }
# args = {
#     "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/MAP006-1_100/", ## path to processed f5-files
#     "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.events", 
#     "ncores": 60,
#     "nmers": 6
# }
args = {
    "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/MAP006-1_5000/", ## path to processed f5-files
    "output_basename": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_5000.events", 
    "ncores": 80,
    "nmers": 6
}

In [5]:
# Validate the arguments up front so that a bad value fails here with a
# readable message instead of deep inside the worker pool.
assert os.path.isdir(args["f5_path"]), \
    "f5_path is not a directory: {0}".format(args["f5_path"])
# Normalise to int (presumably the value may arrive as a string -- confirm).
args["ncores"] = int(args["ncores"])
# Pool() needs at least one worker; a plain truthiness check would let
# negative values through.
assert args["ncores"] > 0, "ncores must be a positive integer"

In [6]:
# Factor applied to event start/length values when they are converted to
# integer positions (presumably the sampling rate in Hz -- TODO confirm).
SRATE = 5000
# Taken from the notebook arguments (presumably the k-mer length -- confirm).
NMERS = args["nmers"]
# Read types looked up below the Basecall_2D_000 group of every fast5 file.
TYPES = ["template", "complement", "2D"]


In [7]:
# Collect every *.fast5 file below f5_path.
# Done in Python rather than via `!find ... | grep fast5`: the path is not
# passed through a shell (spaces/special characters are safe) and only real
# files with the .fast5 extension are returned, not directories or paths that
# merely contain the substring "fast5".
files = sorted(
    os.path.join(dirpath, name)
    for dirpath, _dirnames, filenames in os.walk(args["f5_path"])
    for name in filenames
    if name.endswith(".fast5")
)
print(len(files))

5000


In [8]:
def get_file_and_channel(filename):
    """Extract the file number and channel number from a fast5 file name.

    The name must contain the pattern ``ch<channel>_file<file>_``
    (e.g. ``..._ch12_file34_strand.fast5``); the first occurrence is used.

    Parameters
    ----------
    filename : str
        File name or path to parse.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)``.

    Raises
    ------
    ValueError
        If the pattern does not occur in ``filename``.
    """
    result = re.search(r'ch(\d+)_file(\d+)_', filename)
    if result is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            "cannot extract channel/file id from {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in result.groups())
    return file_id, channel_id

In [9]:
def kmer2move(prev_kmer, curr_kmer):
    """Calculate the shift between two consecutive k-mers.

    The shift ``i`` is the smallest number of positions such that the last
    ``k - i`` characters of ``prev_kmer`` equal the first ``k - i``
    characters of ``curr_kmer``. If multiple shifts are possible (repeats),
    the minimal possible shift is assumed.

    Parameters
    ----------
    prev_kmer : str or bytes or None
        Previous k-mer, or None for the first position.
    curr_kmer : str or bytes
        Current k-mer; must have the same length as ``prev_kmer``.

    Returns
    -------
    int
        0 for the first position, otherwise a value in ``0..k``. ``k`` is
        returned when the two k-mers do not overlap at all.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    l = len(prev_kmer)
    for i in range(l):
        if prev_kmer[i:] == curr_kmer[:l-i]:
            return i
    # No overlap: the k-mer was replaced completely. Report the full length
    # instead of implicitly returning None.
    return l

In [10]:
def process_metrichor_file(file):
    """
    Read one Metrichor-basecalled fast5 file with h5py and extract its
    events, fastq records and metadata.

    Parameters
    ----------
    file : str
        Path of the fast5 file. Channel and file number are parsed from the
        name by get_file_and_channel.

    Returns
    -------
    dict or None
        None if the file cannot be opened. Otherwise a dict with the keys
        "channel", "file_id", "events" and "fastq". "events" maps
        "template", "complement" and "2D" to a pandas.DataFrame (built from
        None, i.e. without rows, when the read type is missing); "fastq"
        maps every entry of TYPES to the fastq record as str, or None if
        the file does not contain it.
    """

    tmp_out = []

    def log(tmp_out):
        # Logging is disabled; re-enable the two lines below for debugging.
        pass
        # print("\n".join(tmp_out) + "\n")
        # sys.stdout.flush()

    file_id, ch_id = get_file_and_channel(file)
    tmp_out.append("processing file {0} channel {1}".format(
        file_id, ch_id))

    try:
        f5 = h5py.File(file, 'r')
    except OSError:
        tmp_out.append("Unable to open file")
        log(tmp_out)
        return None

    def process_1d(t):
        """Return the events of 1D read `t` as a list of dicts, or None if
        the file has no event table for it."""
        try:
            # Load the whole table into memory once instead of reading it
            # row by row from the HDF5 dataset.
            raw_events = f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Events'.format(t)][...]
        except KeyError:
            return None
        events = []
        tmp_out.append("  >> processing {0} events".format(t))
        for raw_ev in raw_events:
            # Columns are addressed by position: 0, 1, 2 and 3 are stored as
            # mean, start, stdv and (added to start) end; 4 is the k-mer and
            # 6 the move. NOTE(review): confirm against the fast5 schema.
            ev = {}
            ev["mean"] = raw_ev[0]
            ev["start"] = round(float(raw_ev[1]) * SRATE)
            ev["stdv"] = raw_ev[2]
            # inclusive end position, hence the -1
            ev["end"] = round((float(raw_ev[1]) + float(raw_ev[3])) * SRATE) - 1
            ev["kmer"] = bytes(raw_ev[4]).decode('utf-8')
            ev["move"] = int(raw_ev[6])
            events.append(ev)
        return events

    def process_2d(all_events):
        """generate 2d read from alignment and the 1d reads"""
        try:
            aln = f5['/Analyses/Basecall_2D_000/BaseCalled_2D/Alignment'][...]
        except KeyError:
            return None
        tmp_out.append("  >> processing {0} events".format("2D"))
        events = []
        prev_kmer = None
        for pos in aln:
            # Each alignment row holds the index of the template event, the
            # index of the complement event and the called k-mer.
            ids = {"template": pos[0], "complement": pos[1]}
            kmer = pos[2]
            move = kmer2move(prev_kmer, kmer)
            prev_kmer = kmer
            ev = {}
            ev["move"] = move
            ev["kmer"] = bytes(kmer).decode('utf-8')
            for t, tmp_id in ids.items():
                for f in ["mean", "start", "stdv", "end"]:
                    # a negative index means this strand has no event here
                    ev["{0}.{1}".format(t, f)] = None if tmp_id < 0 else all_events[t][tmp_id][f]
            events.append(ev)
        return events

    # Everything that touches the HDF5 file happens inside this block so the
    # handle is released even if one of the readers raises.
    with f5:
        fastq = {}
        for t in TYPES:
            try:
                fastq[t] = bytes(f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Fastq'.format(t)][...]).decode('utf-8')
            except KeyError:
                fastq[t] = None

        all_events = {t: process_1d(t) for t in ["template", "complement"]}
        all_events["2D"] = process_2d(all_events)

    all_events = {t: pandas.DataFrame(events) for t, events in all_events.items()}

    f_obj = {
        "channel": ch_id,
        "file_id": file_id,
        "events": all_events,
        "fastq": fastq
    }
    log(tmp_out)
    return f_obj
        


In [11]:
# Worker pool sized by the "ncores" argument; used by the next cell.
p = Pool(args["ncores"])

In [12]:
# Process all files in parallel and collect the per-file results in
# file_data (order is arbitrary; unopenable files contribute None).
print("Make Events: ")
file_data = []
n_files = len(files)
try:
    for i, res in enumerate(p.imap_unordered(process_metrichor_file, files), 1):
        file_data.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i/float(n_files)))
        sys.stdout.flush()  # make the progress line visible immediately
    p.close()
except KeyboardInterrupt:
    # Manual abort: stop the workers and keep what has been collected so far.
    p.terminate()
except BaseException:
    # A worker failed: do not leave the worker processes running.
    p.terminate()
    raise
finally:
    # The pool has been closed or terminated on every path, so join is safe.
    p.join()

Make Events: 
done 100.000000%

In [13]:
# Write one pickle per read type, containing only the reads that provide
# data for that type.
for t in TYPES:
    tmp_file_data = []
    for f_obj in file_data:
        # process_metrichor_file returns None for files it could not open;
        # skip those instead of failing on f_obj["events"].
        if f_obj is None:
            continue
        if f_obj["events"][t] is not None and f_obj["fastq"][t] is not None:
            tmp_f_obj = {
                "channel": f_obj["channel"],
                "file_id": f_obj["file_id"],
                "events": f_obj["events"][t],
                "fastq": f_obj["fastq"][t]
            }
            tmp_file_data.append(tmp_f_obj)
    filename = "{0}.{1}.pickle".format(args["output_basename"], t)
    # Close the output file deterministically so the data is flushed to disk.
    with open(filename, 'wb') as out_file:
        pickle.dump(tmp_file_data, out_file, protocol=2)