In [5]:
from nbwrapper import getargs
from multiprocessing import Pool
import pickle
import h5py
import pandas
from pprint import pprint
import re
import os.path
import sys
from itertools import repeat

In [6]:
# Fetch the run arguments through nbwrapper's getargs().
# NOTE(review): the returned structure is not visible here; later cells index it
# like a dict with the keys "f5_path", "output_basename" and "ncores" -- confirm.
args = getargs()



In [7]:
# Shell escape: print the current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [8]:
### Hard-coded arguments, for testing only.
# NOTE(review): this assignment unconditionally replaces whatever `args` held
# before this cell is run.
args = dict(
    f5_path="/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/processed/",  # path to processed f5-files
    output_basename="/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events",
    ncores=24,
)
# Alternative argument set, kept for reference:
# args = dict(
#     f5_path="/storageNGS/ngs1/projects/other/NanoporeData/burnin_downloads/",  # path to processed f5-files
#     output_basename="/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events",
#     ncores=60,
# )

In [9]:
# Validate the arguments before any work is started.
assert os.path.isdir(args["f5_path"]), \
    "f5_path is not an existing directory: {0}".format(args["f5_path"])
# int() accepts both numbers and numeric strings, so either form of "ncores" works.
args["ncores"] = int(args["ncores"])
# A worker pool needs at least one process. A plain truthiness check only
# rejects 0 and would let negative values through.
assert args["ncores"] > 0, \
    "ncores must be a positive integer, got {0}".format(args["ncores"])

In [10]:
# Factor that event start/length values are multiplied with in process_1d()
# before rounding them to integer positions.
# NOTE(review): presumably the sampling rate in Hz -- confirm.
SRATE = 5000
# NOTE(review): not referenced in the cells visible here; presumably the k-mer
# length -- confirm.
NMERS = 5
# Read types that are looked up under BaseCalled_<type> in each file and that
# are written to one pickle file each.
TYPES = ["template", "complement", "2D"]


In [11]:
# Collect every *.fast5 file below the input directory.
# This is done in pure Python instead of `find ... | grep fast5` because the
# shell version breaks on paths containing whitespace and also matches
# directories or any other path that merely contains "fast5".
# The list is sorted so that the processing order is reproducible.
files = sorted(
    os.path.join(dirpath, name)
    for dirpath, _dirnames, filenames in os.walk(args["f5_path"])
    for name in filenames
    if name.endswith(".fast5")
)
print(len(files))

6434


In [12]:
def get_file_and_channel(filename):
    """Extract the file number and channel number from a fast5 file name.

    The name is searched for the pattern ``ch<digits>_file<digits>_``; the
    first occurrence anywhere in `filename` is used.

    Parameters
    ----------
    filename : str
        File name or path to parse.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)``.

    Raises
    ------
    ValueError
        If `filename` does not contain the expected pattern.
    """
    match = re.search(r'ch(\d+)_file(\d+)_', filename)
    if match is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            "cannot extract channel and file number from {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in match.groups())
    return file_id, channel_id

In [13]:
def kmer2move(prev_kmer, curr_kmer):
    """Calculate the shift between two consecutive k-mers.

    The shift is the number of positions `curr_kmer` has advanced relative to
    `prev_kmer`, i.e. the smallest ``i`` for which the last ``k - i`` symbols
    of `prev_kmer` equal the first ``k - i`` symbols of `curr_kmer`.
    If multiple shifts are possible (repeats), the minimal one is assumed.

    Parameters
    ----------
    prev_kmer : str, bytes or None
        Previous k-mer, or None for the first position.
    curr_kmer : str or bytes
        Current k-mer; must have the same length as `prev_kmer`.

    Returns
    -------
    int
        0 for the first position, otherwise a value between 0 and ``k``.
        ``k`` is returned when the two k-mers do not overlap at all.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    length = len(prev_kmer)
    for shift in range(length):
        if prev_kmer[shift:] == curr_kmer[:length - shift]:
            return shift
    # No overlap at all: the k-mer moved by its full length. Without this the
    # function would fall off the loop and return None.
    return length

In [14]:
def process_metrichor_file(file):
    """
    Read one basecalled fast5 file with h5py and extract its fastq records,
    events and metadata.

    Parameters
    ----------
    file : str
        Path of the fast5 file. The name must contain the pattern parsed by
        get_file_and_channel().

    Returns
    -------
    dict
        ``channel`` and ``file_id`` (parsed from the file name),
        ``fastq`` (read type -> fastq string, or None if the file has no such
        record) and ``events`` (read type -> pandas.DataFrame, or None if the
        file has no events/alignment of that type).
    """
    # Log lines are collected and printed with a single print() call at the end.
    # NOTE(review): presumably so that output of parallel workers is not
    # interleaved line by line -- confirm.
    tmp_out = []

    file_id, ch_id = get_file_and_channel(file)
    tmp_out.append("processing file {0} channel {1}".format(
        file_id, ch_id))

    # The context manager closes the HDF5 handle even if processing fails;
    # everything that touches `f5` therefore lives inside this block.
    with h5py.File(file, 'r') as f5:
        fastq = {}
        for t in TYPES:
            try:
                fastq[t] = bytes(f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Fastq'.format(t)][...]).decode('utf-8')
            except KeyError:
                fastq[t] = None

        def process_1d(t):
            """Return the events of 1D read type `t` as a list of dicts, or None if absent."""
            try:
                dataset = f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Events'.format(t)]
            except KeyError:
                return None
            tmp_out.append("  >> processing {0} events".format(t))
            events = []
            # Load the whole table with one read instead of iterating the h5py
            # dataset, which would access the file once per row.
            for raw_ev in dataset[...]:
                start = float(raw_ev[1])
                length = float(raw_ev[3])
                events.append({
                    "mean": raw_ev[0],
                    "start": round(start * SRATE),
                    "stdv": raw_ev[2],
                    # inclusive end position
                    "end": round((start + length) * SRATE) - 1,
                    "kmer": bytes(raw_ev[4]).decode('utf-8'),
                    "move": int(raw_ev[6]),
                })
            return events

        def process_2d(all_events):
            """Generate the 2D read from the alignment and the 1D reads; None if absent."""
            try:
                dataset = f5['/Analyses/Basecall_2D_000/BaseCalled_2D/Alignment']
            except KeyError:
                return None
            tmp_out.append("  >> processing {0} events".format("2D"))
            events = []
            prev_kmer = None
            for pos in dataset[...]:
                ids = (("template", pos[0]), ("complement", pos[1]))
                kmer = pos[2]
                ev = {
                    "move": kmer2move(prev_kmer, kmer),
                    "kmer": bytes(kmer).decode('utf-8'),
                }
                prev_kmer = kmer
                for t, tmp_id in ids:
                    for f in ["mean", "start", "stdv", "end"]:
                        # A negative index means there is no event of that
                        # strand at this alignment position.
                        ev["{0}.{1}".format(t, f)] = None if tmp_id < 0 else all_events[t][tmp_id][f]
                events.append(ev)
            return events

        all_events = {t: process_1d(t) for t in ["template", "complement"]}
        all_events["2D"] = process_2d(all_events)

    # Keep None for missing read types: pandas.DataFrame(None) would create an
    # empty frame and hide the fact that the file has no such events.
    all_events = {
        t: None if events is None else pandas.DataFrame(events)
        for t, events in all_events.items()
    }

    f_obj = {
        "channel": ch_id,
        "file_id": file_id,
        "events": all_events,
        "fastq": fastq
    }
    print("\n".join(tmp_out) + "\n")
    sys.stdout.flush()
    return f_obj
        


In [15]:
# Create the multiprocessing worker pool; the number of worker processes is
# taken from args["ncores"].
p = Pool(args["ncores"])

In [16]:
# Process all files in parallel; Pool.map returns the results in the order of
# `files`.
try:
    file_data = p.map(process_metrichor_file, files)
except KeyboardInterrupt:
    # Stop the workers right away and re-raise: swallowing the interrupt would
    # leave `file_data` undefined (or stale from an earlier run) for the
    # following cells.
    p.terminate()
    p.join()
    raise
else:
    # All results are in; release the worker processes. The pool has to be
    # created again before this cell can be re-run.
    p.close()
    p.join()

processing file 25 channel 154
processing file 6 channel 190
processing file 5 channel 102
processing file 4 channel 179
  >> processing template events
processing file 10 channel 216
  >> processing template events
processing file 13 channel 188
  >> processing template events
processing file 24 channel 236
  >> processing template events
processing file 18 channel 235
  >> processing template events
processing file 11 channel 20
  >> processing template events
  >> processing complement events
processing file 1 channel 219
  >> processing template events
processing file 6 channel 112
  >> processing template events
processing file 8 channel 148
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 22 channel 140
  >> processing template events
processing file 11 channel 114
  >> processing template events
processing file 9 channel 108
  >> processing template events
  >> processing complement events
  >> processing 2D events
proce

In [17]:
# Write one pickle file per read type. A read is only included for a type if it
# has both events and a fastq record of that type.
for t in TYPES:
    tmp_file_data = []
    for f_obj in file_data:
        if f_obj["events"][t] is not None and f_obj["fastq"][t] is not None:
            tmp_f_obj = {
                "channel": f_obj["channel"],
                "file_id": f_obj["file_id"],
                "events": f_obj["events"][t],
                "fastq": f_obj["fastq"][t]
            }
            tmp_file_data.append(tmp_f_obj)
    filename = "{0}.{1}.pickle".format(args["output_basename"], t)
    # Open the output in a context manager so the file is flushed and closed
    # before the next one is written, instead of leaking the handle.
    with open(filename, 'wb') as out_file:
        pickle.dump(tmp_file_data, out_file, protocol=2)

processing file 3 channel 154
  >> processing template events
  >> processing complement events
processing file 0 channel 191
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 1 channel 103
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 8 channel 179
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 15 channel 216
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 16 channel 188
  >> processing template events
processing file 27 channel 236
  >> processing template events
  >> processing complement events
  >> processing 2D events
processing file 20 channel 235
  >> processing template events
processing file 14 channel 20
  >> processing template events
processing file 22 channel 219
  >> processing template events
processing file 0 channel 113
  >> proces