In [1]:
from nbwrapper import getargs
from multiprocessing import Pool
import pickle
import h5py
import pandas
from pprint import pprint
import re
import os.path

In [2]:
# Fetch the run-time arguments of this notebook through nbwrapper.
# NOTE(review): presumably a dict-like with the keys "f5_path", "output" and
# "ncores" (those are the keys read further down) — confirm against nbwrapper.
args = getargs()



In [3]:
# Print the current working directory.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks/02_raw-experiments/david_analysis


In [4]:
### for testing only
# NOTE(review): this assignment replaces whatever `args` held before it,
# so it has to be removed (or skipped) for a real run.
args = dict(
    f5_path="/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/processed/",  ## path to processed f5-files
    output="/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.pickle",
    ncores=24,
)

In [5]:
# Validate the arguments before any work is started.
assert os.path.isdir(args["f5_path"]), \
    "f5_path is not a directory: {0!r}".format(args["f5_path"])
# Normalise ncores to int (int() also accepts numeric strings).
args["ncores"] = int(args["ncores"])
# A plain truthiness check would let negative values through; Pool() needs
# at least one worker process.
assert args["ncores"] > 0, "ncores must be a positive integer"

In [6]:
# Factor applied to event start and start+length in process_metrichor_file
# before rounding.
# NOTE(review): presumably the sampling rate in samples per second — confirm.
SRATE = 5000
# Not referenced in the visible cells of this notebook.
# NOTE(review): presumably the k-mer length of the basecaller model — confirm.
NMERS = 5

In [7]:
# Collect all *.fast5 files below f5_path (recursively) in pure Python.
# The former `!find ... | grep fast5` went through the shell, broke on paths
# containing whitespace and also matched directories or any path that merely
# contains "fast5". Sorting makes the processing order deterministic.
files = sorted(
    os.path.join(dirpath, fname)
    for dirpath, _dirnames, fnames in os.walk(args["f5_path"])
    for fname in fnames
    if fname.endswith(".fast5")
)
print(len(files))

81


In [8]:
def get_file_and_channel(filename):
    """
    extracts the file and channel number from a fast5 file name.

    The name has to contain the pattern ch<channel>_file<file>_ ;
    the first occurrence in `filename` is used.

    Returns the tuple (file_id, channel_id), both as int.
    Raises ValueError if the pattern does not occur in `filename`.
    """
    result = re.search(r'ch(\d+)_file(\d+)_', filename)
    if result is None:
        # Without this check the failure shows up as an AttributeError on
        # None that does not mention the offending file name.
        raise ValueError(
            "cannot parse channel/file id from {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in result.groups())
    return file_id, channel_id

In [9]:
def kmer2move(prev_kmer, curr_kmer):
    """calculates the shift between two kmers.
    If multiple shifts are possible (repeats),
    the minimal possible shift is assumed.

    prev_kmer may be None (first position), in which case 0 is returned.
    Both kmers must have the same length (checked with an assert) and may be
    str or bytes, as only len(), slicing and comparison are used.
    If the kmers do not overlap at all, the kmer length is returned.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    k = len(prev_kmer)
    for shift in range(k):
        # suffix of the previous kmer has to equal the prefix of the current
        if prev_kmer[shift:] == curr_kmer[:k - shift]:
            return shift
    # No overlap found: the smallest consistent shift is the full kmer
    # length (previously the function fell through and returned None).
    return k

In [10]:
def process_metrichor_file(file):
    """
    reads one fast5 file with h5py and extracts the events of the
    2D alignment together with some metadata.

    Everything is read from below /Analyses/Basecall_2D_000. For every
    row of BaseCalled_2D/Alignment one record is built that holds the
    kmer, its move relative to the previous kmer (see kmer2move) and,
    for template and complement, the event index plus mean, start, stdv
    and end of the referenced event. Fields 0 to 3 of an event record
    are taken as mean, start, stdv and length. start and start + length
    are multiplied by SRATE and rounded; one is subtracted from end.
    A negative event index yields None for all four values.

    Returns a dict with the keys "channel", "file_id", "events"
    (pandas.DataFrame, one row per alignment row) and "fastq" (the 2D
    fastq entry as str). Returns None if the file contains no fastq
    entry at all or no 2D fastq entry. A short progress message is
    printed in every case.
    """
    file_id, ch_id = get_file_and_channel(file)
    tmp_out = ["processing file {0} channel {1}".format(file_id, ch_id)]

    # All required data is copied into memory inside the with-block, so the
    # HDF5 handle gets closed on every exit path (it used to stay open).
    with h5py.File(file, 'r') as f5:
        fastq = {}
        for t in ["template", "complement", "2D"]:
            try:
                fastq[t] = bytes(f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Fastq'.format(t)][...]).decode('utf-8')
            except KeyError:
                fastq[t] = None

        if not any(fastq.values()):
            skip_msg = "\tempty file, skipped. "
        elif not fastq["2D"]:
            ## 1d read(s) only: not implemented
            skip_msg = "\tno 2d reads, skipped."
        else:
            skip_msg = None
            # Reading each dataset in one go avoids a separate HDF5 access
            # for every alignment row further down.
            events = {
                t: f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Events'.format(t)][...]
                for t in ["template", "complement"] if fastq[t]
            }
            aln = f5['/Analyses/Basecall_2D_000/BaseCalled_2D/Alignment'][...]

    if skip_msg is not None:
        tmp_out.append(skip_msg)
        print("\n".join(tmp_out) + "\n")
        return None

    ## 2d read
    rows = []
    prev_kmer = None
    for pos in aln:
        ids = {}
        ids["template"], ids["complement"], kmer = pos
        ev = {
            "move": kmer2move(prev_kmer, kmer),
            "kmer": bytes(kmer).decode('utf-8'),
        }
        prev_kmer = kmer
        for t, tmp_id in ids.items():
            ev[t] = tmp_id
            if tmp_id < 0:
                # negative index: no event is looked up for this strand
                mean = start = stdv = end = None
            else:
                tmp_event = events[t][tmp_id]
                mean, stdv = tmp_event[0], tmp_event[2]
                start = round(float(tmp_event[1]) * SRATE)
                end = round((float(tmp_event[1]) + float(tmp_event[3])) * SRATE) - 1
            # keep this key order, it determines the order of the columns
            ev["{0}.mean".format(t)] = mean
            ev["{0}.start".format(t)] = start
            ev["{0}.stdv".format(t)] = stdv
            ev["{0}.end".format(t)] = end
        rows.append(ev)

    f_obj = {
        "channel": ch_id,
        "file_id": file_id,
        "events": pandas.DataFrame(rows),
        "fastq": fastq["2D"]
    }
    print("\n".join(tmp_out) + "\n")
    return f_obj
        


In [11]:
# Worker pool with args["ncores"] processes; the following cell uses it to
# run process_metrichor_file over all files.
p = Pool(args["ncores"])

In [12]:
# Process all files in parallel. file_data holds one entry per input file,
# in the order of `files`; an entry is None where process_metrichor_file
# skipped the file.
try:
    file_data = p.map(process_metrichor_file, files)
except KeyboardInterrupt:
    # Stop the workers and wait for them to exit, then re-raise: swallowing
    # the interrupt left file_data undefined (or stale from an earlier run)
    # for the cell that pickles it.
    p.terminate()
    p.join()
    raise

processing file 17 channel 135
	no 2d reads, skipped.
processing file 19 channel 141
	no 2d reads, skipped.
processing file 25 channel 132
	no 2d reads, skipped.
processing file 33 channel 135
	no 2d reads, skipped.
processing file 13 channel 141
	no 2d reads, skipped.
processing file 32 channel 135
	no 2d reads, skipped.
processing file 3 channel 132
	no 2d reads, skipped.
processing file 36 channel 132
processing file 15 channel 141
processing file 12 channel 141
processing file 23 channel 135
processing file 23 channel 132
processing file 44 channel 132
processing file 8 channel 141
processing file 19 channel 135
processing file 0 channel 135
processing file 10 channel 132
processing file 11 channel 133
processing file 26 channel 132
processing file 25 channel 135
processing file 2 channel 133
processing file 17 channel 132
processing file 38 channel 132
processing file 35 channel 132
























processing file 3 channel 142
processing file 12 channel 149
	no 2d reads, 

In [13]:
# Write the per-file results to the output path. The with-block makes sure
# the file is flushed and closed (the handle used to be left open).
with open(args["output"], 'wb') as out_fh:
    pickle.dump(file_data, out_fh)