In [2]:
from ipyparallel import Client
# Connect to the controller of the ipyparallel cluster started under the 'sge' profile.
# NOTE(review): the recorded output of this cell is an OSError (connection file not found),
# so a cluster for this profile must be running before the notebook is executed.
rc = Client(profile='sge')
# View used below for sync_imports, push and the parallel decorator.
dview = rc[:]

OSError: Connection file '~/.ipython/profile_sge/security/ipcontroller-client.json' not found.
You have attempted to connect to an IPython Cluster but no Controller could be found.
Please double-check your configuration and ensure that a cluster is running.

In [2]:
from nbwrapper import getargs
import pickle

with dview.sync_imports():
    import h5py
    import pandas
    from pprint import pprint
    import re

importing h5py on engine(s)
importing pandas on engine(s)
importing pprint from pprint on engine(s)
importing re on engine(s)


In [3]:
# Fetch the notebook arguments via nbwrapper.
# NOTE(review): a later cell reassigns `args` with hard-coded test values, so this result is discarded.
args = getargs()



In [4]:
# NOTE(review): debugging leftover that only shows the current working directory; nothing below reads it.
!pwd

/home/sturm/repos/uni/nanopore/own/notebooks/03_pipeline


In [5]:
### for testing only
## NOTE(review): this unconditionally replaces the value returned by `getargs()`
## with hard-coded absolute paths; remove or guard it before a real run.
args = {
    "f5_path": "/home/ibis/gregor.sturm/nanopore/NanoporeData/burnin_downloads/", ## directory searched for fast5 files
    "fasta_ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/LambdaReference/lambda_ref.fasta", ## path to the reference FASTA
    "output": "file_data_lambda.pickle", ## presumably the output pickle filename; not used in the visible cells
    "verbose": True ## enables the per-file progress message in process_metrichor_file
}

In [6]:
# Factor applied to event start/length values to obtain the rounded start/end columns.
# Presumably the sampling rate in Hz (seconds -> sample index) — TODO confirm units.
SRATE = 5000
# Presumably the k-mer length of the basecalling model — TODO confirm.
# Pushed to the engines but not read by any of the visible cells.
NMERS = 5

In [7]:
# List every path below args["f5_path"] that contains the substring "fast5";
# the shell output is captured as a list with one entry per line.
files = !find {args["f5_path"]} | grep fast5
# Number of captured paths.
print(len(files))

6434


In [8]:
# Load the reference sequence as one string.
# The path comes from args["fasta_ref"]; a bare `fasta_ref` variable is never
# defined in this notebook. Header lines (starting with ">") are dropped and
# all remaining lines are concatenated, so a line-wrapped FASTA yields the
# full sequence rather than only its first line.
with open(args["fasta_ref"]) as fasta:
    ref = "".join(line.strip() for line in fasta if not line.startswith(">"))

In [9]:
# Copy the configuration, reference and constants to the engines so that remotely
# executed code (e.g. process_metrichor_file, which reads args and SRATE) can use them.
dview.push(dict(args=args, ref=ref, SRATE=SRATE, NMERS=NMERS))

<AsyncResult: finished>

In [10]:
%%px
def get_file_and_channel(filename):
    """Parse the file and channel numbers encoded in a fast5 filename.

    The name (or path) must contain ``ch<digits>_file<digits>_``.

    Parameters
    ----------
    filename : str
        File name or path to inspect.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)`` - note the order is file first, channel second.

    Raises
    ------
    ValueError
        If the pattern does not occur in ``filename``.
    """
    match = re.search(r'ch(\d+)_file(\d+)_', filename)
    if match is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            "cannot parse channel/file id from filename: {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in match.groups())
    return file_id, channel_id

In [11]:
%%px
def kmer2move(prev_kmer, curr_kmer):
    """Calculate the shift (number of positions moved) between two kmers.

    The shift is the smallest ``i`` such that the last ``len - i`` symbols of
    ``prev_kmer`` equal the first ``len - i`` symbols of ``curr_kmer``. If
    several shifts are possible (repeats), the minimal one is assumed.

    Parameters
    ----------
    prev_kmer : str or bytes or None
        Previous kmer; ``None`` marks the first position.
    curr_kmer : str or bytes
        Current kmer, same length as ``prev_kmer``.

    Returns
    -------
    int
        0 for the first position, otherwise the minimal shift. When the two
        kmers do not overlap at all, the full kmer length is returned.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    k = len(prev_kmer)
    for shift in range(k):
        if prev_kmer[shift:] == curr_kmer[:k - shift]:
            return shift
    # No suffix/prefix overlap: the kmer moved by its whole length
    # (previously this fell through and returned None).
    return k

In [12]:
@dview.parallel(block=True)
def process_metrichor_file(file):
    """
    Extract the 2D-read events and metadata from one basecalled fast5 file.

    The function is executed through ``dview.parallel`` and relies on names
    from outside its body: ``h5py``, ``pandas``, ``args``, ``SRATE``,
    ``get_file_and_channel`` and ``kmer2move``.

    Parameters
    ----------
    file : str
        Path of the fast5 file; its name must encode the channel and file id.

    Returns
    -------
    dict or None
        ``None`` when the file contains no basecalls at all or no 2D read
        (1D-only reads are not implemented). Otherwise a dict with the keys
        ``"channel"``, ``"file_id"``, ``"events"`` (a DataFrame with one row
        per entry of the 2D alignment table) and ``"fastq"`` (the 2D FASTQ
        record as text).
    """
    types = ["template", "complement", "2D"]
    file_id, ch_id = get_file_and_channel(file)
    if args["verbose"]:
        print("processing file {0} channel {1}".format(
            file_id, ch_id))

    # The context manager closes the HDF5 handle on every exit path,
    # including the early returns for skipped files.
    with h5py.File(file, 'r') as f5:
        fastq = {}
        for t in types:
            try:
                fastq[t] = bytes(f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Fastq'.format(t)][...]).decode('utf-8')
            except KeyError:
                fastq[t] = None

        if not any(fastq.values()):
            print("\tempty file, skipped. ")
            return None

        if not fastq["2D"]:
            # 1d read(s) only -- not implemented
            print("\tno 2d reads, skipped.")
            return None

        # Read the event tables and the alignment into memory once, so the
        # loop below does not go back to the file for every row.
        events = {}
        for t in ["template", "complement"]:
            if fastq[t]:
                events[t] = f5['/Analyses/Basecall_2D_000/BaseCalled_{0}/Events'.format(t)][...]
        aln = f5['/Analyses/Basecall_2D_000/BaseCalled_2D/Alignment'][...]

    # 2d read: one output row per alignment entry.
    events_2d = []
    prev_kmer = None
    for pos in aln:
        ids = {}
        ids["template"], ids["complement"], kmer = pos
        move = kmer2move(prev_kmer, kmer)
        prev_kmer = kmer
        ev = {}
        ev["move"] = move
        ev["kmer"] = bytes(kmer).decode('utf-8')
        for t, tmp_id in ids.items():
            # A negative index means no event of this strand is aligned here.
            tmp_event = None if tmp_id < 0 else events[t][tmp_id]
            ev[t] = tmp_id
            if tmp_event is None:
                mean = start = stdv = end = None
            else:
                # Event fields are read by position: 0 -> mean, 1 -> start,
                # 2 -> stdv, 3 -> added to start to obtain the end.
                mean = tmp_event[0]
                stdv = tmp_event[2]
                start = round(float(tmp_event[1]) * SRATE)
                end = round((float(tmp_event[1]) + float(tmp_event[3])) * SRATE) - 1
            ev["{0}.mean".format(t)] = mean
            ev["{0}.start".format(t)] = start
            ev["{0}.stdv".format(t)] = stdv
            ev["{0}.end".format(t)] = end

        events_2d.append(ev)

    f_obj = {
        "channel": ch_id,
        "file_id": file_id,
        "events": pandas.DataFrame(events_2d),
        "fastq": fastq["2D"]
    }
    return f_obj
        


In [13]:
# Run the extraction on the engines; entries are None for files that were skipped.
# NOTE(review): only the first 100 of the listed files are processed — presumably a
# test-run limit; confirm before a full run.
file_data = process_metrichor_file.map(files[:100])

In [14]:
# NOTE(review): prints the entire result list, including every events DataFrame;
# consider showing a short summary instead.
print(file_data)

[{'file_id': 0, 'channel': 101, 'events':       complement  complement.end  complement.mean  complement.start  \
0           6333         1970734        72.609076           1970720   
1           6332         1970719        69.428828           1970645   
2           6331         1970644        73.508440           1970608   
3           6330         1970607        65.895082           1970568   
4           6329         1970567        72.529123           1970498   
5           6328         1970497        76.606084           1970388   
6           6327         1970387        68.686890           1970370   
7           6326         1970369        74.035688           1970325   
8           6325         1970324        70.677662           1970280   
9           6324         1970279        73.138730           1970195   
10          6323         1970194        72.740763           1970150   
11          6322         1970149        77.619071           1970127   
12          6321         1970126   

In [15]:
# file_data = [x for x in file_data if x is not None]