In [None]:
import re
%load_ext autoreload
%autoreload 1

In [None]:
def length_standard(n):
    """Return a ruler string of exactly ``n`` characters.

    The ruler is meant to be printed above or below a sequence so that
    positions can be read off by eye:

    * every 10th position carries the tens digit of its index
      (wrapping around every 100 positions),
    * every other 5th position carries a ``:``,
    * all remaining positions carry a ``.``.

    Example: ``length_standard(25)`` returns
    ``"0....:....1....:....2...."``.
    """
    out = []
    for i in range(n):
        if i % 10 == 0:
            # Integer arithmetic on purpose: the former float expression
            # ``(i - 100*(i/100.)) / 10.`` always evaluated to ~0.0 and
            # emitted the multi-character string "0.0", which broke the
            # one-character-per-position layout of the ruler.
            out.append(str((i % 100) // 10))
        elif i % 5 == 0:
            out.append(":")
        else:
            out.append(".")
    return "".join(out)

# Fast5 data reading routines

In [None]:
import h5py
import numpy as np
import pandas as pd
from pprint import pprint
import sys
sys.path.append("../../../tools/fast5py/")

In [None]:
def get_file_and_channel(filename):
    """Extract the file number and channel number from a file name.

    The name must contain a fragment of the form
    ``ch<digits>_file<digits>_`` (it may appear anywhere in the string,
    so full paths are accepted).

    Parameters
    ----------
    filename : str
        File name or path to parse.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)`` -- note the order: file first, channel
        second, although the channel comes first in the name.

    Raises
    ------
    ValueError
        If ``filename`` does not contain the expected fragment.
    """
    result = re.search(r'ch(\d+)_file(\d+)_', filename)
    if result is None:
        # Fail with a message that names the offending file instead of
        # the opaque AttributeError raised by calling .group() on None.
        raise ValueError(
            "cannot extract channel/file ids from file name: {0!r}".format(
                filename))
    channel_id, file_id = (int(group) for group in result.groups())
    return file_id, channel_id

#### Events table
file	strand	mean	start	stdv	length	model_state	model_level	move	p_model_state	mp_model_state	p_mp_model_state	p_A	p_C	p_G	p_T	raw_index

To begin with, only look at the "template" sequence.


In [None]:
def kmer2move(prev_kmer, curr_kmer):
    """Calculate the shift (number of positions moved) between two k-mers.

    The shift is the smallest ``i`` such that the last ``k - i`` symbols
    of ``prev_kmer`` equal the first ``k - i`` symbols of ``curr_kmer``.
    If multiple shifts are possible (repeats), the minimal possible
    shift is assumed.

    Parameters
    ----------
    prev_kmer : sequence or None
        The preceding k-mer (``str`` or ``bytes``), or ``None`` for the
        first position.
    curr_kmer : sequence
        The current k-mer; must have the same length as ``prev_kmer``.

    Returns
    -------
    int
        ``0`` for the first position (``prev_kmer is None``), otherwise
        a value in ``0..k``. ``k`` means the two k-mers do not overlap
        at all.

    Raises
    ------
    AssertionError
        If the two k-mers differ in length.
    """
    if prev_kmer is None:
        return 0  # first position
    assert len(prev_kmer) == len(curr_kmer)
    l = len(prev_kmer)
    for i in range(l):
        if prev_kmer[i:] == curr_kmer[:l - i]:
            return i
    # No suffix/prefix overlap at all: the whole k-mer was skipped.
    # (Previously the function fell off the loop and returned None.)
    return l

def process_metrichor_files(files, verbose=False):
    """
    Read every filename in `files` with h5py and extract the 2D
    events and metadata.

    For each file, the FASTQ records of the "template", "complement"
    and "2D" basecalls are looked up under
    ``/Analyses/Basecall_2D_000``. Files without any FASTQ record and
    files without a 2D record are skipped (a message is printed).

    For a 2D read, one row is produced per entry of the 2D alignment
    table. Each row holds the k-mer, the move relative to the previous
    k-mer (see `kmer2move`) and, for both template and complement, the
    event index plus that event's mean, stdv, and start/end positions.
    The fields are ``None`` where the alignment references no event
    (negative index).

    Start and end are the event's start time and start + length,
    multiplied by the module-level ``SRATE`` and rounded; the end is
    inclusive (hence the ``- 1``).
    NOTE(review): ``SRATE`` is not defined in this cell -- presumably
    the sampling rate, converting seconds to raw-sample indices; confirm.

    Parameters
    ----------
    files : iterable of str
        Paths of the fast5 files; each name must be parseable by
        `get_file_and_channel`.
    verbose : bool, optional
        If true, print a progress line per file.

    Returns
    -------
    list of dict
        One dict per processed file with the keys ``"channel"``,
        ``"file_id"``, ``"events"`` (a `pandas.DataFrame`, one row per
        alignment entry) and ``"fastq"`` (the 2D FASTQ record as str).
    """
    basecall_path = '/Analyses/Basecall_2D_000/BaseCalled_{0}/{1}'

    file_data = []
    for file in files:
        types = ["template", "complement", "2D"]
        file_id, ch_id = get_file_and_channel(file)
        if verbose:
            print("processing file {0} channel {1}".format(
                file_id, ch_id))

        # The context manager guarantees the HDF5 handle is closed, also
        # on the `continue` paths below and when an exception is raised.
        with h5py.File(file, 'r') as f5:
            fastq = {}
            for t in types:
                try:
                    fastq[t] = bytes(
                        f5[basecall_path.format(t, 'Fastq')][...]
                    ).decode('utf-8')
                except KeyError:
                    fastq[t] = None

            if all(not x for x in fastq.values()):
                print("empty file, skipped. ")
                continue

            if not fastq["2D"]:
                # 1d read(s) only -- not implemented
                print("no 2d reads, skipped.")
                continue

            # Load the event tables into memory once; indexing an HDF5
            # dataset row by row inside the loop below would issue one
            # read per event.
            events = {}
            for t in ["template", "complement"]:
                if fastq[t]:
                    events[t] = f5[basecall_path.format(t, 'Events')][...]

            # 2d read: walk the alignment between template and complement.
            aln = f5[basecall_path.format('2D', 'Alignment')][...]
            events_2d = []
            prev_kmer = None
            for pos in aln:
                ids = {}
                ids["template"], ids["complement"], kmer = pos
                move = kmer2move(prev_kmer, kmer)
                prev_kmer = kmer
                ev = {}
                ev["move"] = move
                ev["kmer"] = bytes(kmer).decode('utf-8')
                for t, tmp_id in ids.items():
                    # A negative index means this strand has no event
                    # at this alignment position.
                    tmp_event = None if tmp_id < 0 else events[t][tmp_id]
                    ev[t] = tmp_id
                    if tmp_event is None:
                        mean = start = stdv = end = None
                    else:
                        # Event fields by position: 0 mean, 1 start,
                        # 2 stdv, 3 length.
                        ev_start = float(tmp_event[1])
                        ev_length = float(tmp_event[3])
                        mean = tmp_event[0]
                        start = round(ev_start * SRATE)
                        stdv = tmp_event[2]
                        end = round((ev_start + ev_length) * SRATE) - 1
                    ev["{0}.mean".format(t)] = mean
                    ev["{0}.start".format(t)] = start
                    ev["{0}.stdv".format(t)] = stdv
                    ev["{0}.end".format(t)] = end

                events_2d.append(ev)

        f_obj = {
            "channel": ch_id,
            "file_id": file_id,
            "events": pd.DataFrame(events_2d),
            "fastq": fastq["2D"]
        }
        file_data.append(f_obj)

    return file_data

# Alignment tools

In [None]:
def prepare_sam(basename):
    """Convert ``{basename}.sam`` to BAM, then sort and index it with samtools.

    Runs three shell commands through IPython's ``!`` syntax, so this
    function only works inside an IPython/Jupyter session. ``basename``
    is the path of the SAM file without its ``.sam`` extension and is
    interpolated into the command lines as-is.

    Files written: ``{basename}.bam`` (by the first command); the third
    command indexes ``{basename}.sorted.bam``, which the sort step is
    expected to have produced.

    NOTE(review): ``basename`` is not quoted -- names containing spaces
    or shell metacharacters will break the commands; confirm callers
    only pass trusted, simple paths.
    NOTE(review): ``samtools sort <in.bam> <out.prefix>`` looks like the
    legacy (pre-1.0) samtools calling convention; newer releases expect
    ``-o`` -- confirm against the installed samtools version.
    """
    # SAM -> BAM
    !samtools view -S -b {basename}.sam > {basename}.bam
    # Sort the BAM; the second argument is an output prefix
    !samtools sort {basename}.bam {basename}.sorted
    # Build the index for the sorted BAM
    !samtools index {basename}.sorted.bam