In [3]:
import re

In [2]:
def length_standard(n):
    """Build a ruler string of exactly ``n`` characters.

    Every position ``i`` in ``range(n)`` contributes one character:

    * the tens digit of ``i`` (wrapping every 100 positions) when ``i`` is a
      multiple of ten,
    * ``":"`` when ``i`` is any other multiple of five,
    * ``"."`` otherwise.

    Example: ``length_standard(12)`` gives ``"0....:....1."``.

    Parameters
    ----------
    n : int
        Number of ruler characters to produce.

    Returns
    -------
    str
        The ruler; empty when ``n`` is zero or negative.
    """
    marks = []
    for i in range(n):
        if i % 10 == 0:
            # Integer arithmetic keeps the mark a single digit so the ruler
            # stays aligned with the sequence printed underneath it.
            marks.append(str((i % 100) // 10))
        elif i % 5 == 0:
            marks.append(":")
        else:
            marks.append(".")
    return "".join(marks)

# FAST5 data reading routines

In [4]:
def get_file_and_channel(filename):
    """Extract the file number and channel number encoded in a file name.

    The name must contain a fragment of the form ``_ch<digits>_file<digits>_``.

    Parameters
    ----------
    filename : str
        Name (or path) to search.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)`` -- note that the file number comes first
        even though the channel appears first in the name.

    Raises
    ------
    AttributeError
        If the name does not contain the expected fragment (the search
        yields ``None``).
    """
    match = re.search(r'_ch(\d+)_file(\d+)_', filename)
    channel_text, file_text = match.groups()
    return int(file_text), int(channel_text)

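#### Events-Table
Tab-separated columns of the `poretools events` output, in order (the code below indexes them from 0):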
file	strand	mean	start	stdv	length	model_state	model_level	move	p_model_state	mp_model_state	p_mp_model_state	p_A	p_C	p_G	p_T	raw_index

In [None]:
def process_metrichor_files(files):
    """
    reads every filename in files with poretools and 
    extracts events and metadata
    """
    
    file_data = []
    for file in files: 
        f_obj = {}
        file_id, ch_id = get_file_and_channel(file)
        f_obj["channel"] = ch_id
        f_obj["file_id"] = file_id

        called_seq = !poretools fasta {file} | grep template -A1 | grep -v ">"
        if not called_seq: 
            print("Empty file: {0}. Skipped.".format(file))
            continue
        f_obj["called_seq"] = called_seq[0]

        events = !poretools events {file} | grep template
        events = [line.split("\t") for line in events]

        file_events = []
        for ev in events: 
            r_ev = dict()
            r_ev["start"] = round(float(ev[3]) * SRATE)
            r_ev["end"] = round(r_ev["start"] + float(ev[5]) * SRATE)
            r_ev["mp_kmer"] = ev[10] #maximum posterior
            r_ev["kmer"] = ev[6] #model state
            r_ev["move"] = int(ev[8])
            file_events.append(r_ev)

        ## check if raw data is available
        file_id, ch_id = get_file_and_channel(file)

        f_obj["channel"] = ch_id
        f_obj["file_id"] = file_id
        f_obj["events"] = file_events
        file_data.append(f_obj)
        
    return file_data
    
    

# Align to Reference
## Find kmers and do sequence statistics

In [None]:
def find_seq_start(events, start):
    """Locate the event at which the cumulative move count reaches *start*.

    Walks *events* in order, summing their ``"move"`` values. The first
    event whose move brings the running total to at least *start* is
    reported.

    Returns
    -------
    tuple of (int, int) or None
        ``(index, offset)`` where ``index`` is the position of that event and
        ``offset`` is the running total *before* that event minus *start*.
        ``None`` when the total never reaches *start*.
    """
    travelled = 0
    for index, event in enumerate(events):
        step = event["move"]
        if travelled + step >= start:
            return index, travelled - start
        travelled += step
    return None

In [None]:
def gapmove(i_seq, ts, move):
    """Move by *move* non-gap characters in *ts*, respecting gaps.

    Starting at index *i_seq*, characters of *ts* are consumed until *move*
    characters other than ``"-"`` have been passed.

    Parameters
    ----------
    i_seq : int
        Start index into *ts*. A negative value means "before the
        sequence"; in that case *move* is returned unchanged.
    ts : str
        Gapped sequence; ``"-"`` marks a gap.
    move : int
        Number of non-gap characters to advance.

    Returns
    -------
    int or None
        Offset relative to *i_seq* of the position right after the
        *move*-th non-gap character. When that character is the last one
        in *ts*, the offset equals the length of the remaining sequence.
        ``None`` when fewer than *move* non-gap characters remain.
    """
    if i_seq < 0:
        return move

    remainder = ts[i_seq:]
    moved = 0
    for offset, char in enumerate(remainder):
        if moved == move:
            return offset
        if char != "-":
            moved += 1

    # The loop checks the counter only at the start of each iteration, so a
    # move completed by the final character has to be reported here.
    if moved == move:
        return len(remainder)
    return None
        
            

In [None]:
def print_alignment_statistics(alignment, aln_stats):
    """Print query/target coordinates of *alignment* and update running totals.

    Parameters
    ----------
    alignment : object
        Must expose ``query_begin``, ``query_end``, ``query_sequence``,
        ``target_begin``, ``target_end_optimal`` and ``target_sequence``.
    aln_stats : tuple of (number, number)
        Running ``(total_diff, total_target_length)`` from earlier calls.

    Returns
    -------
    tuple
        The totals with this alignment added: the first element grows by
        ``len(target_sequence) - (target_end_optimal - target_begin)`` (the
        value printed as "skipped"), the second by ``len(target_sequence)``.
    """
    running_diff, running_length = aln_stats

    query_line = "    Query: start: {0}, end {1}, length {2}".format(
        alignment.query_begin,
        alignment.query_end,
        len(alignment.query_sequence),
    )
    print(query_line)

    target_start = alignment.target_begin
    target_stop = alignment.target_end_optimal
    target_length = len(alignment.target_sequence)
    # NOTE(review): if target_end_optimal is an inclusive index, the covered
    # span is one longer than (stop - start) -- confirm against the aligner.
    skipped = target_length - (target_stop - target_start)

    target_line = "    Target: start: {0}, end {1}, length {2}, skipped {3}".format(
        target_start, target_stop, target_length, skipped
    )
    print(target_line)

    return (running_diff + skipped, running_length + target_length)

In [None]:
def sequence_identity(alignment):
    """Count identical columns of an alignment.

    The aligned query and aligned target are compared position by position
    over the length of the aligned query.

    Returns
    -------
    tuple of (int, int)
        ``(correctly_mapped, target_length)``: the number of positions where
        both aligned strings carry the same character, and the length of the
        full (unaligned) ``alignment.target_sequence``.
    """
    query_text = str(alignment.aligned_query_sequence)
    target_text = str(alignment.aligned_target_sequence)
    n_columns = len(alignment.aligned_query_sequence)
    matches = sum(
        1 for col in range(n_columns) if query_text[col] == target_text[col]
    )
    return (matches, len(alignment.target_sequence))
            