In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import h5py
import numpy as np

In [None]:
# Integer base code -> nucleotide letter; makeOHE uses the letters as the keys
# of the one-hot label dict it returns.
labelBaseMap = dict(enumerate("ACGT"))

def get_reads_dict(filename):
    """Load every entry of the ``Reads`` group of an HDF5 file into memory.

    The file is opened read-only inside a ``with`` block so the handle is
    always released, even if reading fails part-way. Each child value is read
    with ``[()]`` before the file is closed, so the returned data does not
    depend on the file staying open.

    Args:
        filename: Path of the HDF5 file to read.

    Returns:
        A list with one dict per entry of the ``Reads`` group, in the order
        the group yields its keys. Each dict stores the entry name under
        ``'UUID'`` plus one item per child key of that entry.
    """
    reads = []
    with h5py.File(filename, "r") as h5file:
        reads_group = h5file['Reads']
        for read_name in reads_group.keys():
            read_group = reads_group[read_name]
            elem = {'UUID': read_name}
            for key in read_group.keys():
                # [()] reads the whole value out of the file right now.
                elem[key] = read_group[key][()]
            reads.append(elem)
    return reads

def make_dna_list(read):
    """Expand a read's reference entries into one label per position.

    ``read['Ref_to_signal']`` is treated as a list of boundaries. The output
    starts with ``-1`` repeated ``boundaries[0]`` times; then, for every pair
    of consecutive boundaries, the matching ``read['Reference']`` entry is
    repeated once per position between them (nothing is added when the
    difference is zero or negative).

    Args:
        read: Mapping providing the ``'Ref_to_signal'`` and ``'Reference'``
            sequences.

    Returns:
        A flat list of labels; empty when there are no boundaries.
    """
    boundaries = list(read['Ref_to_signal'])
    bases = list(read['Reference'])

    # Positions before the first boundary carry no reference entry.
    labels = [-1] * boundaries[0] if boundaries else []

    for base_idx, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
        # The generator only looks up bases[base_idx] when the span is non-empty.
        labels.extend(bases[base_idx] for _ in range(end - start))
    return labels

def makeOHE(dnalist):
    """One-hot encode a list of base codes into one 0/1 list per base letter.

    Args:
        dnalist: Sequence of base codes. Codes 0-3 are matched against the
            keys of ``labelBaseMap``; any other value (for example ``-1``)
            yields 0 in every output list.

    Returns:
        A dict keyed by the four letters of ``labelBaseMap``. Each value is a
        list of 0/1 ints with the same length as ``dnalist``.
    """
    return {
        labelBaseMap[code]: [1 if value == code else 0 for value in dnalist]
        for code in range(4)
    }

def get_labels(file, read_id):
    """Return the one-hot label dict for one read of an HDF5 file.

    Args:
        file: Path of the HDF5 file, passed to ``get_reads_dict``.
        read_id: Position of the read in the list returned by
            ``get_reads_dict`` (a list index, not a UUID).

    Returns:
        The dict produced by ``makeOHE`` for the selected read.
    """
    selected_read = get_reads_dict(file)[read_id]
    return makeOHE(make_dna_list(selected_read))

In [None]:
# Position of the read to inspect within each file's read list; get_labels
# indexes the list by position, not by UUID.
read_id = 0

# Build the one-hot label dict (keys 'A', 'C', 'G', 'T') for that read from each
# of the four HDF5 files.
# NOTE(review): the names suggest noBase/base and noMap/map describe how each
# file was produced — confirm against the pipeline that wrote taiyakiOutputs.
# NOTE(review): the same index only refers to the same physical read in every
# file if all files list their reads in the same order — confirm.
labels_noBase_noMap = get_labels("./../../taiyakiOutputs/output_justfromfasta.hdf5", read_id)
labels_base_noMap   = get_labels("./../../taiyakiOutputs/output_flappie_nomap.hdf5", read_id)
labels_noBase_map   = get_labels("./../../taiyakiOutputs/output_createfasta.hdf5", read_id)
labels_base_map     = get_labels("./../../taiyakiOutputs/output_flappiealigned.hdf5", read_id)

In [None]:
def ohe_to_list(data):
    """Collapse a one-hot label dict into a flat list of base symbols.

    Args:
        data: Dict with the keys ``'T'``, ``'A'``, ``'C'`` and ``'G'``, each
            holding an indexable sequence. The length of ``data['T']`` decides
            how many positions are produced.

    Returns:
        A list with one symbol per position: the first of T, A, C, G (checked
        in that order) whose value equals 1 at that position, or ``'.'`` when
        none does.
    """
    lookup_order = ('T', 'A', 'C', 'G')
    symbols = []
    for pos in range(len(data['T'])):
        # next() stops at the first hit, so later letters are not looked up.
        symbols.append(
            next((base for base in lookup_order if data[base][pos] == 1), '.')
        )
    return symbols

def compare(s1, s2, include_offset, print_output):
    """Count position-wise mismatches between two symbol sequences.

    Only the overlapping prefix is compared; extra items of the longer
    sequence are ignored.

    Args:
        s1: First sequence of symbols.
        s2: Second sequence of symbols.
        include_offset: When False, positions where either symbol is ``'.'``
            are skipped entirely: they are neither counted nor printed.
        print_output: When True, print one ``RIGHT: a b`` or ``WRONG: a b``
            line for every position that was evaluated.

    Returns:
        The number of evaluated positions whose symbols differ.
    """
    mismatches = 0

    # zip stops at the end of the shorter sequence.
    for v1, v2 in zip(s1, s2):
        if not include_offset and (v1 == '.' or v2 == '.'):
            # Skipped positions must not print anything. The message is built
            # fresh for each evaluated position below, so a previous
            # position's line can never be repeated here.
            continue

        if v1 != v2:
            mismatches += 1
            verdict = "WRONG"
        else:
            verdict = "RIGHT"

        if print_output:
            print("{}: {} {}".format(verdict, v1, v2))
    return mismatches

In [None]:
# Collapse each one-hot label dict back into a flat list of base symbols
# ('.' at positions where no base is set).
data_noBase_noMap = ohe_to_list(labels_noBase_noMap)
data_base_noMap   = ohe_to_list(labels_base_noMap)
data_noBase_map   = ohe_to_list(labels_noBase_map)
data_base_map     = ohe_to_list(labels_base_map)

# Preview two fixed index windows of the first sequence.
# NOTE(review): the window positions look hand-picked for this particular read
# and will probably need adjusting for other reads or files — confirm.
print("Exmaple of sequence, before signal has bases:")
print(data_noBase_noMap[810:830])
print("Exmaple of sequence with labels:")
print(data_noBase_noMap[1000:1020])

In [None]:
#TESTS
# Hand-written checks for compare(). Each case is
# (title, s1, s2, include_offset, expected mismatch count).
# The printed report is kept, and an assert makes a regression stop the cell
# instead of being something the reader has to spot in the output.
compare_cases = [
    ('Test 1',
     ['.','.','.','A','T'],
     ['.','.','C','A','T'],
     True, 1),
    ('Test 2',
     ['.','.','.','A','T'],
     ['.','.','C','A','T'],
     False, 0),
    ('Test 3',
     ['.','.','.','A','T', 'G', 'C', 'A'],
     ['.','.','C','A','T', 'C', 'C', 'T'],
     False, 2),
    ('Test 4',
     ['.','.','.','A','T', 'G', 'C', 'A','A'],
     ['.','.','C','A','T', 'C', 'C', 'T','.','.','A'],
     True, 4),
    ('Test 5',
     ['.','.','.','A','T', 'G', 'C', 'T','A','T','G'],
     ['.','.','C','A','T', 'C', 'C', 'T','A','T','C'],
     False, 2),
]

for title, s1, s2, include_offset, expected in compare_cases:
    print(title)
    print(s1)
    print(s2)
    rs = compare(s1, s2, include_offset, True)
    print("Expected: {}, Actual: {}".format(expected, rs))
    assert rs == expected, "{}: expected {} mismatches, got {}".format(title, expected, rs)
    print("------------------------------------------------")

# Test 6: a real sequence compared with itself must give zero mismatches.
# The sequences are not echoed and per-position output is off because of their length.
print('Test 6')
s1 = data_noBase_noMap
s2 = data_noBase_noMap
rs = compare(s1, s2, True, False)
print("Expected: 0, Actual: {}".format(rs))
assert rs == 0, "Test 6: expected 0 mismatches, got {}".format(rs)
print("------------------------------------------------")

In [None]:
def get_shorter_list_length(l1,l2):
    """Return the length of whichever of the two sequences is shorter.

    Args:
        l1: First sized sequence.
        l2: Second sized sequence.

    Returns:
        ``len(l1)`` or ``len(l2)``, whichever is smaller.
    """
    return min(len(l1), len(l2))
    
def compare_sequences(s1, s2, include_offset, output):
    """Print how many positions of two sequences mismatch, with a percentage.

    Args:
        s1: First sequence of symbols.
        s2: Second sequence of symbols.
        include_offset: Forwarded to ``compare``.
        output: Forwarded to ``compare`` as its ``print_output`` flag.

    Returns:
        None. The summary line is printed.
    """
    mismatches = compare(s1,s2, include_offset, output)
    total = get_shorter_list_length(s1,s2)
    # With an empty sequence nothing is compared; report 0.0% instead of
    # raising ZeroDivisionError.
    # NOTE(review): total is the full overlap length, so it still includes
    # '.' positions even when include_offset is False — confirm that this
    # denominator is the intended one.
    percentage = mismatches / total * 100 if total else 0.0
    print("Missmatched: {} out of: {}, {}%".format(mismatches, total, percentage))

In [None]:
# Compare every pair of labelling variants. Positions where either sequence
# holds '.' are not evaluated (include_offset=False) and per-position output
# is off.
sequence_pairs = [
    ("No Base, No Map -------- Base, No Map", data_noBase_noMap, data_base_noMap),
    ("No Base, No Map -------- No Base, Map", data_noBase_noMap, data_noBase_map),
    ("No Base, No Map -------- Base, Map", data_noBase_noMap, data_base_map),
    ("Base, No Map -------- No Base, Map", data_base_noMap, data_noBase_map),
    ("Base, No Map -------- Base, Map", data_base_noMap, data_base_map),
    ("Base, Map -------- No Base, Map", data_base_map, data_noBase_map),
]

for pair_idx, (caption, left_seq, right_seq) in enumerate(sequence_pairs):
    if pair_idx > 0:
        # A blank line separates consecutive reports.
        print()
    print(caption)
    compare_sequences(left_seq, right_seq, False, False)