In [13]:
# Confirm the number of cigars vs samtools mapped reads value (987)

def printNumMapped(filename):

    inputfile = open(filename, "r")


    unmapped_count = 0
    mapped_count = 0
    cs_count = 0

    for line in inputfile:

        # Header lines start with '@'
        if line[0] is "@":
            continue

        fields = line.split()

        cigar = fields[5]


        if fields[5] is "*": 
            unmapped_count += 1
        else: 
            mapped_count += 1
            #if len(fields) > 20:
                #print (fields[20])

        i = 10
        while i < len(fields):
            entry = fields[i]
            if "cs:Z:" in entry: 
                cs_count += 1
                #print (entry)
            i += 1

    print (filename)
    print (unmapped_count)
    print (mapped_count)
    print (cs_count)
    print ("")
    
    return mapped_count

In [2]:
import re

def count_penalties(cs_string):
    mismatches = 0
    insertions = 0
    deletions = 0
    matches = 0
    
    fields = re.split('(\W)', cs_string)
    
    i = 1
    while i < len(fields):
        
        penalty_type = fields[i]
        penalty_info = fields[i+1]
        
        # Mismatch
        if penalty_type is "*":
            mismatches += 1
        # Insertion
        elif penalty_type is "+":
            insertions += 1
        # Deletion
        elif penalty_type is "-":
            deletions += 1
        # Match
        elif penalty_type == ":":
            matches += int(penalty_info)
            #print (penalty_info)
        
        i += 2
        
    return mismatches, insertions, deletions, matches
    

In [21]:
def summarize_penalties(filename):

    inputfile = open(filename, "r")

    total_bases = 0
    num_mismatches = 0
    num_insertions = 0
    num_deletions = 0
    num_matches = 0

    for line in inputfile:

        # Header lines start with '@'
        if line[0] is "@":
            continue

        fields = line.split()
        seq = fields[9]

        # Find cs string, if present
        # If not present, the read was not mapped
        cs_string = "None"
        i = 10
        while i < len(fields):
            entry = fields[i]
            if "cs:Z:" in entry: 
                cs_string = entry
                break
            i += 1

        # If read is mapped, count bases and penalties
        if cs_string is not "None":
            total_bases += len(seq)

            accuracy_info = cs_string.split("cs:Z:")[1]

            mis, ins, delet, match = count_penalties(accuracy_info)

            num_mismatches += mis
            num_insertions += ins
            num_deletions += delet
            num_matches += match



    # Error Rates
    print ("Mismatch Rate: " + str(float(num_mismatches)/float(total_bases)))
    print ("Insertion Rate: " + str(float(num_insertions)/float(total_bases)))
    print ("Deletion Rate: " + str(float(num_deletions)/float(total_bases)))

    # This considers the length of insertions and deletions
    print ()
    print ("Num Matches: " + str(num_matches))
    print ("Total Bases: " + str(total_bases))
    print ("Match Rate: " + str(float(num_matches)/float(total_bases)))

    # This considers just the amount of insertions and deletions
    all_types = num_deletions + num_insertions + num_mismatches + num_matches
    match_rate_v2 = num_matches / float(all_types)
    print ()
    print ("Match Rate V2: " + str(match_rate_v2))
    
    return (all_types, num_matches, num_mismatches)


In [17]:
def generateSummary(chrval):
    
    filename = "resnet_mapped_results/" + chrval + "_resnet.sam"
    numMapped = printNumMapped(filename)
    numBases, numMatches, numMismatches = summarize_penalties(filename)

    print (numMapped)
    return (numMapped, numBases, numMatches, numMismatches)


In [27]:
mappedSum = 0
totalBases = 0
totalMatches = 0
totalMismatches = 0

mapped, bases, matched, mismatched = generateSummary("chr1")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched

generateSummary("chr2")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched
generateSummary("chr3")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched
generateSummary("chr4")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched



print ("Final Sums:")
print (mappedSum)
print (totalBases)
print (totalMatches)
print (totalMismatches)

resnet_mapped_results/chr1_resnet.sam
3907
987
987

Mismatch Rate: 0.05905077501548554
Insertion Rate: 0.010151082384109198
Deletion Rate: 0.05584826535181907

Num Matches: 2050882
Total Bases: 3927857
Match Rate: 0.5221376440130076

Match Rate V2: 0.8067792236299601
987
resnet_mapped_results/chr2_resnet.sam
3899
993
993

Mismatch Rate: 0.06454662115877607
Insertion Rate: 0.0106652886669997
Deletion Rate: 0.058480979715884784

Num Matches: 2205329
Total Bases: 4011612
Match Rate: 0.5497363653314428

Match Rate V2: 0.8043793288209704
993
resnet_mapped_results/chr3_resnet.sam
3940
934
934

Mismatch Rate: 0.061069044958501205
Insertion Rate: 0.01009935022356606
Deletion Rate: 0.05528684163977301

Num Matches: 1946893
Total Bases: 3742518
Match Rate: 0.5202093884384791

Match Rate V2: 0.8044500473936782
934
resnet_mapped_results/chr4_resnet.sam
3947
824
824

Mismatch Rate: 0.05879058342354297
Insertion Rate: 0.010436400024443902
Deletion Rate: 0.05416070119076719

Num Matches: 1887208
Tota

In [31]:
print ("Match Rate: ")
print (8203528 / 10168244.0)

Match Rate: 
0.8067792236299601


In [32]:
print ("Mismatch Rate: ")
print (927772 / 10168244.0)

Mismatch Rate: 
0.09124210630665433


In [33]:
print ("Indel Rate")
print ((10168244 - 8203528 - 927772) / 10168244.0)

Indel Rate
0.10197867006338558
