In [1]:
# Confirm the number of cigars vs samtools mapped reads value (987)

def printNumMapped(filename):

    inputfile = open(filename, "r")


    unmapped_count = 0
    mapped_count = 0
    cs_count = 0

    for line in inputfile:

        # Header lines start with '@'
        if line[0] is "@":
            continue

        fields = line.split()

        cigar = fields[5]


        if fields[5] is "*": 
            unmapped_count += 1
        else: 
            mapped_count += 1
            #if len(fields) > 20:
                #print (fields[20])

        i = 10
        while i < len(fields):
            entry = fields[i]
            if "cs:Z:" in entry: 
                cs_count += 1
                #print (entry)
            i += 1

    print (filename)
    print (unmapped_count)
    print (mapped_count)
    print (cs_count)
    print ("")
    
    return mapped_count

In [2]:
import re

def count_penalties(cs_string):
    mismatches = 0
    insertions = 0
    deletions = 0
    matches = 0
    
    fields = re.split('(\W)', cs_string)
    
    i = 1
    while i < len(fields):
        
        penalty_type = fields[i]
        penalty_info = fields[i+1]
        
        # Mismatch
        if penalty_type is "*":
            mismatches += 1
        # Insertion
        elif penalty_type is "+":
            insertions += 1
        # Deletion
        elif penalty_type is "-":
            deletions += 1
        # Match
        elif penalty_type == ":":
            matches += int(penalty_info)
            #print (penalty_info)
        
        i += 2
        
    return mismatches, insertions, deletions, matches
    

In [3]:
def summarize_penalties(filename):

    inputfile = open(filename, "r")

    total_bases = 0
    num_mismatches = 0
    num_insertions = 0
    num_deletions = 0
    num_matches = 0

    for line in inputfile:

        # Header lines start with '@'
        if line[0] is "@":
            continue

        fields = line.split()
        seq = fields[9]

        # Find cs string, if present
        # If not present, the read was not mapped
        cs_string = "None"
        i = 10
        while i < len(fields):
            entry = fields[i]
            if "cs:Z:" in entry: 
                cs_string = entry
                break
            i += 1

        # If read is mapped, count bases and penalties
        if cs_string is not "None":
            total_bases += len(seq)

            accuracy_info = cs_string.split("cs:Z:")[1]

            mis, ins, delet, match = count_penalties(accuracy_info)

            num_mismatches += mis
            num_insertions += ins
            num_deletions += delet
            num_matches += match



    # Error Rates
    print ("Mismatch Rate: " + str(float(num_mismatches)/float(total_bases)))
    print ("Insertion Rate: " + str(float(num_insertions)/float(total_bases)))
    print ("Deletion Rate: " + str(float(num_deletions)/float(total_bases)))

    # This considers the length of insertions and deletions
    print ()
    print ("Num Matches: " + str(num_matches))
    print ("Total Bases: " + str(total_bases))
    print ("Match Rate: " + str(float(num_matches)/float(total_bases)))

    # This considers just the amount of insertions and deletions
    all_types = num_deletions + num_insertions + num_mismatches + num_matches
    match_rate_v2 = num_matches / float(all_types)
    print ()
    print ("Match Rate V2: " + str(match_rate_v2))
    
    return (all_types, num_matches, num_mismatches)


In [5]:
def generateSummary(chrval):
    
    filename = "resnext_card8_mapped_results/" + chrval + "_resnet.sam"
    numMapped = printNumMapped(filename)
    numBases, numMatches, numMismatches = summarize_penalties(filename)

    print (numMapped)
    return (numMapped, numBases, numMatches, numMismatches)


In [6]:
mappedSum = 0
totalBases = 0
totalMatches = 0
totalMismatches = 0

mapped, bases, matched, mismatched = generateSummary("chr1")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched

generateSummary("chr2")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched
generateSummary("chr3")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched
generateSummary("chr4")
mappedSum += mapped; totalBases += bases; totalMatches += matched; totalMismatches += mismatched



print ("Final Sums:")
print (mappedSum)
print (totalBases)
print (totalMatches)
print (totalMismatches)

resnext_card8_mapped_results/chr1_resnet.sam
4743
90
90

Mismatch Rate: 0.06112205266456694
Insertion Rate: 0.014587523831752417
Deletion Rate: 0.05414495474917327

Num Matches: 357935
Total Bases: 762638
Match Rate: 0.46933800833422934

Match Rate V2: 0.7832841321145728
90
resnext_card8_mapped_results/chr2_resnet.sam
4725
151
151

Mismatch Rate: 0.05550245859238134
Insertion Rate: 0.013137171570710732
Deletion Rate: 0.049202616088443944

Num Matches: 450108
Total Bases: 1040026
Match Rate: 0.4327853342127985

Match Rate V2: 0.7859855727674198
151
resnext_card8_mapped_results/chr3_resnet.sam
4749
95
95

Mismatch Rate: 0.06186150903831868
Insertion Rate: 0.013822226202957803
Deletion Rate: 0.05749118823200898

Num Matches: 372696
Total Bases: 759212
Match Rate: 0.4908984578747438

Match Rate V2: 0.7866037433200226
95
resnext_card8_mapped_results/chr4_resnet.sam
4752
78
78

Mismatch Rate: 0.06512429666410087
Insertion Rate: 0.014257797677771876
Deletion Rate: 0.0614260236993029

Num Matc

In [10]:
print ("Match rate")
1431740/1827868

Match rate


0.7832841321145728

In [11]:
print ("Mismatch rate")
186456/1827868

Mismatch rate


0.1020073659585922

In [12]:
print ("Indel rate")
(1827868 - 186456 - 1431740)/1827868

Indel rate


0.11470850192683497