# Almeida 2k random eval


match_rank: order

In [1]:
# code to load contig reports
import glob
import csv
import os

def load_contig_reports(dirname):
    """Load every ``*.contigs.csv`` report found in ``dirname`` into one dict.

    Parameters
    ----------
    dirname : str
        Directory that is searched (non-recursively) for files whose names
        match ``*.contigs.csv``.

    Returns
    -------
    dict
        Maps ``(genomefile, contig_name)`` to the complete row for that
        contig, exactly as produced by ``csv.DictReader`` (column name ->
        string value). If the same key occurs more than once, the row read
        last wins; files are read in sorted filename order.

    Raises
    ------
    KeyError
        If a row has no ``genomefile`` or ``contig_name`` column.
    """
    # glob.glob() yields matches in arbitrary order; sorting makes both the
    # progress output and the resolution of duplicate keys reproducible.
    report_csvs = sorted(glob.glob(os.path.join(dirname, '*.contigs.csv')))
    d = {}
    for n, filename in enumerate(report_csvs):
        # Emit a progress line for every 100th file only.
        if n % 100 == 0:
            print(f'... loading {dirname} file {n} of {len(report_csvs)}')
        # newline='' lets the csv module do its own line-ending handling, so
        # quoted fields that contain embedded newlines are parsed intact.
        with open(filename, 'rt', newline='') as fp:
            for row in csv.DictReader(fp):
                key = (row['genomefile'], row['contig_name'])
                d[key] = row
    return d
    
# Read all contig reports from the Almeida eval output directory into a single
# dict keyed by (genomefile, contig_name); every later cell works from `almeida`.
almeida = load_contig_reports('../eval.output.almeida/')


... loading ../eval.output.almeida/ file 0 of 2000
... loading ../eval.output.almeida/ file 100 of 2000
... loading ../eval.output.almeida/ file 200 of 2000
... loading ../eval.output.almeida/ file 300 of 2000
... loading ../eval.output.almeida/ file 400 of 2000
... loading ../eval.output.almeida/ file 500 of 2000
... loading ../eval.output.almeida/ file 600 of 2000
... loading ../eval.output.almeida/ file 700 of 2000
... loading ../eval.output.almeida/ file 800 of 2000
... loading ../eval.output.almeida/ file 900 of 2000
... loading ../eval.output.almeida/ file 1000 of 2000
... loading ../eval.output.almeida/ file 1100 of 2000
... loading ../eval.output.almeida/ file 1200 of 2000
... loading ../eval.output.almeida/ file 1300 of 2000
... loading ../eval.output.almeida/ file 1400 of 2000
... loading ../eval.output.almeida/ file 1500 of 2000
... loading ../eval.output.almeida/ file 1600 of 2000
... loading ../eval.output.almeida/ file 1700 of 2000
... loading ../eval.output.almeida/ file

In [2]:
# calculate those with explicit "dirty" flags

# Keys of all contigs whose report carries the explicit DIRTY decision.
removed = {
    contig_key
    for contig_key, report in almeida.items()
    if report['decision'] == 'ContigInfo.DIRTY'
}
print(f'removed {len(removed)} of {len(almeida)}')


removed 2947 of 246531


In [4]:
import collections

def count_contig_info(d):
    """Tally contigs and base pairs per ``decision`` value.

    ``d`` maps arbitrary keys to rows; each row must provide a ``decision``
    entry and a ``bp`` entry that ``int()`` can parse.

    Returns a pair of ``collections.Counter`` objects, both keyed by the
    decision value: the first holds the number of rows, the second the
    summed ``bp``.
    """
    per_decision = collections.Counter()
    per_decision_bp = collections.Counter()

    # Only the rows matter here; the dict keys are never consulted.
    for report in d.values():
        label = report['decision']
        size = int(report['bp'])
        per_decision[label] += 1
        per_decision_bp[label] += size

    return per_decision, per_decision_bp

# Per-decision summary, most frequent decision first.
counts, counts_bp = count_contig_info(almeida)
print('contig info:')
for decision, n_contigs in counts.most_common():
    mbp = counts_bp[decision] / 1e6
    print(f'{decision} {n_contigs} contigs / {mbp:.1f} Mbp')


contig info:
ContigInfo.CLEAN 223120 contigs / 4983.3 Mbp
ContigInfo.NO_IDENT 15797 contigs / 82.5 Mbp
ContigInfo.NO_HASH 4667 contigs / 13.1 Mbp
ContigInfo.DIRTY 2947 contigs / 22.2 Mbp


In [5]:
import collections

def count_reasons(d):
    """Tally DIRTY contigs and their base pairs per ``reason`` code.

    Rows whose ``decision`` is not ``'ContigInfo.DIRTY'`` are skipped. For
    the remaining rows, ``reason`` and ``bp`` are parsed with ``int()``, and
    the reason code is asserted to be positive.

    Returns a pair of ``collections.Counter`` objects keyed by the integer
    reason code: the first holds the number of rows, the second the summed
    ``bp``.
    """
    per_reason = collections.Counter()
    per_reason_bp = collections.Counter()

    for report in d.values():
        # Guard clause: anything not flagged DIRTY is ignored.
        if report['decision'] != 'ContigInfo.DIRTY':
            continue

        code = int(report['reason'])
        assert code > 0
        size = int(report['bp'])
        per_reason[code] += 1
        per_reason_bp[code] += size

    return per_reason, per_reason_bp

# Per-reason summary for DIRTY contigs, most frequent reason first.
rcounts, rcounts_bp = count_reasons(almeida)
print('reasons:')
for reason, n_contigs in rcounts.most_common():
    mbp = rcounts_bp[reason] / 1e6
    print(f'{reason} {n_contigs} contigs / {mbp:.1f} Mbp')


reasons:
3 1649 contigs / 10.2 Mbp
1 1010 contigs / 9.3 Mbp
2 288 contigs / 2.7 Mbp
