Merge pull request #89 from dib-lab/feature/edge-sort-degree
Process edges in order of adjacent node degrees, prune edges from nodes with low degree
standage committed Jul 7, 2017
2 parents 13384a2 + a2fc307 commit 81caff4
Showing 24 changed files with 292 additions and 157 deletions.
49 changes: 40 additions & 9 deletions kevlar/assemble.py
@@ -11,6 +11,7 @@
from collections import defaultdict, namedtuple
import itertools
import sys
import pandas
import networkx
import screed
import khmer
@@ -83,6 +84,7 @@ def fetch_largest_overlapping_pair(graph, reads):
    Sort the edges using 4 criteria. The first two are the primary criteria,
    the other two ensure deterministic behavior.
        - aggregate degree of the edge's two nodes (largest first)
        - overlap (largest first)
        - lexicographically smaller read name
        - lexicographically larger read name
@@ -91,6 +93,7 @@ def fetch_largest_overlapping_pair(graph, reads):
        graph.edges(),
        reverse=True,
        key=lambda e: (
            sum(graph.degree([e[0], e[1]]).values()),
            graph[e[0]][e[1]]['overlap'],
            max(e),
            min(e),
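
A minimal sketch of the new ordering on a toy graph with made-up read names; it assumes the networkx 1.x API used throughout this module, where graph.degree([u, v]) returns a dict mapping each node to its degree.

import networkx

graph = networkx.Graph()
graph.add_edge('readA', 'readB', overlap=25)
graph.add_edge('readB', 'readC', overlap=40)
graph.add_edge('readC', 'readD', overlap=40)

edges = sorted(
    graph.edges(),
    reverse=True,
    key=lambda e: (
        sum(graph.degree([e[0], e[1]]).values()),  # aggregate endpoint degree first
        graph[e[0]][e[1]]['overlap'],              # then largest overlap
        max(e),                                    # then read names, purely for
        min(e),                                    # deterministic tie-breaking
    )
)
# ('readB', 'readC') sorts first: its endpoints have aggregate degree 2 + 2 = 4.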
@@ -154,6 +157,28 @@ def assemble_with_greed(reads, kmers, graph, ccindex, debugout=None):
        graph.remove_node(pair.head.name)


def prune_graph(graph, quant=0.1):
    """Discard edges whose adjacent nodes have a low aggregate degree.

    The degrees of an edge's two nodes are summed, and edges whose aggregate
    degree falls below the `quant` quantile of that distribution are removed.
    """
    edge_adj_deg = list()
    for edge in graph.edges_iter():
        degree_dict = graph.degree([edge[0], edge[1]])
        agg_degree = sum(degree_dict.values())
        edge_adj_deg.append(agg_degree)

    edges = pandas.DataFrame(edge_adj_deg)
    threshold = edges[0].quantile(quant)
    edges_to_drop = list()  # Don't remove edges while iterating through them
    for edge in graph.edges_iter():
        degree_dict = graph.degree([edge[0], edge[1]])
        agg_degree = sum(degree_dict.values())
        if agg_degree < threshold:
            edges_to_drop.append(edge)

    for edge in edges_to_drop:
        graph.remove_edge(edge[0], edge[1])

    return len(edges_to_drop)


def main(args):
    debugout = None
    if args.debug:
@@ -185,17 +210,23 @@ def main(args):
        message = '[kevlar::assemble] graph written to {}'.format(args.gml)
        print(message, file=args.logfile)

    num_ccs = networkx.number_connected_components(graph)
    if num_ccs > 1:
        message = 'assembly graph contains {:d}'.format(num_ccs)
        message += ' connected components; designated in the output by cc=N'
    edges_dropped = prune_graph(graph)
    cc_stream = networkx.connected_component_subgraphs(graph)
    ccs = [cc for cc in cc_stream if cc.number_of_edges() > 0]
    ccnodes = sum([cc.number_of_nodes() for cc in ccs])
    message = 'dropped {:d} edges'.format(edges_dropped)
    message += ', graph now has {:d} connected component(s)'.format(len(ccs))
    message += ', {:d} nodes'.format(ccnodes)
    message += ', and {:d} edges'.format(graph.number_of_edges())
    print('[kevlar::assemble]', message, file=args.logfile)
    if len(ccs) > 1:
        message = 'multiple connected components designated by cc=N in output'
        print('[kevlar::assemble] WARNING:', message, file=args.logfile)

    contigcount = 0
    unassembledcount = 0
    outstream = kevlar.open(args.out, 'w')
    cc_stream = networkx.connected_component_subgraphs(graph)
    for n, cc in enumerate(cc_stream, 1):
    for n, cc in enumerate(ccs, 1):
        assemble_with_greed(reads, kmers, cc, n, debugout)
        for seqname in cc.nodes():
            if seqname in inputreads:
@@ -205,9 +236,9 @@
            contigrecord = reads[seqname]
            kevlar.print_augmented_fastx(contigrecord, outstream)

    assembledcount = len(inputreads) - unassembledcount
    assembledcount = ccnodes - unassembledcount
    message = '[kevlar::assemble] assembled'
    message += ' {:d}/{:d} reads'.format(assembledcount, len(inputreads))
    message += ' from {:d} connected component(s)'.format(num_ccs)
    message += ' {:d}/{:d} reads'.format(assembledcount, ccnodes)
    message += ' from {:d} connected component(s)'.format(len(ccs))
    message += ' into {:d} contig(s)'.format(contigcount)
    print(message, file=args.logfile)
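
A minimal usage sketch of the new prune_graph helper on a toy read graph with made-up node names; it assumes the networkx 1.x and pandas APIs already imported by this module (graph.edges_iter() and dict-returning graph.degree()).

import networkx

toy = networkx.Graph()
toy.add_edge('r1', 'r2', overlap=30)
toy.add_edge('r2', 'r3', overlap=28)
toy.add_edge('r3', 'r4', overlap=35)
toy.add_edge('r4', 'r5', overlap=20)

# Aggregate endpoint degrees are 3, 4, 4, 3; with quant=0.5 the threshold is
# the median (3.5), so the two fringe edges (r1, r2) and (r4, r5) are dropped.
dropped = prune_graph(toy, quant=0.5)
print(dropped, toy.number_of_edges())  # 2 2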
2 changes: 1 addition & 1 deletion kevlar/cli/localize.py
@@ -17,7 +17,7 @@ def subparser(subparsers):
                           default=100, help='retrieve the genomic interval '
                           'from the reference by extending beyond the span '
                           'of all k-mer starting positions by D bp')
    subparser.add_argument('-o', '--out', metavar='FILE',
    subparser.add_argument('-o', '--out', metavar='FILE', default='-',
                           help='output file; default is terminal (stdout)')
    subparser.add_argument('-k', '--ksize', type=int, metavar='K', default=31,
                           help='k-mer size; default is 31')
2 changes: 1 addition & 1 deletion kevlar/localize.py
@@ -141,7 +141,7 @@ def extract_region(refr, seqid, start, end):


def main(args):
    output = args.out if args.out else sys.stdout
    output = kevlar.open(args.out, 'w')
    matchgen = get_exact_matches(args.contigs, args.refr, args.ksize)
    kmer_matches = [m for m in matchgen]
    if len(kmer_matches) == 0:
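
The two changes above make '-' the default for --out and route output through kevlar.open instead of the args.out-or-stdout check. Below is a hedged sketch of how an open-style helper can treat '-' as the terminal; kevlar.open is the project's real helper, but this particular body is an illustrative assumption, not its actual implementation.

import gzip
import sys

def open_output(filename, mode='w'):
    # Treat '-' (or no filename) as stdout; mirror gzip support for .gz paths.
    if filename in (None, '-'):
        return sys.stdout
    if filename.endswith('.gz'):
        return gzip.open(filename, mode + 't')
    return open(filename, mode)

out = open_output('-')
print('contig record', file=out)  # goes to the terminal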
99 changes: 59 additions & 40 deletions notebook/human-sim-pico/HumanSimulationPico.ipynb
@@ -191,29 +191,29 @@
"text": [
"[bwa_index] Pack FASTA... 0.03 sec\n",
"[bwa_index] Construct BWT for the packed sequence...\n",
"[bwa_index] 0.80 seconds elapse.\n",
"[bwa_index] 0.88 seconds elapse.\n",
"[bwa_index] Update BWT... 0.03 sec\n",
"[bwa_index] Pack forward-only FASTA... 0.02 sec\n",
"[bwa_index] Construct SA from BWT and Occ... 0.23 sec\n",
"[bwa_index] Construct SA from BWT and Occ... 0.25 sec\n",
"[main] Version: 0.7.15-r1140\n",
"[main] CMD: bwa index human.random.fa\n",
"[main] Real time: 1.131 sec; CPU: 1.114 sec\n",
"[main] Real time: 1.241 sec; CPU: 1.220 sec\n",
"\n",
"real\t0m1.144s\n",
"user\t0m1.077s\n",
"sys\t0m0.041s\n",
"real\t0m1.252s\n",
"user\t0m1.168s\n",
"sys\t0m0.054s\n",
"\n",
"real\t0m29.926s\n",
"user\t0m44.088s\n",
"sys\t0m0.666s\n",
"real\t0m30.560s\n",
"user\t0m44.665s\n",
"sys\t0m0.746s\n",
"\n",
"real\t0m30.516s\n",
"user\t0m45.150s\n",
"sys\t0m0.662s\n",
"real\t0m31.306s\n",
"user\t0m45.463s\n",
"sys\t0m0.726s\n",
"\n",
"real\t0m31.790s\n",
"user\t0m46.424s\n",
"sys\t0m0.661s\n"
"real\t0m31.313s\n",
"user\t0m45.860s\n",
"sys\t0m0.714s\n"
]
}
],
@@ -250,13 +250,13 @@
"[kevlar::counting] done loading reads; 295877 reads processed, 22485272 k-mers stored; estimated false positive rate is 0.008; saved to \"father.counttable\"\n",
"[kevlar::counting] loading sample from mother-reads-dump.fq.gz\n",
"[kevlar::counting] done loading reads; 295581 reads processed, 6951229 k-mers stored; estimated false positive rate is 0.180; saved to \"mother.counttable\"\n",
"[kevlar::count] 2 samples loaded in 79.09 sec\n",
"[kevlar::count] 2 samples loaded in 92.80 sec\n",
"[kevlar::count] Loading case samples\n",
"[kevlar::counting] computing k-mer abundances for 1 samples\n",
"[kevlar::counting] loading sample from proband-reads-dump.fq.gz\n",
"[kevlar::counting] done loading reads; 296034 reads processed, 6617158 k-mers stored; estimated false positive rate is 0.172; saved to \"proband.counttable\"\n",
"[kevlar::count] 1 sample(s) loaded in 49.21 sec\n",
"[kevlar::count] Total time: 128.30 seconds\n"
"[kevlar::count] 1 sample(s) loaded in 66.25 sec\n",
"[kevlar::count] Total time: 159.05 seconds\n"
]
}
],
@@ -294,16 +294,16 @@
"[kevlar::novel] INFO: counttables for 2 sample(s) provided, any corresponding FASTA/FASTQ input will be ignored for computing k-mer abundances\n",
"[kevlar::counting] loading sketchfile \"mother.counttable\"...done! estimated false positive rate is 0.180\n",
"[kevlar::counting] loading sketchfile \"father.counttable\"...done! estimated false positive rate is 0.008\n",
"[kevlar::novel] Control samples loaded in 0.38 sec\n",
"[kevlar::novel] Control samples loaded in 0.47 sec\n",
"[kevlar::novel] Loading case samples\n",
"[kevlar::novel] INFO: counttables for 1 samples provided, any corresponding FASTA/FASTQ input will be ignored for computing k-mer abundances\n",
"[kevlar::counting] loading sketchfile \"proband.counttable\"...done! estimated false positive rate is 0.172\n",
"[kevlar::novel] Case samples loaded in 0.06 sec\n",
"[kevlar::novel] All samples loaded in 0.45 sec\n",
"[kevlar::novel] Case samples loaded in 0.09 sec\n",
"[kevlar::novel] All samples loaded in 0.56 sec\n",
"[kevlar::novel] Iterating over reads from 1 case sample(s)\n",
"[kevlar::novel] Iterated over 296033 reads in 51.97 seconds\n",
"[kevlar::novel] Iterated over 296033 reads in 62.63 seconds\n",
"[kevlar::novel] Found 22684 instances of 1125 unique novel kmers in 938 reads\n",
"[kevlar::novel] Total time: 52.42 seconds\n"
"[kevlar::novel] Total time: 63.19 seconds\n"
]
}
],
@@ -338,11 +338,11 @@
"text": [
"[kevlar::filter] Loading reference genome from human.random.fa\n",
" 1 sequences and 2499976 k-mers consumed; estimated false positive rate is 0.000\n",
"[kevlar::filter] Reference genome loaded in 2.07 sec\n",
"[kevlar::filter] Reference genome loaded in 2.52 sec\n",
"[kevlar::filter] Loading input; recalculate k-mer abundances, de-duplicate reads and merge k-mers\n",
" - proband.novel.augfastq.gz\n",
" 938 instances of 938 reads consumed, annotated with 22684 instances of 1125 distinct \"interesting\" k-mers; estimated false positive rate is 0.000\n",
"[kevlar::filter] Input loaded in 0.57 sec\n",
"[kevlar::filter] Input loaded in 0.73 sec\n",
"[kevlar::filter] Validate k-mers and print reads\n",
" processed 22684 instances of 1125 distinct \"interesting\" k-mers in 938 reads\n",
" 838 instances of 95 distinct k-mers masked by the reference genome\n",
@@ -351,8 +351,8 @@
" 407 reads with no surviving valid k-mers ignored\n",
" 0 contaminant reads discarded\n",
" 531 reads written to output\n",
"[kevlar::filter] k-mers validated and reads printed in 1.30 sec\n",
"[kevlar::filter] Total time: 3.93 seconds\n"
"[kevlar::filter] k-mers validated and reads printed in 1.59 sec\n",
"[kevlar::filter] Total time: 4.84 seconds\n"
]
}
],
@@ -386,13 +386,13 @@
"output_type": "stream",
"text": [
"[kevlar::partition] Loading reads from proband.novel.filtered.augfastq.gz\n",
"[kevlar::partition] Reads loaded in 0.49 sec\n",
"[kevlar::partition] Reads loaded in 0.70 sec\n",
"[kevlar::partition] Building read graph in relaxed mode (shared novel k-mer required)\n",
"[kevlar::partition] Graph built in 0.85 sec\n",
"[kevlar::partition] Graph built in 1.09 sec\n",
"[kevlar::partition] Writing output to prefix part\n",
"[kevlar::overlap] grouped 531 reads into 10 connected components\n",
"[kevlar::partition] Output written in 0.57 sec\n",
"[kevlar::partition] Total time: 1.42 seconds\n"
"[kevlar::partition] Total time: 1.66 seconds\n"
]
}
],
@@ -428,61 +428,71 @@
"==== iter 0 ====\n",
"[kevlar::assemble] loaded 129 reads and 414 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 126 nodes and 1477 edges\n",
"[kevlar::assemble] assembled 105/129 reads from 1 connected component(s) into 10 contig(s)\n",
"[kevlar::assemble] dropped 146 edges, graph now has 1 connected component(s), 111 nodes, and 1331 edges\n",
"[kevlar::assemble] assembled 94/111 reads from 1 connected component(s) into 7 contig(s)\n",
"\n",
"\n",
"==== iter 1 ====\n",
"[kevlar::assemble] loaded 77 reads and 182 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 75 nodes and 971 edges\n",
"[kevlar::assemble] assembled 65/77 reads from 1 connected component(s) into 7 contig(s)\n",
"[kevlar::assemble] dropped 90 edges, graph now has 1 connected component(s), 69 nodes, and 881 edges\n",
"[kevlar::assemble] assembled 53/69 reads from 1 connected component(s) into 4 contig(s)\n",
"\n",
"\n",
"==== iter 2 ====\n",
"[kevlar::assemble] loaded 66 reads and 175 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 62 nodes and 600 edges\n",
"[kevlar::assemble] assembled 56/66 reads from 1 connected component(s) into 5 contig(s)\n",
"[kevlar::assemble] dropped 59 edges, graph now has 1 connected component(s), 56 nodes, and 541 edges\n",
"[kevlar::assemble] assembled 49/56 reads from 1 connected component(s) into 2 contig(s)\n",
"\n",
"\n",
"==== iter 3 ====\n",
"[kevlar::assemble] loaded 42 reads and 85 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 42 nodes and 541 edges\n",
"[kevlar::assemble] assembled 37/42 reads from 1 connected component(s) into 1 contig(s)\n",
"[kevlar::assemble] dropped 46 edges, graph now has 1 connected component(s), 39 nodes, and 495 edges\n",
"[kevlar::assemble] assembled 37/39 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 4 ====\n",
"[kevlar::assemble] loaded 26 reads and 25 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 26 nodes and 225 edges\n",
"[kevlar::assemble] assembled 21/26 reads from 1 connected component(s) into 1 contig(s)\n",
"[kevlar::assemble] dropped 21 edges, graph now has 1 connected component(s), 21 nodes, and 204 edges\n",
"[kevlar::assemble] assembled 21/21 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 5 ====\n",
"[kevlar::assemble] loaded 21 reads and 22 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 21 nodes and 175 edges\n",
"[kevlar::assemble] assembled 19/21 reads from 1 connected component(s) into 2 contig(s)\n",
"[kevlar::assemble] dropped 10 edges, graph now has 1 connected component(s), 19 nodes, and 165 edges\n",
"[kevlar::assemble] assembled 18/19 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 6 ====\n",
"[kevlar::assemble] loaded 21 reads and 24 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 19 nodes and 100 edges\n",
"[kevlar::assemble] assembled 16/21 reads from 1 connected component(s) into 1 contig(s)\n",
"[kevlar::assemble] dropped 8 edges, graph now has 1 connected component(s), 15 nodes, and 92 edges\n",
"[kevlar::assemble] assembled 14/15 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 7 ====\n",
"[kevlar::assemble] loaded 20 reads and 24 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 18 nodes and 117 edges\n",
"[kevlar::assemble] assembled 18/20 reads from 1 connected component(s) into 2 contig(s)\n",
"[kevlar::assemble] dropped 10 edges, graph now has 1 connected component(s), 16 nodes, and 107 edges\n",
"[kevlar::assemble] assembled 15/16 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 8 ====\n",
"[kevlar::assemble] loaded 16 reads and 25 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 15 nodes and 88 edges\n",
"[kevlar::assemble] assembled 15/16 reads from 1 connected component(s) into 2 contig(s)\n",
"[kevlar::assemble] dropped 7 edges, graph now has 1 connected component(s), 14 nodes, and 81 edges\n",
"[kevlar::assemble] assembled 12/14 reads from 1 connected component(s) into 1 contig(s)\n",
"\n",
"\n",
"==== iter 9 ====\n",
"[kevlar::assemble] loaded 16 reads and 53 interesting k-mers\n",
"[kevlar::assemble] initialized \"shared interesting k-mers\" graph with 15 nodes and 68 edges\n",
"[kevlar::assemble] assembled 13/16 reads from 1 connected component(s) into 1 contig(s)\n"
"[kevlar::assemble] dropped 6 edges, graph now has 1 connected component(s), 14 nodes, and 62 edges\n",
"[kevlar::assemble] assembled 12/14 reads from 1 connected component(s) into 1 contig(s)\n"
]
}
],
@@ -512,6 +522,15 @@
" cmd = \"gunzip -c part.cc{i}.asmbl.augfasta.gz | grep -v '#$' > part.cc{i}.fa\".format(i=i)\n",
" subprocess.check_call(cmd, shell=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
