Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Chop #188

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ jobs:

- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.3--py310h6cc9453_0
docker tag quay.io/biocontainers/odgi:0.8.3--py310h6cc9453_0 odgi
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
susan-garry marked this conversation as resolved.
Show resolved Hide resolved
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
Expand Down Expand Up @@ -76,6 +76,32 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rust-lang/setup-rust-toolchain@v1

# Install slow-odgi.
- uses: actions/cache@v4
id: cache-uv
with:
path: ~/.cache/uv
key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
- name: Create and activate virtualenv with uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv
echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
- name: Install Python tools
run: uv pip install -r requirements.txt

# Install odgi
- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
cp .github/odgi.sh $HOME/.local/bin/odgi
chmod a+x $HOME/.local/bin/odgi

# Install Turnt.
- uses: actions/setup-python@v5
with:
Expand Down
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,29 @@ endif
tests/%.gfa:
curl -Lo ./$@ $(GFA_URL)/$*.gfa

tests/%.og: tests/%.gfa
odgi build -g $< -o $@

.PHONY: fetch
fetch: $(TEST_FILES:%=tests/%.gfa)

fetch-og: $(TEST_FILES:%=tests/%.og)

.PHONY: test-slow-odgi
test-slow-odgi: fetch
make -C slow_odgi test

.PHONY: test-flatgfa
test-flatgfa:
test-flatgfa: fetch
cd flatgfa ; cargo build

turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa

-turnt --save -v -e chop_oracle_fgfa tests/*.gfa
turnt -v -e flatgfa_chop tests/*.gfa

-turnt --save -v -e odgi_depth tests/*.gfa
turnt -v -e flatgfa_depth tests/*.gfa

clean:
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.gfa tests/*.og
3 changes: 1 addition & 2 deletions bench/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def hyperfine(cmds):
"hyperfine",
"--export-json",
tmp.name,
"--shell=none",
"--warmup=1",
"--min-runs=3",
"--max-runs=16",
Expand Down Expand Up @@ -223,7 +222,7 @@ def run_bench(graph_set, mode, tools, out_csv):
for tool in tools:
assert tool in runner.config["modes"][mode]["cmd"], f"unknown tool {tool}"
else:
# Use all supported tools by deefault.
# Use all supported tools by default.
tools = list(runner.config["modes"][mode]["cmd"].keys())

# Fetch all the graphs and convert them to both odgi and FlatGFA.
Expand Down
9 changes: 7 additions & 2 deletions bench/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,14 @@ convert = false
cmd.flatgfa = '{fgfa} -I {files[gfa]}'
cmd.slow_odgi = '{slow_odgi} norm {files[gfa]}'
cmd.odgi = '{odgi} view -g -i {files[gfa]}'
cmd.gfatools = "{gfatools} view {files[gfa]}"
cmd.gfatools = '{gfatools} view {files[gfa]}'

[modes.depth]
cmd.flatgfa = '{fgfa} -i {files[flatgfa]} depth'
cmd.odgi = '{odgi} depth -i {files[og]} -d'
cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}'
cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}'

[modes.chop]
cmd.flatgfa = '{fgfa} -i {files[flatgfa]} chop -c 3'
cmd.odgi = '{odgi} chop -i {files[og]} -c 3 -o -'
cmd.slow_odgi = '{slow_odgi} chop {files[gfa]} -n 3'
181 changes: 178 additions & 3 deletions flatgfa/src/cmds.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::flatgfa::{self, Handle, Segment};
use crate::pool::{self, Id, Store};
use crate::flatgfa::{self, Handle, Segment, Path, Orientation, Link};
use crate::pool::{self, Id, Span, Store};
use crate::{GFAStore, HeapFamily};
use argh::FromArgs;
use std::collections::{HashMap, HashSet};

Expand Down Expand Up @@ -201,7 +202,7 @@ impl<'a> SubgraphBuilder<'a> {

/// Add a single subpath from the given path to the subgraph.
fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) {
let steps = pool::Span::new(start.step, self.store.steps.next_id());
let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id?
let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos);
self.store
.add_path(name.as_bytes(), steps, std::iter::empty());
Expand Down Expand Up @@ -245,6 +246,7 @@ impl<'a> SubgraphBuilder<'a> {

/// Translate a handle from the source graph to this subgraph.
fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle {
// TODO: is this just generating the handle or should we add it to the new graph?
flatgfa::Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient())
}

Expand Down Expand Up @@ -318,3 +320,176 @@ pub fn depth(gfa: &flatgfa::FlatGFA) {
);
}
}


/// chop the segments in a graph into sizes of N or smaller
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand, name = "chop")]
pub struct Chop {
/// maximimum segment size.
// Use c in keeping with odgi convention
#[argh(option, short = 'c')]
c: usize,

/// compute new links
#[argh(switch, short = 'l')]
l: bool,
}

/// Chop a graph into segments of size no larger than c
/// By default, compact node ids
/// CIGAR strings, links, and optional Segment data are invalidated by chop
/// Generates a new graph, rather than modifying the old one in place
pub fn chop<'a>(
gfa: &'a flatgfa::FlatGFA<'a>,
args: Chop,
) -> Result<flatgfa::HeapGFAStore, &'static str> {

let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
// seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1
let mut seg_map: Vec<Span<Segment>> = Vec::new();
// The smallest id (>0) which does not already belong to a segment in `flat`
let mut max_node_id = 1;

fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span<Segment>) {
// Link segments spanned by `span` from head to tail
let overlap = Span::new_empty();
flat.add_links(
(span.start.index()..span.end.index()-1).map(|idx| {
Link {
from: Handle::new(Id::new(idx), Orientation::Forward),
to: Handle::new(Id::new(idx+1), Orientation::Forward),
overlap
}
})
);
}

// Add new, chopped segments
for seg in gfa.segs.all().iter() {
let len = seg.len();
if len <= args.c {
// Leave the segment as is
let id = flat.segs.add(Segment {
name: max_node_id,
seq: seg.seq,
optional: Span::new_empty()
// TODO: Optional data may stay valid when seg not chopped?
});
max_node_id += 1;
seg_map.push(Span::new(id, flat.segs.next_id()));
}
else {
let seq_end = seg.seq.end;
let mut offset = seg.seq.start.index();
let segs_start = flat.segs.next_id();
// Could also generate end_id by setting it equal to the start_id and
// updating it for each segment that is added - only benefits us if we
// don't unroll the last iteration of this loop
while offset < seq_end.index() - args.c {
// Generate a new segment of length c
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), Id::new(offset + args.c)),
optional: Span::new_empty()
});
offset += args.c;
max_node_id += 1;
}
// Generate the last segment
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), seq_end),
optional: Span::new_empty()
});
max_node_id += 1;
let new_seg_span = Span::new(segs_start, flat.segs.next_id());
seg_map.push(new_seg_span);
if args.l {
link_forward(&mut flat, &new_seg_span);
}
}
}

// For each path, add updated handles. Then add the updated path
for path in gfa.paths.all().iter() {
let path_start = flat.steps.next_id();
let mut path_end = flat.steps.next_id();
// Generate the new handles
// Tentative to-do: see if it is faster to read Id from segs than to re-generate it?
for step in gfa.get_path_steps(path) {
let range = {
let span = seg_map[step.segment().index()];
std::ops::Range::from(span)
};
match step.orient() {
Orientation::Forward => {
// In this builder, Id.index() == seg.name - 1 for all seg
path_end = flat.add_steps(
range.map(|idx| {
Handle::new(
Id::new(idx),
Orientation::Forward
)
})
).end;
},
Orientation::Backward => {
path_end = flat.add_steps(
range.rev().map(|idx| {
Handle::new(
Id::new(idx),
Orientation::Backward
)
})
).end;
}
}
}

// Add the updated path
flat.paths.add(Path{
name: path.name,
steps: Span::new(path_start, path_end),
overlaps: Span::new_empty()
});
}

// If the 'l' flag is specified, compute the links in the new graph
if args.l {
// For each link in the old graph, from handle A -> B:
// Add a link from
// (A.forward ? (A.end, forward) : (A.begin, backwards))
// -> (B.forward ? (B.begin, forward) : (B.end ? backwards))

for link in gfa.links.all().iter() {
let new_from = {
let old_from = link.from;
let chopped_segs = seg_map[old_from.segment().index()];
let seg_id = match old_from.orient() {
Orientation::Forward => chopped_segs.end - 1,
Orientation::Backward => chopped_segs.start
};
Handle::new(seg_id, old_from.orient())
};
let new_to = {
let old_to = link.to;
let chopped_segs = seg_map[old_to.segment().index()];
let seg_id = match old_to.orient() {
Orientation::Forward => chopped_segs.start,
Orientation::Backward => chopped_segs.end - 1,
};
Handle::new(seg_id, old_to.orient())
};
flat.add_link(
new_from,
new_to,
vec![]
);
}
}

Ok(flat)
}
19 changes: 17 additions & 2 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ pub struct Path {
pub overlaps: Span<Span<AlignOp>>,
}

impl Path {
pub fn step_count(&self) -> usize {
self.steps.end.index() - self.steps.start.index()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think self.steps.len() does this. (In fact, maybe we don't need path.step_count() if path.steps.len() is one character shorter? 😃)

}
}

/// An allowed edge between two oriented segments.
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
Expand Down Expand Up @@ -265,6 +271,10 @@ impl<'a> FlatGFA<'a> {
self.name_data[path.name].as_ref()
}

pub fn get_path_steps(&self, path: &Path) -> impl Iterator<Item = &Handle> {
self.steps[path.steps].iter()
}

/// Get a handle's associated segment.
pub fn get_handle_seg(&self, handle: Handle) -> &Segment {
&self.segs[handle.segment()]
Expand All @@ -277,8 +287,8 @@ impl<'a> FlatGFA<'a> {

/// Look up a CIGAR alignment.
pub fn get_alignment(&self, overlap: Span<AlignOp>) -> Alignment {
Alignment {
ops: &self.alignment[overlap],
Alignment {
ops: &self.alignment[overlap]
Comment on lines +290 to +291
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please run cargo fmt? It should get rid of formatting "noise" changes like this one.

}
}

Expand Down Expand Up @@ -353,6 +363,11 @@ impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> {
self.steps.add(step)
}

/// Add a sequence of links.
pub fn add_links(&mut self, links: impl Iterator<Item = Link>) -> Span<Link> {
self.links.add_iter(links)
}

/// Add a link between two (oriented) segments.
pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Id<Link> {
self.links.add(Link {
Expand Down
Loading
Loading