-
Notifications
You must be signed in to change notification settings - Fork 31
Bump utils-java dependency. #206
Changes from all commits
461631e
6830ef0
c1227e9
29d41c3
72c9d7d
adaec25
5bee359
70deb62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,6 @@ | |
import com.google.cloud.genomics.utils.grpc.VariantEmitterStrategy; | ||
import com.google.cloud.genomics.utils.grpc.VariantMergeStrategy; | ||
import com.google.cloud.genomics.utils.grpc.VariantStreamIterator; | ||
import com.google.cloud.genomics.utils.grpc.VariantUtils; | ||
import com.google.common.base.Preconditions; | ||
import com.google.common.collect.Iterables; | ||
import com.google.genomics.v1.StreamVariantsRequest; | ||
|
@@ -105,7 +104,6 @@ public PCollection<Variant> apply(PCollection<Variant> input) { | |
return input | ||
.apply(ParDo.of(new BinVariantsFn())) | ||
.apply(GroupByKey.<KV<String, Long>, Variant>create()) | ||
.apply(ParDo.of(new RetrieveWindowOfVariantsFn())) | ||
.apply(ParDo.of(new CombineVariantsFn())); | ||
} | ||
|
||
|
@@ -125,34 +123,23 @@ public static final long getEndBin(int binSize, Variant variant) { | |
public void processElement(ProcessContext context) { | ||
Options options = | ||
context.getPipelineOptions().as(Options.class); | ||
int binSize = options.getBinSize(); | ||
Variant variant = context.element(); | ||
long startBin = getStartBin(options.getBinSize(), variant); | ||
long endBin = | ||
VariantUtils.IS_NON_VARIANT_SEGMENT.apply(variant) ? getEndBin(options.getBinSize(), | ||
variant) : startBin; | ||
for (long bin = startBin; bin <= endBin; bin++) { | ||
context.output(KV.of(KV.of(variant.getReferenceName(), bin), variant)); | ||
} | ||
} | ||
} | ||
|
||
static final class RetrieveWindowOfVariantsFn extends | ||
DoFn<KV<KV<String, Long>, Iterable<Variant>>, Iterable<Variant>> { | ||
|
||
@Override | ||
public void processElement(ProcessContext context) { | ||
|
||
// The upper bound on number of variants in the iterable is dependent upon the binSize | ||
// used in the prior step to construct the key. | ||
KV<KV<String, Long>, Iterable<Variant>> kv = context.element(); | ||
context.output(kv.getValue()); | ||
long startBin = getStartBin(binSize, variant); | ||
long endBin = getEndBin(binSize, variant); | ||
for (long bin = startBin; bin <= endBin; bin++) { | ||
context.output(KV.of(KV.of(variant.getReferenceName(), bin * binSize), variant)); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Use this transform when working with a collection of sites across the genome. | ||
* | ||
* It passes the data onto the next step retaining the ordering imposed by the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/ordering/order |
||
* Google Genomics API which is sorted by (variantset id, contig, start pos, variant id). | ||
* | ||
* The amount of RAM needed during the combine step is controlled by the number of | ||
* base pairs between the start and end position of each site. | ||
* Compared to the BinShuffleAndCombineTransform, this transform has: | ||
|
@@ -183,7 +170,7 @@ public PCollection<Variant> apply(PCollection<StreamVariantsRequest> input) { | |
.apply(ParDo.of(new CombineVariantsFn())); | ||
} | ||
|
||
static final class RetrieveFn extends DoFn<StreamVariantsRequest, Iterable<Variant>> { | ||
public static final class RetrieveFn extends DoFn<StreamVariantsRequest, KV<KV<String, Long>, Iterable<Variant>>> { | ||
private final OfflineAuth auth; | ||
private String fields; | ||
|
||
|
@@ -194,44 +181,55 @@ public RetrieveFn(OfflineAuth auth, String fields) { | |
} | ||
|
||
@Override | ||
public void processElement(DoFn<StreamVariantsRequest, Iterable<Variant>>.ProcessContext context) | ||
public void processElement(DoFn<StreamVariantsRequest, KV<KV<String, Long>, Iterable<Variant>>>.ProcessContext context) | ||
throws Exception { | ||
StreamVariantsRequest request = context.element(); | ||
|
||
Iterator<StreamVariantsResponse> iter = VariantStreamIterator.enforceShardBoundary(auth, context.element(), | ||
ShardBoundary.Requirement.NON_VARIANT_OVERLAPS, fields); | ||
Iterator<StreamVariantsResponse> iter = VariantStreamIterator.enforceShardBoundary(auth, request, | ||
ShardBoundary.Requirement.OVERLAPS, fields); | ||
|
||
if (iter.hasNext()) { | ||
// We do have some data overlapping this site. | ||
List<Iterable<Variant>> allVariantsForRequest = new ArrayList<>(); | ||
while (iter.hasNext()) { | ||
allVariantsForRequest.add(iter.next().getVariantsList()); | ||
} | ||
context.output(Iterables.concat(allVariantsForRequest)); | ||
context.output(KV.of(KV.of(request.getReferenceName(), request.getStart()), Iterables.concat(allVariantsForRequest))); | ||
} | ||
} | ||
} | ||
} | ||
|
||
public static final class CombineVariantsFn extends DoFn<Iterable<Variant>, Variant> { | ||
/** | ||
* Pass a window of variants from Dataflow to the merge strategy implementation. | ||
* | ||
* See {@link VariantMergeStrategy} for more detail. | ||
*/ | ||
public static final class CombineVariantsFn extends DoFn<KV<KV<String, Long>, Iterable<Variant>>, Variant> { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why the extra When adding the JavaDoc, it would be nice to say more than that the reference chromosome and start position are required, but more along the lines of why and how they would be used. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good question! It's covered in the JavaDoc for the merge strategy. I added a JavaDoc here to direct readers there. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for adding the documentation reference, though there is still some small confusion that I commented on in the following post: Thanks, |
||
private VariantMergeStrategy merger; | ||
|
||
@Override | ||
public void startBundle(DoFn<Iterable<Variant>, Variant>.Context c) throws Exception { | ||
public void startBundle(DoFn<KV<KV<String, Long>, Iterable<Variant>>, Variant>.Context c) throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above please regarding JavaDoc and reasoning. |
||
super.startBundle(c); | ||
Options options = c.getPipelineOptions().as(Options.class); | ||
merger = options.getVariantMergeStrategy().newInstance(); | ||
} | ||
|
||
@Override | ||
public void processElement(ProcessContext context) throws Exception { | ||
merger.merge(context.element(), new DataflowVariantEmitter(context)); | ||
merger.merge(context.element().getKey().getValue(), context.element().getValue(), new DataflowVariantEmitter(context)); | ||
} | ||
} | ||
|
||
/** | ||
* Emit a merged variant to the Dataflow pipeline. | ||
* | ||
* See {@link VariantEmitterStrategy} for more detail. | ||
*/ | ||
public static class DataflowVariantEmitter implements VariantEmitterStrategy { | ||
private final DoFn<Iterable<Variant>, Variant>.ProcessContext context; | ||
private final DoFn<KV<KV<String, Long>, Iterable<Variant>>, Variant>.ProcessContext context; | ||
|
||
public DataflowVariantEmitter(DoFn<Iterable<Variant>, Variant>.ProcessContext context) { | ||
public DataflowVariantEmitter(DoFn<KV<KV<String, Long>, Iterable<Variant>>, Variant>.ProcessContext context) { | ||
this.context = context; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -121,10 +121,10 @@ public static void main(String[] args) throws Exception { | |
new JoinNonVariantSegmentsWithVariants.RetrieveAndCombineTransform(auth, VARIANT_FIELDS)); | ||
} else { | ||
processedVariants = requests.apply( | ||
new VariantStreamer(auth, ShardBoundary.Requirement.NON_VARIANT_OVERLAPS, VARIANT_FIELDS)); | ||
new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS)); | ||
} | ||
} else { | ||
// Computing IBS over genomic region(s) or the whole genome. | ||
// Compute IBS over genomic region(s) or the whole genome. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Proposed substitution for the whole sentence: Computes IBS over one or more genomic regions, or the entire genome. Feel free to use whole instead of entire if that is preferred. |
||
List<StreamVariantsRequest> requests = options.isAllReferences() ? | ||
ShardUtils.getVariantRequests(prototype, ShardUtils.SexChromosomeFilter.EXCLUDE_XY, | ||
options.getBasesPerShard(), auth) : | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -150,10 +150,10 @@ public void testIsOverlapping() { | |
|
||
@Test | ||
public void testCombineVariantsFn() { | ||
DoFnTester<Iterable<Variant>, Variant> fn = | ||
DoFnTester<KV<KV<String, Long>, Iterable<Variant>>, Variant> fn = | ||
DoFnTester.of(new JoinNonVariantSegmentsWithVariants.CombineVariantsFn()); | ||
|
||
Assert.assertThat(fn.processBatch(Arrays.asList(input)), | ||
Assert.assertThat(fn.processBatch(KV.of(KV.of("chr7", 200000L), (Iterable<Variant>) Arrays.asList(input))), | ||
CoreMatchers.hasItems(expectedSnp1, expectedSnp2, expectedInsert)); | ||
} | ||
|
||
|
@@ -163,14 +163,14 @@ public void testBinVariantsFn() { | |
DoFnTester.of(new JoinNonVariantSegmentsWithVariants.BinShuffleAndCombineTransform.BinVariantsFn()); | ||
|
||
List<KV<KV<String, Long>, Variant>> binVariantsOutput = binVariantsFn.processBatch(input); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200L), snp1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200L), snp2))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200L), insert))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 199L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 201L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 202L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200L), blockRecord2))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200000L), snp1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200000L), snp2))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200000L), insert))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 199000L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200000L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 201000L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 202000L), blockRecord1))); | ||
assertThat(binVariantsOutput, CoreMatchers.hasItem(KV.of(KV.of("chr7", 200000L), blockRecord2))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why create a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need a compound key here, (chromosome, start of bin), and the compound key is used in a key value pair. So that is why we wind up with a nested KV. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now I'm confused. It looks like you never use the chromosome string (i.e. "chr7"), since your
And you are merging the variant block of records starting with a start position, and the rest based on the maximum value in that list. So why not just remove the chromosome reference and just write an offset, as your variants are sorted by position. It would be better to actually utilize the reference with an offset, or even better by a genomic region to intersect by. I recommend that the combine/filter strategy in Trust me on this - it will save you major headaches later on, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The form of the KV is consistent throughout JoinNonVariantSegmentsWithVariants. That code is binning variants based on chromosome and position. The variant merge strategy is different, it is handed a particular bin of variants to work on. It doesn't make sense to merge variants on different chromosomes so that's why chromosome isn't in the signature. But your point is well taken that it could use more clarity - thanks for this feedback and I will address it in a future PR! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the clarification, and that more documentation will be added with a future PR. Of course I agree it doesn't make sense to merge variants on different chromosomes, but to double-check that the reference is not included in the argument list of the called
What I am proposing are only simplifications to the way things are approached, because otherwise they will be not only harder to debug later on but, more importantly, harder for users to expand in order to increase the set of features available in this nice API and in other dependent APIs of Google Genomics. |
||
assertEquals(8, binVariantsOutput.size()); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like the latest version is
1.1
based on this: https://oss.sonatype.org/#nexus-search;quick~gatk-tools-java
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I'm not going to update this particular dependency at this time.