Skip to content
This repository has been archived by the owner on Sep 15, 2021. It is now read-only.

Shard readGroupSets using the associated referenceSet. #64

Merged
merged 5 commits into from
Oct 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 63 additions & 2 deletions src/main/java/com/google/cloud/genomics/utils/GenomicsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,17 @@

import com.google.api.services.genomics.Genomics;
import com.google.api.services.genomics.model.CallSet;
import com.google.api.services.genomics.model.CoverageBucket;
import com.google.api.services.genomics.model.ListCoverageBucketsResponse;
import com.google.api.services.genomics.model.ReadGroupSet;
import com.google.api.services.genomics.model.Reference;
import com.google.api.services.genomics.model.ReferenceBound;
import com.google.api.services.genomics.model.SearchCallSetsRequest;
import com.google.api.services.genomics.model.SearchReadGroupSetsRequest;
import com.google.api.services.genomics.model.SearchReferencesRequest;
import com.google.api.services.genomics.model.SearchVariantSetsRequest;
import com.google.api.services.genomics.model.VariantSet;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
Expand Down Expand Up @@ -56,7 +61,64 @@ public static List<String> getReadGroupSetIds(String datasetId, GenomicsFactory.
}
return output;
}

/**
 * Looks up the referenceSetId associated with a readGroupSet using the Genomics API.
 *
 * Only the {@code referenceSetId} field of the readGroupSet is requested.
 *
 * @param readGroupSetId The id of the readGroupSet to query.
 * @param auth The OfflineAuth for the API request.
 * @return The referenceSetId for the readGroupSet (which may be null).
 * @throws IOException
 * @throws GeneralSecurityException
 */
public static String getReferenceSetId(String readGroupSetId, GenomicsFactory.OfflineAuth auth)
    throws IOException, GeneralSecurityException {
  return auth.getGenomics(auth.getDefaultFactory())
      .readgroupsets()
      .get(readGroupSetId)
      .setFields("referenceSetId")
      .execute()
      .getReferenceSetId();
}

/**
 * Gets the CoverageBuckets for a given readGroupSetId using the Genomics API.
 *
 * A single list request is issued; if the response carries a next-page token the call fails
 * rather than returning a partial result.
 *
 * @param readGroupSetId The id of the readGroupSet to query.
 * @param auth The OfflineAuth for the API request.
 * @return The list of coverage buckets for the readGroupSet; an empty list (never null) when
 *     the response carries no buckets.
 * @throws IOException
 * @throws GeneralSecurityException
 * @throws IllegalArgumentException if the response indicates that more pages of coverage
 *     buckets exist than were returned by the single list request.
 */
public static List<CoverageBucket> getCoverageBuckets(String readGroupSetId, GenomicsFactory.OfflineAuth auth)
    throws IOException, GeneralSecurityException {
  Genomics genomics = auth.getGenomics(auth.getDefaultFactory());
  ListCoverageBucketsResponse response =
      genomics.readgroupsets().coveragebuckets().list(readGroupSetId).execute();
  // Requests of this form return one result per reference name, so therefore many fewer than
  // the default page size, but verify that the assumption holds true.
  if (!Strings.isNullOrEmpty(response.getNextPageToken())) {
    throw new IllegalArgumentException("Read group set " + readGroupSetId
        + " has more Coverage Buckets than the default page size for the CoverageBuckets list operation.");
  }
  List<CoverageBucket> buckets = response.getCoverageBuckets();
  // NOTE(review): the response model presumably leaves this list null when no buckets are
  // returned; callers iterate the result directly, so hand back an empty list instead.
  return (null == buckets) ? java.util.Collections.<CoverageBucket>emptyList() : buckets;
}


/**
 * Searches the Genomics API for the references belonging to a referenceSet.
 *
 * @param referenceSetId The id of the referenceSet to query.
 * @param auth The OfflineAuth for the API request.
 * @return The references in the referenceSet, as produced by the References paginator.
 * @throws IOException
 * @throws GeneralSecurityException
 */
public static Iterable<Reference> getReferences(String referenceSetId, GenomicsFactory.OfflineAuth auth)
    throws IOException, GeneralSecurityException {
  Genomics client = auth.getGenomics(auth.getDefaultFactory());
  SearchReferencesRequest query = new SearchReferencesRequest();
  query.setReferenceSetId(referenceSetId);
  return Paginator.References.create(client).search(query);
}

/**
* Gets VariantSetIds from a given datasetId using the Genomics API.
*
Expand Down Expand Up @@ -122,6 +184,5 @@ public static List<ReferenceBound> getReferenceBounds(String variantSetId, Genom
Genomics genomics = auth.getGenomics(auth.getDefaultFactory());
VariantSet variantSet = genomics.variantsets().get(variantSetId).execute();
return variantSet.getReferenceBounds();
}

}
}
83 changes: 60 additions & 23 deletions src/main/java/com/google/cloud/genomics/utils/ShardUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import com.google.api.services.genomics.model.CoverageBucket;
import com.google.api.services.genomics.model.ReferenceBound;
import com.google.api.services.genomics.model.SearchReadsRequest;
import com.google.api.services.genomics.model.SearchVariantsRequest;
Expand Down Expand Up @@ -71,6 +73,7 @@ public enum SexChromosomeFilter {
*/
EXCLUDE_XY
}
/**
 * Pattern for sex chromosome reference names: exactly "X" or "Y", optionally prefixed with
 * "chr", ignoring case (for example "chrX", "Y" or "x"). Because the expression is anchored,
 * names that merely contain those letters, such as "chr6_cox_hap2", do not match.
 */
public static final Pattern SEX_CHROMOSOME_REGEXP = Pattern.compile("^(chr)?[XY]$", Pattern.CASE_INSENSITIVE);

/**
* Constructs sharded StreamVariantsRequests for the specified contiguous region(s) of the genome.
Expand Down Expand Up @@ -232,29 +235,36 @@ public SearchReadsRequest apply(Contig shard) {
}

/**
* Constructs StreamReadsRequest for the readGroupSetIds, assuming that the user wants to
* include all references.
* Constructs sharded StreamReadsRequests for all the references in the readGroupSet.
*
* TODO: Should this be sharded - by the referenceBounds for the associated referenceSet
* and/or by read groups?
*
* @param readGroupSetIds The readGroupSetIds.
* @param readGroupSetId The readGroupSetId.
* @param sexChromosomeFilter An enum value indicating how sex chromosomes should be
* handled in the result.
* @param numberOfBasesPerShard The maximum number of bases to include per shard.
* @param auth The OfflineAuth to be used to get the coverage buckets for the readGroupSet.
* @return The shuffled list of sharded request objects.
* @throws IOException
* @throws GeneralSecurityException
*/
public static ImmutableList<StreamReadsRequest> getReadRequests(List<String> readGroupSetIds) {
// Work around lack of FluentIterable.shuffle() https://github.com/google/guava/issues/1358
List<StreamReadsRequest> requests =
Arrays.asList(FluentIterable.from(readGroupSetIds)
.transform(new Function<String, StreamReadsRequest>() {
public static ImmutableList<StreamReadsRequest> getReadRequests(final String readGroupSetId,
SexChromosomeFilter sexChromosomeFilter, long numberOfBasesPerShard,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like there may be more use cases for custom Filters. No need to do now, but if you think other filters make sense here, consider changing to a Filter superclass and dealing with a list of filters that the user can apply.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with that approach.

I filed #65 for when we have other specific filters we want to add at this level where we construct shards.

GenomicsFactory.OfflineAuth auth) throws IOException, GeneralSecurityException {
Iterable<Contig> shards = getAllShardsInReadGroupSet(readGroupSetId, sexChromosomeFilter,
numberOfBasesPerShard, auth);
return FluentIterable.from(shards)
.transform(new Function<Contig, StreamReadsRequest>() {
@Override
public StreamReadsRequest apply(String readGroupSetId) {
return StreamReadsRequest.newBuilder()
.setReadGroupSetId(readGroupSetId)
.build();
public StreamReadsRequest apply(Contig shard) {
return shard.getStreamReadsRequest(readGroupSetId);
}
}).toArray(StreamReadsRequest.class));
Collections.shuffle(requests);
return FluentIterable.from(requests).toList();
}).toList();
}

/**
 * Looks up the contigs of the variantSet (applying the sex chromosome filter) and hands them
 * to getAllShardsForContigs together with the requested shard size.
 */
private static List<Contig> getAllShardsInVariantSet(String variantSetId,
    SexChromosomeFilter sexChromosomeFilter, long numberOfBasesPerShard,
    GenomicsFactory.OfflineAuth auth) throws IOException, GeneralSecurityException {
  return ShardUtils.getAllShardsForContigs(
      getContigsInVariantSet(variantSetId, sexChromosomeFilter, auth),
      numberOfBasesPerShard);
}

/**
Expand All @@ -274,9 +284,8 @@ private static List<Contig> getContigsInVariantSet(String variantSetId,
throws IOException, GeneralSecurityException {
List<Contig> contigs = Lists.newArrayList();
for (ReferenceBound bound : GenomicsUtils.getReferenceBounds(variantSetId, auth)) {
String contig = bound.getReferenceName().toLowerCase();
if (sexChromosomeFilter == SexChromosomeFilter.EXCLUDE_XY
&& (contig.contains("x") || contig.contains("y"))) {
&& SEX_CHROMOSOME_REGEXP.matcher(bound.getReferenceName()).matches()) {
// X and Y can skew some analysis results
continue;
}
Expand All @@ -285,13 +294,41 @@ private static List<Contig> getContigsInVariantSet(String variantSetId,
return contigs;
}

private static List<Contig> getAllShardsInVariantSet(String variantSetId,
private static List<Contig> getAllShardsInReadGroupSet(String readGroupSetId,
SexChromosomeFilter sexChromosomeFilter, long numberOfBasesPerShard,
GenomicsFactory.OfflineAuth auth) throws IOException, GeneralSecurityException {
List<Contig> contigs = getContigsInVariantSet(variantSetId, sexChromosomeFilter, auth);
List<Contig> contigs = getContigsInReadGroupSet(readGroupSetId, sexChromosomeFilter, auth);
return ShardUtils.getAllShardsForContigs(contigs, numberOfBasesPerShard);
}


/**
 * Retrieve the list of all the reference names and their start/end positions for the ranges of
 * the coverage buckets computed for this readGroupSet. A bucket whose range carries no start
 * position is treated as starting at 0.
 *
 * @param readGroupSetId - The id of the readGroupSet to query.
 * @param sexChromosomeFilter - An enum value indicating how sex chromosomes should be
 *     handled in the result.
 * @param auth - The OfflineAuth used to fetch the coverage buckets for the readGroupSet.
 * @return The list of all references in the readGroupSet; empty when no coverage buckets
 *     are available.
 * @throws IOException
 * @throws GeneralSecurityException
 */
private static List<Contig> getContigsInReadGroupSet(String readGroupSetId,
    SexChromosomeFilter sexChromosomeFilter, GenomicsFactory.OfflineAuth auth)
    throws IOException, GeneralSecurityException {
  List<Contig> contigs = Lists.newArrayList();
  List<CoverageBucket> buckets = GenomicsUtils.getCoverageBuckets(readGroupSetId, auth);
  if (null == buckets) {
    // Nothing to shard; avoid a NullPointerException when the API returned no bucket list.
    return contigs;
  }
  boolean excludeXY = sexChromosomeFilter == SexChromosomeFilter.EXCLUDE_XY;
  for (CoverageBucket bucket : buckets) {
    String referenceName = bucket.getRange().getReferenceName();
    if (excludeXY && SEX_CHROMOSOME_REGEXP.matcher(referenceName).matches()) {
      // X and Y can skew some analysis results
      continue;
    }
    Long start = bucket.getRange().getStart();
    contigs.add(new Contig(referenceName,
        (null == start) ? 0L : start,
        bucket.getRange().getEnd()));
  }
  return contigs;
}

private static List<Contig> getSpecifiedShards(String contigsArgument, long numberOfBasesPerShard) {
Iterable<Contig> contigs = Contig.parseContigsFromCommandLine(contigsArgument);
return ShardUtils.getAllShardsForContigs(contigs, numberOfBasesPerShard);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package com.google.cloud.genomics.utils;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.security.GeneralSecurityException;
Expand All @@ -39,6 +40,12 @@ public void testGetReadGroupSetIds() throws IOException, GeneralSecurityExceptio
CoreMatchers.allOf(CoreMatchers.hasItems(helper.PLATINUM_GENOMES_READGROUPSETS)));
}

// The first Platinum Genomes readGroupSet should report the Platinum Genomes referenceSet id.
@Test
public void testGetReferenceSetIdForReadGroupSet() throws IOException, GeneralSecurityException {
  String readGroupSetId = helper.PLATINUM_GENOMES_READGROUPSETS[0];
  String actualReferenceSetId = GenomicsUtils.getReferenceSetId(readGroupSetId, helper.getAuth());
  assertEquals(helper.PLATINUM_GENOMES_REFERENCE_SET_ID, actualReferenceSetId);
}

@Test
public void testGetVariantSetIds() throws IOException, GeneralSecurityException {
assertThat(GenomicsUtils.getVariantSetIds(helper.PLATINUM_GENOMES_DATASET, helper.getAuth()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ public class IntegrationTestHelper {
"CMvnhpKTFhCUpIDDveWE-r0B",
"CMvnhpKTFhCrvIOEw4Ol__sB",
};
public static final String PLATINUM_GENOMES_REFERENCE_SET_ID = "CNfS6aHAoved2AEQ6PnzkOzw15rqAQ";
public static final String PLATINUM_GENOMES_BRCA1_REFERENCES = "chr17:41196311:41277499";
public static final String PLATINUM_GENOMES_KLOTHO_REFERENCES = "chr13:33628137:33628138";
public static final ReferenceBound[] PLATINUM_GENOMES_VARIANTSET_BOUNDS = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@

import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.Arrays;

import org.hamcrest.CoreMatchers;
import org.junit.BeforeClass;
import org.junit.Test;

import com.google.cloud.genomics.utils.ShardUtils.SexChromosomeFilter;
import com.google.genomics.v1.StreamReadsRequest;
import com.google.genomics.v1.StreamVariantsRequest;

public class ShardUtilsITCase {
Expand Down Expand Up @@ -121,4 +123,92 @@ public void testGetVariantRequestsStringSexChromosomeFilterLongOfflineAuth() thr
CoreMatchers.allOf(CoreMatchers.hasItems(EXPECTED_RESULT),
CoreMatchers.hasItems(EXPECTED_RESULT_XY)));
}

// Checks the read shards built for the first Platinum Genomes readGroupSet, first with the
// sex chromosomes excluded and then with them included.
@Test
public void testGetReadRequestsStringSexChromosomeFilterLongOfflineAuth() throws IOException, GeneralSecurityException {
  final String readGroupSetId = helper.PLATINUM_GENOMES_READGROUPSETS[0];
  // These shards are "too big" to use in practice but for this test it keeps the
  // expected result from getting crazy long.
  final long basesPerShard = 150000000L;

  StreamReadsRequest[] expectedSexChromosomeShards = {
    new Contig("chrX", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chrX", 150000000L, 155270560L).getStreamReadsRequest(readGroupSetId),
    new Contig("chrY", 0L, 59373566L).getStreamReadsRequest(readGroupSetId)
  };

  StreamReadsRequest[] expectedOtherShards = {
    new Contig("chr1", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr1", 150000000L, 249250621L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr10", 0L, 135534747L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr11", 0L, 135006516L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr12", 0L, 133851895L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr13", 0L, 115169878L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr14", 0L, 107349540L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr15", 0L, 102531392L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr16", 0L, 90354753L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr17", 0L, 81195210L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr18", 0L, 78077248L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr19", 0L, 59128983L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr2", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr2", 150000000L, 243199373L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr20", 0L, 63025520L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr21", 0L, 48129895L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr22", 0L, 51304566L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr3", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr3", 150000000L, 198022430L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr4", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr4", 150000000L, 191154276L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr5", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr5", 150000000L, 180915260L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr6", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr6", 150000000L, 171115067L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr7", 0L, 150000000L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr7", 150000000L, 159138663L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr8", 0L, 146364022L).getStreamReadsRequest(readGroupSetId),
    new Contig("chr9", 0L, 141213431L).getStreamReadsRequest(readGroupSetId),
    new Contig("chrM", 0L, 16571L).getStreamReadsRequest(readGroupSetId)
  };

  // Sex chromosomes excluded.
  assertThat(ShardUtils.getReadRequests(readGroupSetId,
      SexChromosomeFilter.EXCLUDE_XY, basesPerShard, helper.getAuth()),
      CoreMatchers.allOf(CoreMatchers.hasItems(expectedOtherShards)));

  // Include sex chromosomes this time.
  assertThat(ShardUtils.getReadRequests(readGroupSetId,
      SexChromosomeFilter.INCLUDE_XY, basesPerShard, helper.getAuth()),
      CoreMatchers.allOf(CoreMatchers.hasItems(expectedOtherShards),
          CoreMatchers.hasItems(expectedSexChromosomeShards)));
}
}
23 changes: 13 additions & 10 deletions src/test/java/com/google/cloud/genomics/utils/ShardUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.util.Arrays;
import java.util.List;
Expand Down Expand Up @@ -86,16 +88,6 @@ public void testGetPaginatedReadRequests() {
CoreMatchers.allOf(CoreMatchers.hasItems(EXPECTED_RESULT)));
}

@Test
public void testGetReadRequestsListOfString() {
final StreamReadsRequest[] EXPECTED_RESULT = {
StreamReadsRequest.newBuilder().setReadGroupSetId("readset1").build(),
StreamReadsRequest.newBuilder().setReadGroupSetId("readset2").build(),
};
assertThat(ShardUtils.getReadRequests(Arrays.asList("readset1", "readset2")),
CoreMatchers.allOf(CoreMatchers.hasItems(EXPECTED_RESULT)));
}

@Test
public void testVariantShardsAreShuffled() throws Exception {
final StreamVariantsRequest[] EXPECTED_RESULT = {
Expand Down Expand Up @@ -177,4 +169,15 @@ public void testReadShardsAreShuffled() throws Exception {
// Lists have different orders for their elements.
assertThat(requests, is(not(requests2)));
}

// The sex chromosome pattern must accept X/Y with or without the "chr" prefix in either case,
// and must reject other reference names that merely contain those letters.
@Test
public void testSexChromosomeRegexp() {
  for (String sexChromosome : new String[] {"chrX", "chrY", "X", "Y", "x", "y"}) {
    assertTrue(ShardUtils.SEX_CHROMOSOME_REGEXP.matcher(sexChromosome).matches());
  }
  assertFalse(ShardUtils.SEX_CHROMOSOME_REGEXP.matcher("chr6_cox_hap2").matches());
}
}