diff --git a/.gitignore b/.gitignore index 3b1abaf..6cc0a52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ bin .classpath .project - +.jar_tmp +classes +dist +gatk-tools-java.version.properties diff --git a/build.xml b/build.xml new file mode 100755 index 0000000..6735451 --- /dev/null +++ b/build.xml @@ -0,0 +1,145 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/htsjdk-1.118.jar b/lib/htsjdk-1.121.jar similarity index 79% rename from lib/htsjdk-1.118.jar rename to lib/htsjdk-1.121.jar index a19810b..128d936 100644 Binary files a/lib/htsjdk-1.118.jar and b/lib/htsjdk-1.121.jar differ diff --git a/src/main/java/com/google/cloud/genomics/gatk/common/GA4GHUrl.java b/src/main/java/com/google/cloud/genomics/gatk/common/GA4GHUrl.java index 269e684..8e677f3 100644 --- a/src/main/java/com/google/cloud/genomics/gatk/common/GA4GHUrl.java +++ b/src/main/java/com/google/cloud/genomics/gatk/common/GA4GHUrl.java @@ -16,11 +16,12 @@ package com.google.cloud.genomics.gatk.common; import java.net.URISyntaxException; +import java.net.URL; /** * Represents a GA4GH reads resource as a URL in the form of * ga4gh:///readsets///[start-end], - * e.g. ga4gh://www.googleapis.com/genomics/v1beta/reads/CLqN8Z3sDRDQldHJ_rTS9VE/1/ + * e.g. ga4gh://www.googleapis.com/genomics/v1beta/readsets/CLqN8Z3sDRDQldHJ_rTS9VE/1/ */ public class GA4GHUrl { int rangeStart = 0; @@ -53,6 +54,10 @@ public GA4GHUrl(String rootUrl, this.rangeEnd = rangeEnd; } + public GA4GHUrl(URL input) throws URISyntaxException { + this(input.toString().replace("https://", GA4GH_SCHEMA_PREFIX)); + } + public GA4GHUrl(String input) throws URISyntaxException { if (!isGA4GHUrl(input)) { throw new URISyntaxException(input, "Schema is not ga4gh"); @@ -67,7 +72,7 @@ public GA4GHUrl(String input) throws URISyntaxException { String[] pathComponents = readsPath.split("/"); if (pathComponents.length < 4) { throw new URISyntaxException(input, - "Expecting " + READS_PATH_COMPONENT +"readset/sequence/[range], got " + "Expecting " + READS_PATH_COMPONENT + "readset/sequence/[range], got " + readsPath); } diff --git a/src/main/java/com/google/cloud/genomics/gatk/common/GenomicsApiDataSource.java b/src/main/java/com/google/cloud/genomics/gatk/common/GenomicsApiDataSource.java index 2f845d8..df75159 100644 --- a/src/main/java/com/google/cloud/genomics/gatk/common/GenomicsApiDataSource.java +++ b/src/main/java/com/google/cloud/genomics/gatk/common/GenomicsApiDataSource.java @@ -15,10 +15,15 @@ */ package com.google.cloud.genomics.gatk.common; +import com.google.api.client.auth.oauth2.Credential; import com.google.api.client.extensions.java6.auth.oauth2.VerificationCodeReceiver; import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver; import com.google.api.client.googleapis.extensions.java6.auth.oauth2.GooglePromptReceiver; +import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; import com.google.api.client.googleapis.json.GoogleJsonResponseException; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.json.jackson2.JacksonFactory; import com.google.api.services.genomics.Genomics; import com.google.api.services.genomics.GenomicsScopes; import com.google.api.services.genomics.model.Dataset; @@ -70,32 +75,49 @@ private Genomics getApi() throws GeneralSecurityException, IOException { } private Genomics initGenomicsApi() throws GeneralSecurityException, IOException { - File clientSecrets = new File(clientSecretsFilename); - if (!clientSecrets.exists()) { - throw new IOException( - "Client secrets file " + clientSecretsFilename + " does not exist." - + " Visit https://developers.google.com/genomics to learn how" - + " to install a client_secrets.json file. If you have installed a client_secrets.json" - + " in a specific location, use --client_secrets_filename /client_secrets.json."); + LOG.info("Initializing Genomics API for " + rootUrl); + if (!clientSecretsFilename.isEmpty()) { + File clientSecrets = new File(clientSecretsFilename); + if (!clientSecrets.exists()) { + throw new IOException( + "Client secrets file " + clientSecretsFilename + " does not exist." + + " Visit https://developers.google.com/genomics to learn how" + + " to install a client_secrets.json file. If you have installed a client_secrets.json" + + " in a specific location, use --client_secrets_filename /client_secrets.json."); + } + LOG.info("Using client secrets file " + clientSecretsFilename); + + VerificationCodeReceiver receiver = noLocalServer ? + new GooglePromptReceiver() : new LocalServerReceiver(); + GenomicsFactory genomicsFactory = GenomicsFactory + .builder("genomics_java_client") + .setScopes(SCOPES) + .setRootUrl(rootUrl) + .setServicePath("/") + .setVerificationCodeReceiver(Suppliers.ofInstance(receiver)) + .build(); + return genomicsFactory.fromClientSecretsFile(clientSecrets); + } else { + final Genomics.Builder builder = new Genomics + .Builder( + GoogleNetHttpTransport.newTrustedTransport(), + JacksonFactory.getDefaultInstance(), + new HttpRequestInitializer() { + @Override public void initialize(HttpRequest httpRequest) throws IOException { + httpRequest.setReadTimeout(20000); + httpRequest.setConnectTimeout(20000); + } + }) + .setApplicationName("genomics_java_client") + .setRootUrl(rootUrl) + .setServicePath("/"); + return builder.build(); } - - VerificationCodeReceiver receiver = noLocalServer ? - new GooglePromptReceiver() : new LocalServerReceiver(); - - GenomicsFactory genomicsFactory = GenomicsFactory - .builder("genomics_java_client") - .setScopes(SCOPES) - .setUserName("user" + SCOPES.size()) - .setVerificationCodeReceiver(Suppliers.ofInstance(receiver)) - .setRootUrl(rootUrl) - .setServicePath("/") - .build(); - - return genomicsFactory.fromClientSecretsFile(clientSecrets); } public ReadIteratorResource getReadsFromGenomicsApi(GA4GHUrl url) throws IOException, GeneralSecurityException { + LOG.info("Getting reads from " + url); return getReadsFromGenomicsApi(url.getReadset(), url.getSequence(), url.getRangeStart(), url.getRangeEnd()); } @@ -103,6 +125,8 @@ public ReadIteratorResource getReadsFromGenomicsApi(GA4GHUrl url) public ReadIteratorResource getReadsFromGenomicsApi(String readsetId, String sequenceName, int sequenceStart, int sequenceEnd) throws IOException, GeneralSecurityException { + LOG.info("Getting readset " + readsetId + ", sequence " + sequenceName + + ", start=" + sequenceStart + ", end=" + sequenceEnd); final Genomics stub = getApi(); // TODO(iliat): implement API retries and using access key for public // datasets diff --git a/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHQueryInterval.java b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHQueryInterval.java new file mode 100644 index 0000000..b3617e5 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHQueryInterval.java @@ -0,0 +1,82 @@ +package com.google.cloud.genomics.gatk.htsjdk; + +import htsjdk.samtools.util.CoordMath; +import htsjdk.samtools.SAMRecord; + +/** + * Similar to HTSJDK's QueryInterval but allows specifying sequence name + * (as opposed to index in the header) and adds ability to check if a given read + * matches the interval. + */ +public class GA4GHQueryInterval { + private String sequence; + private int start; + private int end; + + public enum ReadPositionConstraint { + OVERLAPPING, + CONTAINED, + START_AT + } + private ReadPositionConstraint readPositionConstraint; + + public GA4GHQueryInterval(String sequence, int start, int end, + ReadPositionConstraint readPositionConstraint) { + super(); + this.sequence = sequence; + this.start = start; + this.end = end; + this.readPositionConstraint = readPositionConstraint; + } + + public String getSequence() { + return sequence; + } + + public void setSequence(String sequence) { + this.sequence = sequence; + } + + public int getStart() { + return start; + } + + public void setStart(int start) { + this.start = start; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + public ReadPositionConstraint getReadPositionConstraint() { + return readPositionConstraint; + } + + public void setReadPositionConstraint(ReadPositionConstraint readPositionConstraint) { + this.readPositionConstraint = readPositionConstraint; + } + + /** + * Returns true iff the read specified by the record matches the interval + * given the interval's constraints and the read position. + */ + public boolean matches(SAMRecord record) { + int myEnd = end == 0 ? Integer.MAX_VALUE : end; + switch (readPositionConstraint) { + case OVERLAPPING: + return CoordMath.overlaps(start, myEnd, + record.getAlignmentStart(), record.getAlignmentEnd()); + case CONTAINED: + return CoordMath.encloses(start, myEnd, + record.getAlignmentStart(), record.getAlignmentEnd()); + case START_AT: + return start == record.getAlignmentStart(); + } + return false; + } +} diff --git a/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHReaderFactory.java b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHReaderFactory.java new file mode 100644 index 0000000..507a5f6 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHReaderFactory.java @@ -0,0 +1,25 @@ +package com.google.cloud.genomics.gatk.htsjdk; + +import htsjdk.samtools.CustomReaderFactory; +import htsjdk.samtools.SamReader; + +import java.net.URL; +import java.util.logging.Logger; +/** + * HTSJDK CustomReaderFactory implementation. + * Returns a SamReader that reads data from GA4GH API. + */ +public class GA4GHReaderFactory implements CustomReaderFactory.ICustomReaderFactory { + private static final Logger LOG = Logger.getLogger(GA4GHReaderFactory.class.getName()); + + @Override + public SamReader open(URL url) { + try { + return new GA4GHSamReader(url); + } catch (Exception ex) { + LOG.warning("Error creating SamReader " + ex.toString()); + return null; + } + } + +} diff --git a/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamReader.java b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamReader.java new file mode 100644 index 0000000..21c06d6 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamReader.java @@ -0,0 +1,182 @@ +package com.google.cloud.genomics.gatk.htsjdk; + +import com.google.cloud.genomics.gatk.common.GA4GHUrl; +import com.google.cloud.genomics.gatk.common.GenomicsApiDataSource; +import com.google.cloud.genomics.gatk.common.GenomicsApiDataSourceFactory; +import com.google.cloud.genomics.gatk.common.GenomicsApiDataSourceFactory.Settings; + +import htsjdk.samtools.QueryInterval; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFormatException; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.util.CloseableIterator; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.URL; +import java.security.GeneralSecurityException; + +/** + * SamReader implementation that reads data from GA4GH API. + * For client_secrets file, specify the path in the ga4gh.client_secrets system property. + */ +public class GA4GHSamReader implements SamReader { + private GA4GHUrl url; + private GenomicsApiDataSourceFactory factory; + GenomicsApiDataSource dataSource; + GA4GHSamRecordIterator iterator; + + public GA4GHSamReader(URL url) throws URISyntaxException, IOException, GeneralSecurityException { + this.url = new GA4GHUrl(url); + this.factory = new GenomicsApiDataSourceFactory(); + factory.configure(this.url.getRootUrl(), + new Settings( + System.getProperty("ga4gh.client_secrets", "client_secrets.json"), + System.getProperty("ga4gh.no_local_server","") + .toLowerCase().equals("true"))); + dataSource = factory.get(this.url.getRootUrl()); + queryOverlapping(this.url.getSequence(), this.url.getRangeStart(), + this.url.getRangeEnd()); + } + + @Override + public void close() throws IOException { + this.dataSource = null; + this.factory = null; + } + + @Override + public SAMFileHeader getFileHeader() { + return iterator.getFileHeader(); + } + + @Override + public Type type() { + // TODO(iliat): figure out bearing of this on indexing + return Type.SAM_TYPE; + } + + @Override + public boolean hasIndex() { + return true; + } + + @Override + public Indexing indexing() { + return null; + } + + @Override + public SAMRecordIterator queryContained(String sequence, int start, int end) { + return query(sequence, start, end, true); + } + + @Override + public SAMRecordIterator queryOverlapping(String sequence, int start, int end) { + return query(sequence, start, end, false); + } + + @Override + public SAMRecordIterator queryContained(QueryInterval[] intervals) { + return query(intervals, true); + } + + @Override + public SAMRecordIterator queryOverlapping(QueryInterval[] intervals) { + return query(intervals, false); + } + + @Override + public SAMRecordIterator queryUnmapped() { + return queryOverlapping("*", 0, 0); + } + + @Override + public SAMRecordIterator query(QueryInterval[] intervals, boolean contained) { + final GA4GHQueryInterval[] myIntervals = new GA4GHQueryInterval[intervals.length]; + final GA4GHQueryInterval.ReadPositionConstraint constraint = contained ? + GA4GHQueryInterval.ReadPositionConstraint.CONTAINED : + GA4GHQueryInterval.ReadPositionConstraint.OVERLAPPING; + final SAMFileHeader header = getFileHeader(); + for (int i = 0; i < intervals.length; i++) { + final QueryInterval interval = intervals[i]; + final String sequence = header.getSequence( + interval.referenceIndex).getSequenceName(); + myIntervals[i] = new GA4GHQueryInterval(sequence, interval.start, + interval.end, constraint); + } + return query(myIntervals); + } + + @Override + public SAMRecordIterator query(String sequence, int start, int end, boolean contained) { + return query(new GA4GHQueryInterval[] { + new GA4GHQueryInterval(sequence, start, end, contained ? + GA4GHQueryInterval.ReadPositionConstraint.CONTAINED : + GA4GHQueryInterval.ReadPositionConstraint.OVERLAPPING)}); + } + + @Override + public SAMRecordIterator queryAlignmentStart(String sequence, int start) { + return query(new GA4GHQueryInterval[] { + new GA4GHQueryInterval(sequence, start, 0, + GA4GHQueryInterval.ReadPositionConstraint.START_AT)}); + } + + @Override + public SAMRecord queryMate(SAMRecord rec) { + if (!rec.getReadPairedFlag()) { + throw new IllegalArgumentException("queryMate called for unpaired read."); + } + if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) { + throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both."); + } + final boolean firstOfPair = rec.getFirstOfPairFlag(); + final CloseableIterator it; + if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + it = queryUnmapped(); + } else { + it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart()); + } + try { + SAMRecord mateRec = null; + while (it.hasNext()) { + final SAMRecord next = it.next(); + if (!next.getReadPairedFlag()) { + if (rec.getReadName().equals(next.getReadName())) { + throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName()); + } + continue; + } + if (firstOfPair) { + if (next.getFirstOfPairFlag()) continue; + } else { + if (next.getSecondOfPairFlag()) continue; + } + if (rec.getReadName().equals(next.getReadName())) { + if (mateRec != null) { + throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() + + " for " + (firstOfPair ? "second" : "first") + " end."); + } + mateRec = next; + } + } + return mateRec; + } finally { + it.close(); + } + } + + public SAMRecordIterator query(GA4GHQueryInterval[] intervals) { + iterator = new GA4GHSamRecordIterator(dataSource, url.getReadset(), + intervals); + return iterator(); + } + + @Override + public SAMRecordIterator iterator() { + return iterator; + } +} diff --git a/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamRecordIterator.java b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamRecordIterator.java new file mode 100644 index 0000000..2814b40 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/GA4GHSamRecordIterator.java @@ -0,0 +1,148 @@ +package com.google.cloud.genomics.gatk.htsjdk; + +import com.google.cloud.genomics.gatk.common.GenomicsApiDataSource; +import com.google.cloud.genomics.gatk.common.ReadIteratorResource; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileHeader.SortOrder; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; + +import java.util.Iterator; +import java.util.logging.Logger; + +/** + * Wraps iterators provided from Genomics API and implements + * HTSJDK's SAMRecordIterator. + * Iterates over data returned from the API and when needed + * re-queries the API for more data. + * Since the API always return *overalpping* reads and SAMRecordIterator + * supports contained and start-at queries, this class filters reads + * returned from the API to make sure they conform to the requested intervals. + */ +public class GA4GHSamRecordIterator implements SAMRecordIterator{ + private static final Logger LOG = Logger.getLogger(GA4GHSamRecordIterator.class.getName()); + + Iterator iterator; + GenomicsApiDataSource dataSource; + GA4GHQueryInterval[] intervals; + String readSetId; + int intervalIndex = -1; + boolean hasNext; + SAMRecord nextRead; + SAMFileHeader header; + + public GA4GHSamRecordIterator(GenomicsApiDataSource dataSource, + String readSetId, + GA4GHQueryInterval[] intervals) { + this.dataSource = dataSource; + this.readSetId = readSetId; + this.intervals = intervals; + seekMatchingRead(); + } + + /** Returns true when we truly reached the end of all requested data */ + boolean isAtEnd() { + return intervals == null || intervals.length == 0 || + intervalIndex >= intervals.length; + } + + /** Returns the current interval being processed or null if we have reached the end */ + GA4GHQueryInterval currentInterval() { + if (isAtEnd()) { + return null; + } + return intervals[intervalIndex]; + } + + /** Re-queries the API for the next interval */ + ReadIteratorResource queryNextInterval() { + if (!isAtEnd()) { + intervalIndex++; + } + if (isAtEnd()) { + return null; + } + return queryForInterval(currentInterval()); + } + + /** Queries the API for an interval and returns the iterator resource, or null if failed */ + ReadIteratorResource queryForInterval(GA4GHQueryInterval interval) { + try { + return dataSource.getReadsFromGenomicsApi(readSetId, interval.getSequence(), + interval.getStart(), interval.getEnd()); + } catch (Exception ex) { + LOG.warning("Error getting data for interval " + ex.toString()); + } + return null; + } + + /** + * Ensures next returned read will match the currently requested interval. + * Since the API always returns overlapping reads we might need to skip some + * reads if the interval asks for "included" or "starts at" types. + * Also deals with the case of iterator being at an end and needing to query + * for the next interval. + */ + void seekMatchingRead() { + while (!isAtEnd()) { + if (iterator == null || !iterator.hasNext()) { + LOG.info("Getting next interval from the API"); + // We have hit an end (or this is first time) so we need to go fish + // to the API. + ReadIteratorResource resource = queryNextInterval(); + if (resource != null) { + LOG.info("Got next interval from the API"); + header = resource.getSAMFileHeader(); + iterator = resource.getSAMRecordIterable().iterator(); + } else { + LOG.info("Failed to get next interval from the API"); + header = null; + iterator = null; + } + } else { + nextRead = iterator.next(); + if (currentInterval().matches(nextRead)) { + return; // Happy case, otherwise we keep spinning in the loop. + } else { + LOG.info("Skipping non matching read"); + } + } + } + } + + + @Override + public void close() { + this.iterator = null; + this.dataSource = null; + this.intervalIndex = intervals.length; + } + + @Override + public boolean hasNext() { + return !isAtEnd(); + } + + @Override + public SAMRecord next() { + SAMRecord retVal = nextRead; + seekMatchingRead(); + return retVal; + } + + @Override + public void remove() { + // Not implemented + } + + @Override + public SAMRecordIterator assertSorted(SortOrder sortOrder) { + // TODO(iliat): implement this properly. This code never checks anything. + return this; + } + + public SAMFileHeader getFileHeader() { + return header; + } +} diff --git a/src/main/java/com/google/cloud/genomics/gatk/htsjdk/SamReaderExample.java b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/SamReaderExample.java new file mode 100644 index 0000000..579c059 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/gatk/htsjdk/SamReaderExample.java @@ -0,0 +1,42 @@ +package com.google.cloud.genomics.gatk.htsjdk; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; + +import java.net.MalformedURLException; +import java.net.URL; + +/** + * Example of HTSJDK SamReader and SamReaderFactory class usage. + * Illustrates how to plug in a custom SamReaderFactory in order to consume + * data from ga4gh urls. + * + * To run this we need to specify the custom reader factory for HTSJDK and set + * client_secrets file path for GenomGenomics API: + * -Dsamjdk.custom_reader=https://www.googleapis.com/genomics,com.google.cloud.genomics.gatk.htsjdk.GA4GHReaderFactory + * -Dga4gh.client_secrets= + */ +public class SamReaderExample { + static String GA4GH_URL = + "https://www.googleapis.com/genomics/v1beta/readsets/CLqN8Z3sDRCwgrmdkOXjn_sB/*/"; + + public static void main(String[] args) { + try { + SamReaderFactory factory = SamReaderFactory.makeDefault(); + + // If it was a file, we would open like so: + // factory.open(new File("~/testdata/htsjdk/samtools/uncompressed.sam")); + // For API access we use SamInputResource constructed from a URL: + SamReader reader = factory.open(SamInputResource.of(new URL(GA4GH_URL))); + + for (final SAMRecord samRecord : reader) { + System.err.println(samRecord); + } + + } catch (MalformedURLException e) { + e.printStackTrace(); + } + } +}