forked from bigdatagenomics/adam
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ADAM-1112] Add interface for piping commands.
Resolves bigdatagenomics#1112. Adds interfaces for piping SAM/BAM and VCF to subprocesses that are run in parallel under Apache Spark.
- Loading branch information
Showing
16 changed files
with
694 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.rdd | ||
|
||
import java.io.OutputStream | ||
|
||
/**
 * Runnable that pushes an iterator of records through an InFormatter into an
 * output stream, then flushes and closes that stream.
 *
 * @param iter The records to write.
 * @param formatter The formatter used to serialize the records.
 * @param os The stream to write the formatted records to. This runner closes
 *   the stream when it is done.
 * @tparam T The type of records being formatted.
 * @tparam U The GenomicRDD type the formatter is bound to.
 * @tparam V The concrete InFormatter type.
 */
private[rdd] class InFormatterRunner[T, U <: GenomicRDD[T, U], V <: InFormatter[T, U, V]](
    iter: Iterator[T],
    formatter: V,
    os: OutputStream) extends Runnable {

  /**
   * Writes every record to the stream, flushes it, and closes it.
   *
   * Any exception thrown while writing or flushing still propagates to the
   * caller; the stream is closed first.
   */
  def run(): Unit = {
    try {
      formatter.write(os, iter)
      os.flush()
    } finally {
      // Close even when writing fails, so the stream is never leaked.
      // NOTE(review): presumably the consumer on the other end of the pipe
      // would otherwise keep waiting for end-of-input -- confirm with caller.
      os.close()
    }
  }
}
|
||
/**
 * Factory interface for building an InFormatter from a genomic RDD.
 *
 * @tparam T The type of records being formatted.
 * @tparam U The GenomicRDD type that a formatter is built from.
 * @tparam V The concrete InFormatter type that is built.
 */
trait InFormatterCompanion[T, U <: GenomicRDD[T, U], V <: InFormatter[T, U, V]] {

  /**
   * Builds a formatter for the given RDD.
   *
   * @param gRdd The genomic RDD to build a formatter for.
   * @return Returns a formatter of type V.
   */
  def apply(gRdd: U): V
}
|
||
/**
 * Formats data going into a pipe to an invoked process.
 *
 * @tparam T The type of records being formatted.
 * @tparam U The GenomicRDD type that this formatter is bound to.
 * @tparam V The concrete type of this formatter.
 */
trait InFormatter[T, U <: GenomicRDD[T, U], V <: InFormatter[T, U, V]] extends Serializable {

  /**
   * The companion that builds formatters of type V from an RDD of type U.
   */
  protected val companion: InFormatterCompanion[T, U, V]

  /**
   * Writes records from an iterator into an output stream.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of records to write.
   */
  def write(os: OutputStream, iter: Iterator[T]): Unit
}
|
47 changes: 47 additions & 0 deletions
47
adam-core/src/main/scala/org/bdgenomics/adam/rdd/OutFormatter.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.rdd | ||
|
||
import java.io.InputStream | ||
import java.util.concurrent.Callable | ||
|
||
/**
 * Callable that hands an input stream to an OutFormatter and returns the
 * iterator of records the formatter produces.
 *
 * @param formatter The formatter used to read records.
 * @param is The input stream to read records from.
 * @tparam T The type of records being read.
 * @tparam U The concrete OutFormatter type.
 */
private[rdd] class OutFormatterRunner[T, U <: OutFormatter[T]](
    formatter: U,
    is: InputStream) extends Callable[Iterator[T]] {

  /**
   * @return Returns whatever iterator the formatter builds over the stream.
   */
  def call(): Iterator[T] = formatter.read(is)
}
|
||
/**
 * Deserializes the data that an invoked process writes back through a pipe.
 *
 * @tparam T The type of records produced by this formatter.
 */
trait OutFormatter[T] extends Serializable {

  /**
   * Turns an input stream into an iterator of records.
   *
   * @param is The input stream, coming from a process, to read records from.
   * @return Returns an iterator over the records read from this stream.
   */
  def read(is: InputStream): Iterator[T]
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.rdd.read | ||
|
||
import htsjdk.samtools.{ SAMFileHeader, SAMFileWriter } | ||
import java.io.OutputStream | ||
import org.bdgenomics.adam.converters.AlignmentRecordConverter | ||
import org.bdgenomics.adam.models.{ | ||
RecordGroupDictionary, | ||
SAMFileHeaderWritable | ||
} | ||
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } | ||
import org.bdgenomics.formats.avro.AlignmentRecord | ||
|
||
/**
 * Companion base for in-formatters that are built from an AlignmentRecordRDD
 * and a SAM file header.
 *
 * @tparam T The concrete formatter type that this companion builds.
 */
trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]]
    extends InFormatterCompanion[AlignmentRecord, AlignmentRecordRDD, T] {

  /**
   * Instantiates the concrete formatter.
   *
   * @param header The header to attach to the formatter.
   * @param recordGroups The record groups to attach to the formatter.
   * @param converter The converter to attach to the formatter.
   * @return Returns a new formatter of type T.
   */
  protected def makeFormatter(header: SAMFileHeaderWritable,
                              recordGroups: RecordGroupDictionary,
                              converter: AlignmentRecordConverter): T

  /**
   * Builds a formatter from the sequences and record groups of an RDD.
   *
   * @param gRdd The RDD of alignment records to build a formatter for.
   * @return Returns a formatter carrying a header created from this RDD.
   */
  def apply(gRdd: AlignmentRecordRDD): T = {
    val recordConverter = new AlignmentRecordConverter

    // The header is always marked as coordinate sorted.
    // NOTE(review): nothing here checks that gRdd is actually sorted -- confirm
    // that callers expect this.
    val samHeader = recordConverter.createSAMHeader(gRdd.sequences, gRdd.recordGroups)
    samHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate)

    makeFormatter(new SAMFileHeaderWritable(samHeader), gRdd.recordGroups, recordConverter)
  }
}
|
||
/**
 * Base for in-formatters that convert alignment records and write them
 * through a SAMFileWriter.
 *
 * @tparam T The concrete formatter type.
 */
private[read] trait AnySAMInFormatter[T <: AnySAMInFormatter[T]]
    extends InFormatter[AlignmentRecord, AlignmentRecordRDD, T] {

  /** The header passed to the converter for every record. */
  val header: SAMFileHeaderWritable

  /** The record groups passed to the converter for every record. */
  val recordGroups: RecordGroupDictionary

  /** Converts each alignment record before it is handed to the writer. */
  val converter: AlignmentRecordConverter

  /**
   * Builds the writer that records are added to.
   *
   * @param os The output stream the writer should be connected to.
   * @return Returns a SAMFileWriter for that stream.
   */
  protected def makeWriter(os: OutputStream): SAMFileWriter

  /**
   * Converts alignment records and writes them to an output stream.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of records to write.
   */
  def write(os: OutputStream, iter: Iterator[AlignmentRecord]): Unit = {
    val samWriter = makeWriter(os)

    // convert and emit each record in iteration order
    for (record <- iter) {
      samWriter.addAlignment(converter.convert(record, header, recordGroups))
    }

    // the writer must be closed, else the stream may be defective
    samWriter.close()
  }
}
Oops, something went wrong.