[ADAM-1838] Make populating variant.annotation field in Genotype configurable.

Resolves bigdatagenomics#1838. Modifies the behavior of bigdatagenomics#1771, which disabled populating the
`variant.annotation` field in the `Genotype` record. Now, this field is not
populated by default. To enable populating it, a user can set the property
`org.bdgenomics.adam.converters.VariantContextConverter.NEST_ANN_IN_GENOTYPES`
to true.
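
For example, a user could opt in before loading a VCF. A minimal sketch (the `SparkContext` named `sc` and the VCF path are assumed for illustration):

```scala
import org.bdgenomics.adam.converters.VariantContextConverter
import org.bdgenomics.adam.rdd.ADAMContext._

// Opt in to populating genotype.variant.annotation during conversion;
// this writes the NEST_ANN_IN_GENOTYPES property into the Hadoop configuration.
VariantContextConverter.setNestAnnotationInGenotypesProperty(
  sc.hadoopConfiguration, populateNestedAnn = true)

// Subsequent loads read the flag back from the configuration.
val variantContexts = sc.loadVcf("sample.vcf")
```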
fnothaft committed Dec 19, 2017
1 parent 66c53e3 commit 97be371
Showing 9 changed files with 119 additions and 53 deletions.

VariantContextConverter.scala:

@@ -40,6 +40,7 @@ import htsjdk.variant.vcf.{
   VCFInfoHeaderLine
 }
 import java.util.Collections
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.utils.misc.{ Logging, MathUtils }
 import org.bdgenomics.adam.models.{
   SequenceDictionary,
@@ -62,6 +63,36 @@ import scala.collection.mutable.{ Buffer, HashMap }
  */
 object VariantContextConverter {
 
+  /**
+   * If set to true, this property will ensure that the variant.annotation field
+   * in the Genotype record is populated after conversion from an htsjdk
+   * VariantContext. By default, this property is false.
+   */
+  val nestAnnotationInGenotypesProperty = "org.bdgenomics.adam.converters.VariantContextConverter.NEST_ANN_IN_GENOTYPES"
+
+  /**
+   * Sets the value of the nest annotation in genotypes property.
+   *
+   * @param conf Hadoop configuration to set the property in.
+   * @param populateNestedAnn If true, the nested field is populated.
+   */
+  def setNestAnnotationInGenotypesProperty(conf: Configuration,
+                                           populateNestedAnn: Boolean) {
+    conf.setBoolean(nestAnnotationInGenotypesProperty, populateNestedAnn)
+  }
+
+  /**
+   * Gets the value of the nest annotation in genotypes property.
+   *
+   * @param conf Hadoop configuration to read the property from.
+   * @return Returns whether or not to nest the variant annotation under each
+   *   genotype record.
+   */
+  private[adam] def getNestAnnotationInGenotypesProperty(
+    conf: Configuration): Boolean = {
+    conf.getBoolean(nestAnnotationInGenotypesProperty, false)
+  }
+
   /**
    * Representation for an unknown non-ref/symbolic allele in VCF.
    */
@@ -238,6 +269,14 @@ object VariantContextConverter {
     header.getInfoHeaderLines ++
       header.getOtherHeaderLines).toSeq
   }
+
+  def apply(headerLines: Seq[VCFHeaderLine],
+            stringency: ValidationStringency,
+            conf: Configuration): VariantContextConverter = {
+    new VariantContextConverter(headerLines,
+      stringency,
+      getNestAnnotationInGenotypesProperty(conf))
+  }
 }
 
 /**
@@ -252,7 +291,8 @@
  */
 class VariantContextConverter(
     headerLines: Seq[VCFHeaderLine],
-    stringency: ValidationStringency) extends Serializable with Logging {
+    stringency: ValidationStringency,
+    setNestedAnnotationInGenotype: Boolean) extends Serializable with Logging {
   import VariantContextConverter._
 
   // format fns gatk --> bdg, extract fns bdg --> gatk
@@ -277,6 +317,15 @@
    */
   private def jDouble(f: Double): java.lang.Double = f
 
+  private def genotypeVariant(coreVariant: Variant,
+                              fullVariant: Variant): Variant = {
+    if (setNestedAnnotationInGenotype) {
+      fullVariant
+    } else {
+      coreVariant
+    }
+  }
+
   /**
    * Converts a GATK variant context into one or more ADAM variant context(s).
    *
@@ -290,8 +339,9 @@
     vc.getAlternateAlleles.toList match {
       case List(NON_REF_ALLELE) | List() => {
         val (coreVariant, variant) = variantFormatFn(vc, None, 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, NON_REF_ALLELE, 0, Some(1), false)
+          genotypeFormatFn(g, v, NON_REF_ALLELE, 0, Some(1), false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -301,8 +351,9 @@
           "Assertion failed when converting: " + vc.toString
         )
         val (coreVariant, variant) = variantFormatFn(vc, Some(allele.getDisplayString), 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, allele, 1, None, false)
+          genotypeFormatFn(g, v, allele, 1, None, false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -312,8 +363,9 @@
           "Assertion failed when converting: " + vc.toString
         )
         val (coreVariant, variant) = variantFormatFn(vc, Some(allele.getDisplayString), 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, allele, 1, Some(2), false)
+          genotypeFormatFn(g, v, allele, 1, Some(2), false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -344,8 +396,9 @@
             Some(allele.getDisplayString),
             variantIdx,
             true)
+          val v = genotypeVariant(coreVariant, variant)
           val genotypes = vc.getGenotypes.map(g => {
-            genotypeFormatFn(g, coreVariant, allele, idx, referenceModelIndex, true)
+            genotypeFormatFn(g, v, allele, idx, referenceModelIndex, true)
           })
           ADAMVariantContext(variant, genotypes)
         })

ADAMContext.scala:

@@ -2046,7 +2046,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers, stringency)
+    val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,
@@ -2097,7 +2097,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers.flatMap(hl => hl match {
+    val vcc = VariantContextConverter(headers.flatMap(hl => hl match {
       case il: VCFInfoHeaderLine => {
         if (infoFields(il.getID)) {
           Some(il)
@@ -2113,7 +2113,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
         }
       }
       case _ => None
-    }), stringency)
+    }), stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,
@@ -2159,7 +2159,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers, stringency)
+    val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,

VCFInFormatter.scala:

@@ -24,6 +24,7 @@ import htsjdk.variant.variantcontext.writer.{
 }
 import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine }
 import java.io.OutputStream
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.adam.converters.VariantContextConverter
 import org.bdgenomics.adam.models.{
   SequenceDictionary,
@@ -47,20 +48,23 @@ object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextRDD, VCFInFormatter] {
   def apply(gRdd: VariantContextRDD): VCFInFormatter = {
     VCFInFormatter(gRdd.sequences,
       gRdd.samples.map(_.getSampleId),
-      gRdd.headerLines)
+      gRdd.headerLines,
+      gRdd.rdd.context.hadoopConfiguration)
   }
 }
 
 case class VCFInFormatter private (
   sequences: SequenceDictionary,
   samples: Seq[String],
-  headerLines: Seq[VCFHeaderLine]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
+  headerLines: Seq[VCFHeaderLine],
+  @transient val conf: Configuration) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
 
   protected val companion = VCFInFormatter
 
   // make a converter
-  val converter = new VariantContextConverter(headerLines,
-    ValidationStringency.LENIENT)
+  val converter = VariantContextConverter(headerLines,
+    ValidationStringency.LENIENT,
+    conf)
 
   /**
    * Writes variant contexts to an output stream in VCF format.

VCFOutFormatter.scala:

@@ -28,6 +28,7 @@ import htsjdk.tribble.readers.{
   AsciiLineReaderIterator
 }
 import java.io.InputStream
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.adam.converters.VariantContextConverter._
 import org.bdgenomics.adam.converters.VariantContextConverter
 import org.bdgenomics.adam.models.VariantContext
@@ -39,7 +40,10 @@ import scala.collection.mutable.ListBuffer
 /**
  * OutFormatter that reads streaming VCF.
  */
-case class VCFOutFormatter() extends OutFormatter[VariantContext] with Logging {
+case class VCFOutFormatter(
+    @transient conf: Configuration) extends OutFormatter[VariantContext] with Logging {
 
+  private val nestAnn = VariantContextConverter.getNestAnnotationInGenotypesProperty(conf)
+
 /**
  * Reads VariantContexts from an input stream. Autodetects VCF format.
@@ -62,7 +66,9 @@ case class VCFOutFormatter() extends OutFormatter[VariantContext] with Logging {
     val lines = cleanAndMixInSupportedLines(headerLines(header), ValidationStringency.LENIENT, log)
 
     // make converter
-    val converter = new VariantContextConverter(lines, ValidationStringency.LENIENT)
+    val converter = new VariantContextConverter(lines,
+      ValidationStringency.LENIENT,
+      nestAnn)
 
     @tailrec def convertIterator(iter: AsciiLineReaderIterator,
                                  records: ListBuffer[VariantContext] = ListBuffer.empty): Iterator[VariantContext] = {

VariantContextRDD.scala:

@@ -280,7 +280,9 @@ case class VariantContextRDD(rdd: RDD[VariantContext],
     val sampleIds = samples.map(_.getSampleId)
 
     // convert the variants to htsjdk VCs
-    val converter = new VariantContextConverter(headerLines, stringency)
+    val converter = VariantContextConverter(headerLines,
+      stringency,
+      rdd.context.hadoopConfiguration)
     val writableVCs: RDD[(LongWritable, VariantContextWritable)] = rdd.flatMap(vc => {
       converter.convert(vc)
         .map(htsjdkVc => {
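Since `VCFOutFormatter` now takes a Hadoop `Configuration`, callers that previously wrote `VCFOutFormatter()` must pass one in, which lets the formatter honor `NEST_ANN_IN_GENOTYPES` when converting streamed records. A minimal sketch (again assuming an existing `SparkContext` named `sc`):

```scala
import org.bdgenomics.adam.rdd.variant.VCFOutFormatter

// The formatter reads the nesting flag from the configuration it is given,
// e.g. when used as the implicit OutFormatter for ADAM's pipe API.
implicit val outFormatter = VCFOutFormatter(sc.hadoopConfiguration)
```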
