[ADAM-1838] Make populating variant.annotation field in Genotype configurable.

Resolves bigdatagenomics#1838. Modifies the behavior of bigdatagenomics#1771, which disabled populating the
`variant.annotation` field in the `Genotype` record. Now, this field is not
populated by default. To enable populating it, a user can set the property
`org.bdgenomics.adam.converters.VariantContextConverter.NEST_ANN_IN_GENOTYPES`
to true.
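
For example, a user could opt in before loading a VCF. A minimal sketch (the `SparkContext` named `sc` and the VCF path are assumed for illustration):

```scala
import org.bdgenomics.adam.converters.VariantContextConverter
import org.bdgenomics.adam.rdd.ADAMContext._

// Opt in to populating genotype.variant.annotation during conversion;
// this writes the NEST_ANN_IN_GENOTYPES property into the Hadoop configuration.
VariantContextConverter.setNestAnnotationInGenotypesProperty(
  sc.hadoopConfiguration, populateNestedAnn = true)

// Subsequent loads read the flag back from the configuration.
val variantContexts = sc.loadVcf("sample.vcf")
```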
fnothaft committed Dec 19, 2017
1 parent 66c53e3 commit 97be371
Showing 9 changed files with 119 additions and 53 deletions.

VariantContextConverter.scala:

@@ -40,6 +40,7 @@ import htsjdk.variant.vcf.{
   VCFInfoHeaderLine
 }
 import java.util.Collections
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.utils.misc.{ Logging, MathUtils }
 import org.bdgenomics.adam.models.{
   SequenceDictionary,
@@ -62,6 +63,36 @@ import scala.collection.mutable.{ Buffer, HashMap }
  */
 object VariantContextConverter {
 
+  /**
+   * If set to true, this property will ensure that the variant.annotation field
+   * in the Genotype record is populated after conversion from an htsjdk
+   * VariantContext. By default, this property is false.
+   */
+  val nestAnnotationInGenotypesProperty = "org.bdgenomics.adam.converters.VariantContextConverter.NEST_ANN_IN_GENOTYPES"
+
+  /**
+   * Sets the value of the nest annotation in genotypes property.
+   *
+   * @param conf Hadoop configuration to set the property in.
+   * @param populateNestedAnn If true, the nested field is populated.
+   */
+  def setNestAnnotationInGenotypesProperty(conf: Configuration,
+                                           populateNestedAnn: Boolean) {
+    conf.setBoolean(nestAnnotationInGenotypesProperty, populateNestedAnn)
+  }
+
+  /**
+   * Gets the value of the nest annotation in genotypes property.
+   *
+   * @param conf Hadoop configuration to read the property from.
+   * @return Returns whether or not to nest the variant annotation under each
+   *   genotype record.
+   */
+  private[adam] def getNestAnnotationInGenotypesProperty(
+    conf: Configuration): Boolean = {
+    conf.getBoolean(nestAnnotationInGenotypesProperty, false)
+  }
+
   /**
    * Representation for an unknown non-ref/symbolic allele in VCF.
    */
@@ -238,6 +269,14 @@ object VariantContextConverter {
     header.getInfoHeaderLines ++
       header.getOtherHeaderLines).toSeq
   }
+
+  def apply(headerLines: Seq[VCFHeaderLine],
+            stringency: ValidationStringency,
+            conf: Configuration): VariantContextConverter = {
+    new VariantContextConverter(headerLines,
+      stringency,
+      getNestAnnotationInGenotypesProperty(conf))
+  }
 }
 
 /**
@@ -252,7 +291,8 @@
  */
 class VariantContextConverter(
     headerLines: Seq[VCFHeaderLine],
-    stringency: ValidationStringency) extends Serializable with Logging {
+    stringency: ValidationStringency,
+    setNestedAnnotationInGenotype: Boolean) extends Serializable with Logging {
   import VariantContextConverter._
 
   // format fns gatk --> bdg, extract fns bdg --> gatk
@@ -277,6 +317,15 @@
    */
   private def jDouble(f: Double): java.lang.Double = f
 
+  private def genotypeVariant(coreVariant: Variant,
+                              fullVariant: Variant): Variant = {
+    if (setNestedAnnotationInGenotype) {
+      fullVariant
+    } else {
+      coreVariant
+    }
+  }
+
   /**
    * Converts a GATK variant context into one or more ADAM variant context(s).
    *
@@ -290,8 +339,9 @@
     vc.getAlternateAlleles.toList match {
       case List(NON_REF_ALLELE) | List() => {
         val (coreVariant, variant) = variantFormatFn(vc, None, 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, NON_REF_ALLELE, 0, Some(1), false)
+          genotypeFormatFn(g, v, NON_REF_ALLELE, 0, Some(1), false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -301,8 +351,9 @@
           "Assertion failed when converting: " + vc.toString
         )
         val (coreVariant, variant) = variantFormatFn(vc, Some(allele.getDisplayString), 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, allele, 1, None, false)
+          genotypeFormatFn(g, v, allele, 1, None, false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -312,8 +363,9 @@
           "Assertion failed when converting: " + vc.toString
         )
         val (coreVariant, variant) = variantFormatFn(vc, Some(allele.getDisplayString), 0, false)
+        val v = genotypeVariant(coreVariant, variant)
         val genotypes = vc.getGenotypes.map(g => {
-          genotypeFormatFn(g, coreVariant, allele, 1, Some(2), false)
+          genotypeFormatFn(g, v, allele, 1, Some(2), false)
         })
         return Seq(ADAMVariantContext(variant, genotypes))
       }
@@ -344,8 +396,9 @@
             Some(allele.getDisplayString),
             variantIdx,
             true)
+          val v = genotypeVariant(coreVariant, variant)
           val genotypes = vc.getGenotypes.map(g => {
-            genotypeFormatFn(g, coreVariant, allele, idx, referenceModelIndex, true)
+            genotypeFormatFn(g, v, allele, idx, referenceModelIndex, true)
           })
           ADAMVariantContext(variant, genotypes)
         })

ADAMContext.scala:

@@ -2046,7 +2046,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers, stringency)
+    val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,
@@ -2097,7 +2097,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers.flatMap(hl => hl match {
+    val vcc = VariantContextConverter(headers.flatMap(hl => hl match {
       case il: VCFInfoHeaderLine => {
         if (infoFields(il.getID)) {
           Some(il)
@@ -2113,7 +2113,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
         }
       }
       case _ => None
-    }), stringency)
+    }), stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,
@@ -2159,7 +2159,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Logging {
     // load vcf metadata
     val (sd, samples, headers) = loadVcfMetadata(pathName)
 
-    val vcc = new VariantContextConverter(headers, stringency)
+    val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration)
     VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)),
       sd,
       samples,

VCFInFormatter.scala:

@@ -24,6 +24,7 @@ import htsjdk.variant.variantcontext.writer.{
 }
 import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine }
 import java.io.OutputStream
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.adam.converters.VariantContextConverter
 import org.bdgenomics.adam.models.{
   SequenceDictionary,
@@ -47,20 +48,23 @@ object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextRDD, VCFInFormatter] {
   def apply(gRdd: VariantContextRDD): VCFInFormatter = {
     VCFInFormatter(gRdd.sequences,
       gRdd.samples.map(_.getSampleId),
-      gRdd.headerLines)
+      gRdd.headerLines,
+      gRdd.rdd.context.hadoopConfiguration)
   }
 }
 
 case class VCFInFormatter private (
   sequences: SequenceDictionary,
   samples: Seq[String],
-  headerLines: Seq[VCFHeaderLine]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
+  headerLines: Seq[VCFHeaderLine],
+  @transient val conf: Configuration) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
 
   protected val companion = VCFInFormatter
 
   // make a converter
-  val converter = new VariantContextConverter(headerLines,
-    ValidationStringency.LENIENT)
+  val converter = VariantContextConverter(headerLines,
+    ValidationStringency.LENIENT,
+    conf)
 
   /**
    * Writes variant contexts to an output stream in VCF format.

VCFOutFormatter.scala:

@@ -28,6 +28,7 @@ import htsjdk.tribble.readers.{
   AsciiLineReaderIterator
 }
 import java.io.InputStream
+import org.apache.hadoop.conf.Configuration
 import org.bdgenomics.adam.converters.VariantContextConverter._
 import org.bdgenomics.adam.converters.VariantContextConverter
 import org.bdgenomics.adam.models.VariantContext
@@ -39,7 +40,10 @@ import scala.collection.mutable.ListBuffer
 /**
  * OutFormatter that reads streaming VCF.
  */
-case class VCFOutFormatter() extends OutFormatter[VariantContext] with Logging {
+case class VCFOutFormatter(
+    @transient conf: Configuration) extends OutFormatter[VariantContext] with Logging {
 
+  private val nestAnn = VariantContextConverter.getNestAnnotationInGenotypesProperty(conf)
+
 /**
  * Reads VariantContexts from an input stream. Autodetects VCF format.
@@ -62,7 +66,9 @@ case class VCFOutFormatter() extends OutFormatter[VariantContext] with Logging {
     val lines = cleanAndMixInSupportedLines(headerLines(header), ValidationStringency.LENIENT, log)
 
     // make converter
-    val converter = new VariantContextConverter(lines, ValidationStringency.LENIENT)
+    val converter = new VariantContextConverter(lines,
+      ValidationStringency.LENIENT,
+      nestAnn)
 
     @tailrec def convertIterator(iter: AsciiLineReaderIterator,
                                  records: ListBuffer[VariantContext] = ListBuffer.empty): Iterator[VariantContext] = {

VariantContextRDD.scala:

@@ -280,7 +280,9 @@ case class VariantContextRDD(rdd: RDD[VariantContext],
     val sampleIds = samples.map(_.getSampleId)
 
     // convert the variants to htsjdk VCs
-    val converter = new VariantContextConverter(headerLines, stringency)
+    val converter = VariantContextConverter(headerLines,
+      stringency,
+      rdd.context.hadoopConfiguration)
     val writableVCs: RDD[(LongWritable, VariantContextWritable)] = rdd.flatMap(vc => {
       converter.convert(vc)
         .map(htsjdkVc => {
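Since `VCFOutFormatter` now takes a Hadoop `Configuration`, callers that previously wrote `VCFOutFormatter()` must pass one in, which lets the formatter honor `NEST_ANN_IN_GENOTYPES` when converting streamed records. A minimal sketch (again assuming an existing `SparkContext` named `sc`):

```scala
import org.bdgenomics.adam.rdd.variant.VCFOutFormatter

// The formatter reads the nesting flag from the configuration it is given,
// e.g. when used as the implicit OutFormatter for ADAM's pipe API.
implicit val outFormatter = VCFOutFormatter(sc.hadoopConfiguration)
```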
