[ADAM-1018] Add support for Spark SQL Datasets.

Resolves bigdatagenomics#1018. Adds the `adam-codegen` module, which generates classes that: 1. Implement the Scala Product interface and thus can be read into a Spark SQL Dataset. 2. Have a complete constructor that is compatible with the constructor that Spark SQL expects to see when exporting a Dataset back to Scala. 3. Have methods for converting to/from the bdg-formats Avro models. We then build these model classes in the `org.bdgenomics.adam.sql` package and use them for export from the Avro-based GenomicRDDs. With a Dataset, we can then export to a DataFrame, which enables us to expose data through Python via RDD->Dataset->DataFrame. This is important since the Avro classes generated by bdg-formats can't be pickled, and thus we can't do a Java RDD to Python RDD crossing with them.
fnothaft · Jun 26, 2017 · dc7a254 · dc7a254
1 parent f3290bc
commit dc7a254
Show file tree

Hide file tree

Showing 30 changed files with 1,706 additions and 206 deletions.
diff --git a/adam-apis/pom.xml b/adam-apis/pom.xml
@@ -142,5 +142,9 @@
       <artifactId>scalatest_${scala.version.prefix}</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.version.prefix}</artifactId>
+    </dependency>
   </dependencies>
 </project>
diff --git a/adam-cli/pom.xml b/adam-cli/pom.xml
@@ -193,5 +193,9 @@
       <artifactId>scala-guice_${scala.version.prefix}</artifactId>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.version.prefix}</artifactId>
+    </dependency>
   </dependencies>
 </project>
diff --git a/adam-codegen/pom.xml b/adam-codegen/pom.xml
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.bdgenomics.adam</groupId>
+    <artifactId>adam-parent_2.10</artifactId>
+    <version>0.23.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>adam-codegen_2.10</artifactId>
+  <packaging>jar</packaging>
+  <name>ADAM_${scala.version.prefix}: Avro-to-Dataset codegen utils</name>
+  <properties>
+    <timestamp>${maven.build.timestamp}</timestamp>
+    <maven.build.timestamp.format>yyyy-MM-dd</maven.build.timestamp.format>
+  </properties>
+  <build>
+    <plugins>
+      <!-- disable surefire; tests are run by the scalatest-maven-plugin configured below -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <skipTests>true</skipTests>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>add-source</id>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>add-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>src/main/scala</source>
+              </sources>
+            </configuration>
+          </execution>
+          <execution>
+            <id>add-test-source</id>
+            <phase>generate-test-sources</phase>
+            <goals>
+              <goal>add-test-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>src/test/scala</source>
+              </sources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <configuration>
+          <reportsDirectory>${project.build.directory}/scalatest-reports</reportsDirectory>
+          <junitxml>.</junitxml>
+          <filereports>ADAMTestSuite.txt</filereports>
+          <!--
+              See: http://stackoverflow.com/questions/1660441/java-flag-to-enable-extended-serialization-debugging-info
+              The second argLine option (sun.io.serialization.extendedDebugInfo) gives us more detailed debugging output for serialization errors.
+            -->
+          <argLine>-Xmx1024m -Dsun.io.serialization.extendedDebugInfo=true</argLine>
+          <stdout>F</stdout>
+        </configuration>
+        <executions>
+          <execution>
+            <id>test</id>
+            <goals>
+              <goal>test</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.scalatest</groupId>
+      <artifactId>scalatest_${scala.version.prefix}</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <profiles>
+    <profile>
+      <id>coverage</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.scoverage</groupId>
+            <artifactId>scoverage-maven-plugin</artifactId>
+            <configuration>
+              <excludedPackages>org.bdgenomics.adam.codegen</excludedPackages>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+</project>