Skip to content

Commit

Permalink
[ADAM-538] Add support for an adam-python API.
Browse files Browse the repository at this point in the history
Resolves bigdatagenomics#538. Adds support for Python APIs that use the ADAM Java API to make
the ADAMContext and RDD functions accessible natively through Python.
  • Loading branch information
fnothaft committed Feb 15, 2017
1 parent a0142ec commit ad8d0e2
Show file tree
Hide file tree
Showing 23 changed files with 1,746 additions and 8 deletions.
Expand Up @@ -54,7 +54,7 @@ object ConsensusGenerator {
* present in a single aligned read back into the reference sequence where
* they are aligned.
*/
def fromReads: ConsensusGenerator = {
def fromReads(): ConsensusGenerator = {
new ConsensusGeneratorFromReads
}

Expand Down
24 changes: 21 additions & 3 deletions adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala
Expand Up @@ -17,10 +17,11 @@
*/
package org.bdgenomics.adam.models

import org.bdgenomics.adam.rich.RichVariant
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.variant.VariantRDD
import org.bdgenomics.adam.rich.DecadentRead._
import org.bdgenomics.adam.rich.RichVariant
import org.bdgenomics.utils.misc.Logging
import org.apache.spark.rdd.RDD
import scala.collection.immutable._
import scala.collection.mutable

Expand Down Expand Up @@ -87,7 +88,24 @@ object SnpTable {
*/
def apply(variants: RDD[RichVariant]): SnpTable = {
val positions = variants.map(variant => (variant.variant.getContigName,
variant.variant.getStart)).collect()
variant.variant.getStart: Long))
fromPos(positions)
}

/**
 * Builds a SNP table out of the variants wrapped by a VariantRDD.
 *
 * Each variant is reduced to a (contig name, start position) pair, and the
 * resulting pairs are handed to fromPos to assemble the table.
 *
 * @param variants The VariantRDD whose variants populate the table.
 * @return Returns a new SnpTable containing the sites of the input variants.
 */
def apply(variants: VariantRDD): SnpTable = {
  val sites = variants.rdd.map { v =>
    val contig = v.getContigName
    // The Long annotation unboxes the Avro-provided start position.
    val start: Long = v.getStart
    (contig, start)
  }
  fromPos(sites)
}

private def fromPos(rdd: RDD[(String, Long)]): SnpTable = {
val positions = rdd.collect()
val table = new mutable.HashMap[String, mutable.HashSet[Long]]
positions.foreach(tup => table.getOrElseUpdate(tup._1, { new mutable.HashSet[Long] }) += tup._2)
new SnpTable(table.mapValues(_.toSet).toMap)
Expand Down
Expand Up @@ -50,6 +50,7 @@ import org.bdgenomics.adam.rdd.feature.CoverageRDD
import org.bdgenomics.adam.rdd.read.realignment.RealignIndels
import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration
import org.bdgenomics.adam.rdd.fragment.FragmentRDD
import org.bdgenomics.adam.rdd.variant.VariantRDD
import org.bdgenomics.adam.serialization.AvroSerializer
import org.bdgenomics.adam.util.ReferenceFile
import org.bdgenomics.formats.avro._
Expand Down Expand Up @@ -664,9 +665,10 @@ case class AlignmentRecordRDD(
* @return Returns an RDD of recalibrated reads.
*/
def recalibrateBaseQualities(
knownSnps: SnpTable,
knownSnps: VariantRDD,
validationStringency: ValidationStringency): AlignmentRecordRDD = {
val bcastSnps = rdd.context.broadcast(knownSnps)
val snpTable = SnpTable(knownSnps)
val bcastSnps = rdd.context.broadcast(snpTable)
recalibrateBaseQualities(bcastSnps, validationStringency = validationStringency)
}

Expand All @@ -687,7 +689,7 @@ case class AlignmentRecordRDD(
}

/**
* Realigns indels using a concensus-based heuristic.
* Realigns indels using a consensus-based heuristic.
*
* Java friendly variant.
*
Expand Down
Expand Up @@ -299,7 +299,6 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite {
assert(readB.getQual == "B" * readB.getSequence.length)
assert(readB.getQual == readC.getQual)
}

}

sparkTest("round trip from ADAM to FASTQ and back to ADAM produces equivalent Read values") {
Expand Down
2 changes: 2 additions & 0 deletions adam-python/.gitignore
@@ -0,0 +1,2 @@
*.pyc
.cache
135 changes: 135 additions & 0 deletions adam-python/Makefile
@@ -0,0 +1,135 @@
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Multi-line usage text. It is exported to the environment so that the 'help'
# recipe below can expand it in the shell via $$help.
# NOTE(review): the example test path in this text (src/adam/test/sort/...)
# does not match the src/bdgenomics package layout used by setup.py — confirm
# and update the example if it is stale.
define help

Supported targets: prepare, develop, sdist, clean, test, and pypi.

Please note that all build targets require a virtualenv to be active.

The 'prepare' target installs ADAM's build requirements into the current virtualenv.

The 'develop' target creates an editable install of ADAM and its runtime requirements in the
current virtualenv. The install is called 'editable' because changes to the source code
immediately affect the virtualenv.

The 'clean' target undoes the effect of 'develop'.

The 'test' target runs ADAM's unit tests. Set the 'tests' variable to run a particular test, e.g.

make test tests=src/adam/test/sort/sortTest.py::SortTest::testSort

The 'pypi' target publishes the current commit of ADAM to PyPI after enforcing that the working
copy and the index are clean, and tagging it as an unstable .dev build.

endef
export help
# First target in the file, so a bare `make` prints the usage text.
help:
@printf "$$help"

# This Makefile uses bash features like printf and <()
SHELL=bash
# Interpreter and installer used by every target.
python=python2.7
pip=pip2.7
# Path(s) handed to pytest by the 'test' target.
tests=src
# Optional suffix appended to '.' in the editable install done by 'develop'.
extras=
# Version string as printed by version.py.
adam_version:=$(shell $(python) version.py)
# File name the 'sdist' target expects to find under dist/.
# NOTE(review): setup.py names the distribution 'bdgenomics.adam'; confirm the
# tarball produced by `setup.py sdist` really carries the 'adam-' prefix.
sdist_name:=adam-$(adam_version).tar.gz
# Hash of the most recent commit touching this directory. CURDIR is make's
# built-in working-directory variable; a lowercase 'pwd' make variable is not
# defined anywhere and would expand to nothing, dropping the path filter.
current_commit:=$(shell git log --pretty=oneline -n 1 -- "$(CURDIR)" | cut -f1 -d " ")
# "--DIRTY" when either the working copy or the index differs from HEAD,
# empty otherwise.
dirty:=$(shell (git diff --exit-code && git diff --cached --exit-code) > /dev/null || printf -- --DIRTY)

# ANSI colour escapes used in printf messages; 'normal' also ends the line.
green=\033[0;32m
normal=\033[0m\n
red=\033[0;31m


# Installs this package into the active virtualenv in editable mode, together
# with any extras selected through the 'extras' variable.
develop: check_venv
	$(pip) install -e .$(extras)
# Reverses 'develop'. The distribution is registered under the name declared
# in setup.py ('bdgenomics.adam'), so that is the name pip must be asked to
# uninstall. Both steps are best-effort (leading '-') so that cleaning an
# environment that was never set up does not fail.
clean_develop: check_venv
	- $(pip) uninstall -y bdgenomics.adam
	- rm -rf src/*.egg-info

# 'sdist' is a convenience alias for the versioned tarball under dist/.
sdist: dist/$(sdist_name)
# Rebuilds the source distribution. A tarball already present at the target
# path is first moved aside to <name>.old; after `setup.py sdist` has run, the
# extracted contents of the new and the old tarball are compared with cmp.
# When they are identical the .old file is moved back over the fresh one and a
# message is printed; otherwise the .old file is deleted. The trailing
# `|| true` keeps the recipe from failing when there was no backup.
# NOTE(review): presumably the backup is reinstated so that an unchanged
# tarball keeps its original bytes/timestamp — confirm the intent.
dist/$(sdist_name): check_venv
@test -f dist/$(sdist_name) && mv dist/$(sdist_name) dist/$(sdist_name).old || true
$(python) setup.py sdist
@test -f dist/$(sdist_name).old \
&& ( cmp -s <(tar -xOzf dist/$(sdist_name)) <(tar -xOzf dist/$(sdist_name).old) \
&& mv dist/$(sdist_name).old dist/$(sdist_name) \
&& printf "$(green)No significant changes to sdist, reinstating backup.$(normal)" \
|| rm dist/$(sdist_name).old ) \
|| true
# Removes the dist/ directory; failure is ignored (leading '-').
clean_sdist:
- rm -rf dist


# Runs pytest verbosely on the path(s) in the 'tests' variable and writes a
# JUnit-style XML report to target/pytest-report.xml. Requires an active
# virtualenv and an importable pytest (see check_build_reqs).
test: check_venv check_build_reqs
mkdir -p target
$(python) -m pytest -vv --junitxml target/pytest-report.xml $(tests)


# Publishes to PyPI. Requires an active virtualenv and a clean working copy.
# The inline Python computes an egg_info option: when the version exported by
# version.py parses as a pre-release it prints "--tag-build=.dev" followed by
# the BUILD_NUMBER environment variable, otherwise it prints an empty string.
# The result is passed to `setup.py egg_info ... sdist bdist_egg upload`.
# NOTE(review): the inline script uses the Python 2 print statement, and
# os.getenv("BUILD_NUMBER") yields None when the variable is unset, which
# would make the string concatenation fail for pre-release versions — confirm
# that the release environment always exports BUILD_NUMBER.
pypi: check_venv check_clean_working_copy
set -x \
&& tag_build=`$(python) -c 'pass;\
from version import version as v;\
from pkg_resources import parse_version as pv;\
import os;\
print "--tag-build=.dev" + os.getenv("BUILD_NUMBER") if pv(v).is_prerelease else ""'` \
&& $(python) setup.py egg_info $$tag_build sdist bdist_egg upload
# Removes the build/ directory; failure is ignored (leading '-').
clean_pypi:
- rm -rf build/


# Aggregate clean-up: runs clean_develop and clean_pypi. clean_sdist is not a
# prerequisite, so dist/ is left in place.
clean: clean_develop clean_pypi


# Fails with a red hint pointing at 'make prepare' when pytest cannot be
# imported by the configured interpreter.
check_build_reqs:
@$(python) -c 'import pytest' \
|| ( printf "$(red)Build requirements are missing. Run 'make prepare' to install them.$(normal)" ; false )


# Installs the pinned pytest release into the active virtualenv.
prepare: check_venv
$(pip) install pytest==2.8.3


# Fails with a red message unless the interpreter exposes sys.real_prefix.
# NOTE(review): sys.real_prefix is set by the 'virtualenv' tool; environments
# created by other means may not define it — confirm if 'python' is ever
# pointed at a different interpreter.
check_venv:
@$(python) -c 'import sys; sys.exit( int( not hasattr(sys, "real_prefix") ) )' \
|| ( printf "$(red)A virtualenv must be active.$(normal)" ; false )


# Guards release targets. Fails, with a red explanation, when any of the
# following holds:
#   1. the working copy differs from the index,
#   2. the index differs from HEAD,
#   3. there are untracked files that are not ignored (these are listed).
check_clean_working_copy:
	@printf "$(green)Checking if your working copy is clean ...$(normal)"
	@git diff --exit-code > /dev/null \
		|| ( printf "$(red)Your working copy looks dirty.$(normal)" ; false )
	@git diff --cached --exit-code > /dev/null \
		|| ( printf "$(red)Your index looks dirty.$(normal)" ; false )
	@test -z "$$(git ls-files --other --exclude-standard --directory)" \
		|| ( printf "$(red)You have untracked files:$(normal)" \
			; git ls-files --other --exclude-standard --directory \
			; false )

# None of these targets produce a file of the same name, so they are declared
# phony and always run when requested.
.PHONY: help \
prepare \
develop clean_develop \
sdist clean_sdist \
test \
pypi clean_pypi \
clean \
check_venv \
check_clean_working_copy \
check_build_reqs
55 changes: 55 additions & 0 deletions adam-python/pom.xml
@@ -0,0 +1,55 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-parent_2.10</artifactId>
<version>0.21.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<artifactId>adam-python_2.10</artifactId>
<packaging>jar</packaging>
<name>ADAM_${scala.version.prefix}: Python APIs</name>
<properties>
<timestamp>${maven.build.timestamp}</timestamp>
<maven.build.timestamp.format>yyyy-MM-dd</maven.build.timestamp.format>
</properties>

<!-- Delegates the Python build to the Makefile in this module by invoking
     make through the exec-maven-plugin. -->
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<!-- Runs `make develop` during the process-resources phase. -->
<execution>
<id>dev-python</id>
<phase>process-resources</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>make</executable>
<arguments>
<argument>develop</argument>
</arguments>
</configuration>
</execution>
<!-- Runs `make test` during the process-test-resources phase.
     NOTE(review): this is bound to process-test-resources rather than the
     test phase; confirm that the Python tests are meant to run even when
     the build stops before `test`. -->
<execution>
<id>test-python</id>
<phase>process-test-resources</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>make</executable>
<arguments>
<argument>test</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
32 changes: 32 additions & 0 deletions adam-python/setup.py
@@ -0,0 +1,32 @@
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from setuptools import find_packages, setup
from version import version as adam_version

# Distribution metadata for the ADAM Python bindings.
setup(
    # Layout: importable packages are rooted at src/. Packages are discovered
    # automatically, skipping anything matching the '*.test.*' pattern.
    package_dir={'': 'src'},
    packages=find_packages(where='src', exclude=['*.test.*']),

    # No runtime requirements are declared.
    install_requires=[],

    # Identity; the version is the one imported from version.py.
    name='bdgenomics.adam',
    version=adam_version,
    description='A fast, scalable genome analysis system',
    url="https://github.com/bdgenomics/adam",

    # Maintainer contact.
    author='Frank Austin Nothaft',
    author_email='fnothaft@berkeley.edu',
)
17 changes: 17 additions & 0 deletions adam-python/src/bdgenomics/__init__.py
@@ -0,0 +1,17 @@
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
1 change: 1 addition & 0 deletions adam-python/src/bdgenomics/adam/.gitignore
@@ -0,0 +1 @@
schemas.py
17 changes: 17 additions & 0 deletions adam-python/src/bdgenomics/adam/__init__.py
@@ -0,0 +1,17 @@
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

0 comments on commit ad8d0e2

Please sign in to comment.