612 taxon id (#1513)

* changed server, need to update tests * changed all references to taxonId * client tests not finding species * Ontology term updates * constraints not working * code changed, need doc and compliance changes * updated docs * fixed json decoding error * Move client requirement to master
ga4gh · Feb 2, 2017 · f7a9990 · f7a9990
1 parent c14ccad
commit f7a9990
Show file tree

Hide file tree

Showing 11 changed files with 117 additions and 1,235 deletions.
diff --git a/docs/datarepo.rst b/docs/datarepo.rst
@@ -136,7 +136,7 @@ Adds a reference set derived from a FASTA file to a repository. Each
 record in the FASTA file will correspond to a Reference in the new
 ReferenceSet. The input FASTA file must be compressed with ``bgzip``
 and indexed using ``samtools faidx``. Each ReferenceSet contains a
-number of metadata values (.e.g. ``ncbiTaxonId``) which can be set
+number of metadata values (e.g. ``species``) which can be set
 using command line options.
 
 .. argparse::
@@ -152,11 +152,12 @@ using command line options.
 
     $ ga4gh_repo add-referenceset registry.db hs37d5.fa.gz \
         --description "NCBI37 assembly of the human genome" \
-        --ncbiTaxonId 9606 --name NCBI37 \
+        --species '{"id": "9606", "term": "Homo sapiens", "source_name": "NCBI", "source_version": "1.0"}' \
+        --name NCBI37 \
         --sourceUri ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
 
 Adds a reference set used in the 1000 Genomes project using the name
-``NCBI37``, also setting the ``ncbiTaxonId`` to 9606 (human).
+``NCBI37``, also setting the ``species`` to the NCBI taxon 9606 (human).
 
 -------------
 add-biosample

diff --git a/docs/demo.rst b/docs/demo.rst
@@ -273,7 +273,9 @@ around 3GB. Next, we will add the reference set.
 .. code-block:: bash
 
     $ ga4gh_repo add-referenceset registry.db /full/path/to/hs37d5.fa.gz \
-      -d "NCBI37 assembly of the human genome" --ncbiTaxonId 9606 --name NCBI37 \
+      -d "NCBI37 assembly of the human genome" \
+      --species '{"id": "9606", "term": "Homo sapiens", "source_name": "NCBI", "source_version": "1.0"}' \
+      --name NCBI37 \
       --sourceUri "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
 
 A number of optional command line flags have been added. We will be

diff --git a/ga4gh/server/cli/repomanager.py b/ga4gh/server/cli/repomanager.py
@@ -172,7 +172,8 @@ def addReferenceSet(self):
         referenceSet = references.HtslibReferenceSet(name)
         referenceSet.populateFromFile(filePath)
         referenceSet.setDescription(self._args.description)
-        referenceSet.setNcbiTaxonId(self._args.ncbiTaxonId)
+        if self._args.species is not None:
+            referenceSet.setSpeciesFromJson(self._args.species)
         referenceSet.setIsDerived(self._args.isDerived)
         referenceSet.setAssemblyId(self._args.assemblyId)
         sourceAccessions = []
@@ -766,7 +767,8 @@ def getParser(cls):
         cls.addNameOption(addReferenceSetParser, objectType)
         cls.addDescriptionOption(addReferenceSetParser, objectType)
         addReferenceSetParser.add_argument(
-            "--ncbiTaxonId", default=None, help="The NCBI Taxon Id")
+            "--species", default=None,
+            help="The species ontology term as a JSON string")
         addReferenceSetParser.add_argument(
             "--isDerived", default=False, type=bool,
             help="Indicates if this reference set is derived from another")

diff --git a/ga4gh/server/datamodel/references.py b/ga4gh/server/datamodel/references.py
@@ -42,7 +42,7 @@ def __init__(self, localId):
         self._assemblyId = None
         self._description = None
         self._isDerived = False
-        self._ncbiTaxonId = None
+        self._species = None
         self._sourceAccessions = []
         self._sourceUri = None
 
@@ -61,12 +61,18 @@ def setDescription(self, description):
         """
         self._description = description
 
-    def setNcbiTaxonId(self, ncbiTaxonId):
+    def setSpeciesFromJson(self, speciesJson):
         """
-        Sets the ncbiTaxonId to the specified value. See the documentation
-        for getNcbiTaxonId for details of this field.
+        Sets the species, an OntologyTerm, to the specified value, given as
+        a JSON string.
+
+        See the documentation for details of this field.
         """
-        self._ncbiTaxonId = ncbiTaxonId
+        try:
+            parsed = protocol.fromJson(speciesJson, protocol.OntologyTerm)
+        except:
+            raise exceptions.InvalidJsonException(speciesJson)
+        self._species = protocol.toJsonDict(parsed)
 
     def setIsDerived(self, isDerived):
         """
@@ -176,17 +182,21 @@ def getSourceUri(self):
         """
         return self._sourceUri
 
-    def getNcbiTaxonId(self):
+    def getSpecies(self):
         """
-        Returns the NCBI Taxon ID for this reference set. This is the
-        ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human)
-        indicating the species which this assembly is intended to model.
+        Returns the species for this reference set. This is the
+        ontology term with data from
+        www.obofoundry.org/ontology/ncbitaxon.html
+        (e.g. 9606 for human)
         Note that contained `Reference`s may specify a different
-        `ncbiTaxonId`, as assemblies may contain reference sequences
-        which do not belong to the modeled species, e.g.  EBV in a
+        species, as assemblies may contain reference sequences
+        which do not belong to the modeled species, e.g. EBV in a
         human reference genome.
         """
-        return self._ncbiTaxonId
+        if self._species is not {}:
+            return self._species
+        else:
+            return None
 
     def toProtocolElement(self):
         """
@@ -198,7 +208,13 @@ def toProtocolElement(self):
         ret.id = self.getId()
         ret.is_derived = self.getIsDerived()
         ret.md5checksum = self.getMd5Checksum()
-        ret.ncbi_taxon_id = pb.int(self.getNcbiTaxonId())
+        if self.getSpecies():
+            term = protocol.fromJson(
+                json.dumps(self.getSpecies()), protocol.OntologyTerm)
+            ret.species.id = term.id
+            ret.species.term = term.term
+            ret.species.source_name = term.source_name
+            ret.species.source_version = term.source_version
         ret.source_accessions.extend(self.getSourceAccessions())
         ret.source_uri = pb.string(self.getSourceUri())
         ret.name = self.getLocalId()
@@ -222,20 +238,26 @@ def __init__(self, parentContainer, localId):
         self._sourceAccessions = []
         self._isDerived = False
         self._sourceDivergence = pb.DEFAULT_INT
-        self._ncbiTaxonId = pb.DEFAULT_INT
+        self._species = None
 
     def setMd5checksum(self, md5checksum):
         """
         Sets the md5checksum to the specified value.
         """
         self._md5checksum = md5checksum
 
-    def setNcbiTaxonId(self, ncbiTaxonId):
+    def setSpeciesFromJson(self, speciesJson):
         """
-        Sets the ncbiTaxonId to the specified value. See the documentation
-        for getNcbiTaxonId for details of this field.
+        Sets the species, an OntologyTerm, to the specified value, given as
+        a JSON string.
+
+        See the documentation for details of this field.
         """
-        self._ncbiTaxonId = ncbiTaxonId
+        try:
+            parsed = protocol.fromJson(speciesJson, protocol.OntologyTerm)
+        except:
+            raise exceptions.InvalidJsonException(speciesJson)
+        self._species = protocol.toJsonDict(parsed)
 
     def setSourceAccessions(self, sourceAccessions):
         """
@@ -296,17 +318,21 @@ def getSourceUri(self):
         """
         return self._sourceUri
 
-    def getNcbiTaxonId(self):
+    def getSpecies(self):
         """
-        Returns the NCBI Taxon ID for this reference. This is the
-        ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human)
-        indicating the species which this assembly is intended to model.
+        Returns the species for this reference. This is the
+        ontology term with data from
+        www.obofoundry.org/ontology/ncbitaxon.html
+        (e.g. 9606 for human)
         Note that contained `Reference`s may specify a different
-        `ncbiTaxonId`, as assemblies may contain reference sequences
-        which do not belong to the modeled species, e.g.  EBV in a
+        species, as assemblies may contain reference sequences
+        which do not belong to the modeled species, e.g. EBV in a
         human reference genome.
         """
-        return self._ncbiTaxonId
+        if self._species is not {}:
+            return self._species
+        else:
+            return None
 
     def getMd5Checksum(self):
         """
@@ -326,7 +352,13 @@ def toProtocolElement(self):
         reference.length = self.getLength()
         reference.md5checksum = self.getMd5Checksum()
         reference.name = self.getName()
-        reference.ncbi_taxon_id = self.getNcbiTaxonId()
+        if self.getSpecies():
+            term = protocol.fromJson(
+                json.dumps(self.getSpecies()), protocol.OntologyTerm)
+            reference.species.id = term.id
+            reference.species.term = term.term
+            reference.species.source_name = term.source_name
+            reference.species.source_version = term.source_version
         reference.source_accessions.extend(self.getSourceAccessions())
         reference.source_divergence = pb.int(self.getSourceDivergence())
         reference.source_uri = self.getSourceUri()
@@ -370,7 +402,9 @@ def __init__(self, localId, randomSeed=0, numReferences=1):
         self._description = "Simulated reference set"
         self._assemblyId = str(random.randint(0, 2**32))
         self._isDerived = bool(random.randint(0, 1))
-        self._ncbiTaxonId = random.randint(0, 2**16)
+        self._species = json.loads(
+                    '{"sourceName": "NCBI", "sourceVersion": "",'
+                    + '"term": "Homo sapiens", "id": "9606"}')
         self._sourceAccessions = []
         for i in range(random.randint(1, 3)):
                 self._sourceAccessions.append("sim_accession_{}".format(
@@ -402,7 +436,9 @@ def __init__(self, parentContainer, localId, randomSeed=0, length=200):
         self._sourceDivergence = 0
         if self._isDerived:
             self._sourceDivergence = rng.uniform(0, 0.1)
-        self._ncbiTaxonId = random.randint(0, 2**16)
+        self._species = json.loads(
+                            '{"sourceName": "NCBI", "sourceVersion": "",'
+                            + '"term": "Homo sapiens", "id": "9606"}')
         self._sourceAccessions = []
         for i in range(random.randint(1, 3)):
                 self._sourceAccessions.append("sim_accession_{}".format(
@@ -455,7 +491,7 @@ def populateFromRow(self, row):
         self._assemblyId = row[b'assemblyId']
         self._isDerived = bool(row[b'isDerived'])
         self._md5checksum = row[b'md5checksum']
-        self._ncbiTaxonId = row[b'ncbiTaxonId']
+        self._species = json.loads(row[b'species'])
         self._sourceAccessions = json.loads(row[b'sourceAccessions'])
         self._sourceUri = row[b'sourceUri']
 
@@ -490,7 +526,7 @@ def populateFromRow(self, row):
         self._length = row[b'length']
         self._isDerived = bool(row[b'isDerived'])
         self._md5checksum = row[b'md5checksum']
-        self._ncbiTaxonId = row[b'ncbiTaxonId']
+        self._species = json.loads(row[b'species'])
         self._sourceAccessions = json.loads(row[b'sourceAccessions'])
         self._sourceDivergence = row[b'sourceDivergence']
         self._sourceUri = row[b'sourceUri']

diff --git a/ga4gh/server/datarepo.py b/ga4gh/server/datarepo.py
@@ -705,7 +705,7 @@ def _createReferenceTable(self, cursor):
                 length INTEGER,
                 isDerived INTEGER,
                 md5checksum TEXT,
-                ncbiTaxonId INTEGER,
+                species TEXT,
                 sourceAccessions TEXT,
                 sourceDivergence REAL,
                 sourceUri TEXT,
@@ -723,15 +723,15 @@ def insertReference(self, reference):
         sql = """
             INSERT INTO Reference (
                 id, referenceSetId, name, length, isDerived, md5checksum,
-                ncbiTaxonId, sourceAccessions, sourceUri)
+                species, sourceAccessions, sourceUri)
             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
         """
         cursor = self._dbConnection.cursor()
         cursor.execute(sql, (
             reference.getId(), reference.getParentContainer().getId(),
             reference.getLocalId(), reference.getLength(),
             reference.getIsDerived(), reference.getMd5Checksum(),
-            reference.getNcbiTaxonId(),
+            json.dumps(reference.getSpecies()),
             # We store the list of sourceAccessions as a JSON string. Perhaps
             # this should be another table?
             json.dumps(reference.getSourceAccessions()),
@@ -756,7 +756,7 @@ def _createReferenceSetTable(self, cursor):
                 assemblyId TEXT,
                 isDerived INTEGER,
                 md5checksum TEXT,
-                ncbiTaxonId INTEGER,
+                species TEXT,
                 sourceAccessions TEXT,
                 sourceUri TEXT,
                 dataUrl TEXT NOT NULL,
@@ -772,7 +772,7 @@ def insertReferenceSet(self, referenceSet):
         sql = """
             INSERT INTO ReferenceSet (
                 id, name, description, assemblyId, isDerived, md5checksum,
-                ncbiTaxonId, sourceAccessions, sourceUri, dataUrl)
+                species, sourceAccessions, sourceUri, dataUrl)
             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
         """
         cursor = self._dbConnection.cursor()
@@ -781,7 +781,7 @@ def insertReferenceSet(self, referenceSet):
                 referenceSet.getId(), referenceSet.getLocalId(),
                 referenceSet.getDescription(), referenceSet.getAssemblyId(),
                 referenceSet.getIsDerived(), referenceSet.getMd5Checksum(),
-                referenceSet.getNcbiTaxonId(),
+                json.dumps(referenceSet.getSpecies(), protocol.OntologyTerm),
                 # We store the list of sourceAccessions as a JSON string.
                 # Perhaps this should be another table?
                 json.dumps(referenceSet.getSourceAccessions()),

diff --git a/scripts/download_example_data.py b/scripts/download_example_data.py
@@ -346,9 +346,13 @@ def createRepo(self):
         referenceSet = references.HtslibReferenceSet("GRCh37-subset")
         referenceSet.populateFromFile(self.fastaFilePath)
         referenceSet.setDescription("Subset of GRCh37 used for demonstration")
-        referenceSet.setNcbiTaxonId(9606)
+        referenceSet.setSpeciesFromJson(
+                '{"id": "9606",'
+                + '"term": "Homo sapiens", "source_name": "NCBI"}')
         for reference in referenceSet.getReferences():
-            reference.setNcbiTaxonId(9606)
+            reference.setSpeciesFromJson(
+                '{"id": "9606",'
+                + '"term": "Homo sapiens", "source_name": "NCBI"}')
             reference.setSourceAccessions(
                 self.accessions[reference.getName()] + ".subset")
         repo.insertReferenceSet(referenceSet)