Skip to content
This repository has been archived by the owner on Jan 24, 2018. It is now read-only.

Commit

Permalink
612 taxon id (#1513)
Browse files Browse the repository at this point in the history
* changed server, need to update tests

* changed all references to taxonId

* client tests not finding species

* Ontology term updates

* constraints not working

* code changed, need doc and compliance changes

* updated docs

* fixed json decoding error

* Move client requirement to master
  • Loading branch information
ejacox authored and david4096 committed Feb 2, 2017
1 parent c14ccad commit f7a9990
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 1,235 deletions.
7 changes: 4 additions & 3 deletions docs/datarepo.rst
Expand Up @@ -136,7 +136,7 @@ Adds a reference set derived from a FASTA file to a repository. Each
record in the FASTA file will correspond to a Reference in the new
ReferenceSet. The input FASTA file must be compressed with ``bgzip``
and indexed using ``samtools faidx``. Each ReferenceSet contains a
number of metadata values (.e.g. ``ncbiTaxonId``) which can be set
number of metadata values (e.g. ``species``) which can be set
using command line options.

.. argparse::
Expand All @@ -152,11 +152,12 @@ using command line options.
$ ga4gh_repo add-referenceset registry.db hs37d5.fa.gz \
--description "NCBI37 assembly of the human genome" \
--ncbiTaxonId 9606 --name NCBI37 \
--species '{"id": "9606", "term": "Homo sapiens", "source_name": "NCBI", "source_version": "1.0"}' \
--name NCBI37 \
--sourceUri ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
Adds a reference set used in the 1000 Genomes project using the name
``NCBI37``, also setting the ``ncbiTaxonId`` to 9606 (human).
``NCBI37``, also setting the ``species`` to the NCBI taxonomy term 9606 (human).

-------------
add-biosample
Expand Down
4 changes: 3 additions & 1 deletion docs/demo.rst
Expand Up @@ -273,7 +273,9 @@ around 3GB. Next, we will add the reference set.
.. code-block:: bash
$ ga4gh_repo add-referenceset registry.db /full/path/to/hs37d5.fa.gz \
-d "NCBI37 assembly of the human genome" --ncbiTaxonId 9606 --name NCBI37 \
-d "NCBI37 assembly of the human genome" \
--species '{"id": "9606", "term": "Homo sapiens", "source_name": "NCBI", "source_version": "1.0"}' \
--name NCBI37 \
--sourceUri "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
A number of optional command line flags have been added. We will be
Expand Down
6 changes: 4 additions & 2 deletions ga4gh/server/cli/repomanager.py
Expand Up @@ -172,7 +172,8 @@ def addReferenceSet(self):
referenceSet = references.HtslibReferenceSet(name)
referenceSet.populateFromFile(filePath)
referenceSet.setDescription(self._args.description)
referenceSet.setNcbiTaxonId(self._args.ncbiTaxonId)
if self._args.species is not None:
referenceSet.setSpeciesFromJson(self._args.species)
referenceSet.setIsDerived(self._args.isDerived)
referenceSet.setAssemblyId(self._args.assemblyId)
sourceAccessions = []
Expand Down Expand Up @@ -766,7 +767,8 @@ def getParser(cls):
cls.addNameOption(addReferenceSetParser, objectType)
cls.addDescriptionOption(addReferenceSetParser, objectType)
addReferenceSetParser.add_argument(
"--ncbiTaxonId", default=None, help="The NCBI Taxon Id")
"--species", default=None,
help="The species ontology term as a JSON string")
addReferenceSetParser.add_argument(
"--isDerived", default=False, type=bool,
help="Indicates if this reference set is derived from another")
Expand Down
96 changes: 66 additions & 30 deletions ga4gh/server/datamodel/references.py
Expand Up @@ -42,7 +42,7 @@ def __init__(self, localId):
self._assemblyId = None
self._description = None
self._isDerived = False
self._ncbiTaxonId = None
self._species = None
self._sourceAccessions = []
self._sourceUri = None

Expand All @@ -61,12 +61,18 @@ def setDescription(self, description):
"""
self._description = description

def setNcbiTaxonId(self, ncbiTaxonId):
def setSpeciesFromJson(self, speciesJson):
    """
    Sets the species, an OntologyTerm, to the specified value, given as
    a JSON string. The string is parsed as a protocol OntologyTerm and
    the result is stored in its JSON dictionary form.

    Raises InvalidJsonException if the string cannot be parsed as an
    OntologyTerm.
    """
    try:
        parsed = protocol.fromJson(speciesJson, protocol.OntologyTerm)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not reported as bad JSON.
        raise exceptions.InvalidJsonException(speciesJson)
    self._species = protocol.toJsonDict(parsed)

def setIsDerived(self, isDerived):
"""
Expand Down Expand Up @@ -176,17 +182,21 @@ def getSourceUri(self):
"""
return self._sourceUri

def getNcbiTaxonId(self):
def getSpecies(self):
    """
    Returns the species for this reference set, or None if no species
    has been set. This is the ontology term with data from
    www.obofoundry.org/ontology/ncbitaxon.html
    (e.g. 9606 for human)
    Note that contained `Reference`s may specify a different
    species, as assemblies may contain reference sequences
    which do not belong to the modeled species, e.g. EBV in a
    human reference genome.
    """
    # An identity test against a fresh ``{}`` literal is always true, so
    # test truthiness instead: both None and an empty dict mean "unset".
    if self._species:
        return self._species
    return None

def toProtocolElement(self):
"""
Expand All @@ -198,7 +208,13 @@ def toProtocolElement(self):
ret.id = self.getId()
ret.is_derived = self.getIsDerived()
ret.md5checksum = self.getMd5Checksum()
ret.ncbi_taxon_id = pb.int(self.getNcbiTaxonId())
if self.getSpecies():
term = protocol.fromJson(
json.dumps(self.getSpecies()), protocol.OntologyTerm)
ret.species.id = term.id
ret.species.term = term.term
ret.species.source_name = term.source_name
ret.species.source_version = term.source_version
ret.source_accessions.extend(self.getSourceAccessions())
ret.source_uri = pb.string(self.getSourceUri())
ret.name = self.getLocalId()
Expand All @@ -222,20 +238,26 @@ def __init__(self, parentContainer, localId):
self._sourceAccessions = []
self._isDerived = False
self._sourceDivergence = pb.DEFAULT_INT
self._ncbiTaxonId = pb.DEFAULT_INT
self._species = None

def setMd5checksum(self, md5checksum):
"""
Sets the md5checksum to the specified value.
"""
self._md5checksum = md5checksum

def setNcbiTaxonId(self, ncbiTaxonId):
def setSpeciesFromJson(self, speciesJson):
    """
    Sets the species, an OntologyTerm, to the specified value, given as
    a JSON string. The string is parsed as a protocol OntologyTerm and
    the result is stored in its JSON dictionary form.

    Raises InvalidJsonException if the string cannot be parsed as an
    OntologyTerm.
    """
    try:
        parsed = protocol.fromJson(speciesJson, protocol.OntologyTerm)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not reported as bad JSON.
        raise exceptions.InvalidJsonException(speciesJson)
    self._species = protocol.toJsonDict(parsed)

def setSourceAccessions(self, sourceAccessions):
"""
Expand Down Expand Up @@ -296,17 +318,21 @@ def getSourceUri(self):
"""
return self._sourceUri

def getNcbiTaxonId(self):
def getSpecies(self):
    """
    Returns the species for this reference, or None if no species has
    been set. This is the ontology term with data from
    www.obofoundry.org/ontology/ncbitaxon.html
    (e.g. 9606 for human)
    Note that a reference may specify a different species from its
    containing reference set, as assemblies may contain reference
    sequences which do not belong to the modeled species, e.g. EBV
    in a human reference genome.
    """
    # An identity test against a fresh ``{}`` literal is always true, so
    # test truthiness instead: both None and an empty dict mean "unset".
    if self._species:
        return self._species
    return None

def getMd5Checksum(self):
"""
Expand All @@ -326,7 +352,13 @@ def toProtocolElement(self):
reference.length = self.getLength()
reference.md5checksum = self.getMd5Checksum()
reference.name = self.getName()
reference.ncbi_taxon_id = self.getNcbiTaxonId()
if self.getSpecies():
term = protocol.fromJson(
json.dumps(self.getSpecies()), protocol.OntologyTerm)
reference.species.id = term.id
reference.species.term = term.term
reference.species.source_name = term.source_name
reference.species.source_version = term.source_version
reference.source_accessions.extend(self.getSourceAccessions())
reference.source_divergence = pb.int(self.getSourceDivergence())
reference.source_uri = self.getSourceUri()
Expand Down Expand Up @@ -370,7 +402,9 @@ def __init__(self, localId, randomSeed=0, numReferences=1):
self._description = "Simulated reference set"
self._assemblyId = str(random.randint(0, 2**32))
self._isDerived = bool(random.randint(0, 1))
self._ncbiTaxonId = random.randint(0, 2**16)
self._species = json.loads(
'{"sourceName": "NCBI", "sourceVersion": "",'
+ '"term": "Homo sapiens", "id": "9606"}')
self._sourceAccessions = []
for i in range(random.randint(1, 3)):
self._sourceAccessions.append("sim_accession_{}".format(
Expand Down Expand Up @@ -402,7 +436,9 @@ def __init__(self, parentContainer, localId, randomSeed=0, length=200):
self._sourceDivergence = 0
if self._isDerived:
self._sourceDivergence = rng.uniform(0, 0.1)
self._ncbiTaxonId = random.randint(0, 2**16)
self._species = json.loads(
'{"sourceName": "NCBI", "sourceVersion": "",'
+ '"term": "Homo sapiens", "id": "9606"}')
self._sourceAccessions = []
for i in range(random.randint(1, 3)):
self._sourceAccessions.append("sim_accession_{}".format(
Expand Down Expand Up @@ -455,7 +491,7 @@ def populateFromRow(self, row):
self._assemblyId = row[b'assemblyId']
self._isDerived = bool(row[b'isDerived'])
self._md5checksum = row[b'md5checksum']
self._ncbiTaxonId = row[b'ncbiTaxonId']
self._species = json.loads(row[b'species'])
self._sourceAccessions = json.loads(row[b'sourceAccessions'])
self._sourceUri = row[b'sourceUri']

Expand Down Expand Up @@ -490,7 +526,7 @@ def populateFromRow(self, row):
self._length = row[b'length']
self._isDerived = bool(row[b'isDerived'])
self._md5checksum = row[b'md5checksum']
self._ncbiTaxonId = row[b'ncbiTaxonId']
self._species = json.loads(row[b'species'])
self._sourceAccessions = json.loads(row[b'sourceAccessions'])
self._sourceDivergence = row[b'sourceDivergence']
self._sourceUri = row[b'sourceUri']
Expand Down
12 changes: 6 additions & 6 deletions ga4gh/server/datarepo.py
Expand Up @@ -705,7 +705,7 @@ def _createReferenceTable(self, cursor):
length INTEGER,
isDerived INTEGER,
md5checksum TEXT,
ncbiTaxonId INTEGER,
species TEXT,
sourceAccessions TEXT,
sourceDivergence REAL,
sourceUri TEXT,
Expand All @@ -723,15 +723,15 @@ def insertReference(self, reference):
sql = """
INSERT INTO Reference (
id, referenceSetId, name, length, isDerived, md5checksum,
ncbiTaxonId, sourceAccessions, sourceUri)
species, sourceAccessions, sourceUri)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
cursor = self._dbConnection.cursor()
cursor.execute(sql, (
reference.getId(), reference.getParentContainer().getId(),
reference.getLocalId(), reference.getLength(),
reference.getIsDerived(), reference.getMd5Checksum(),
reference.getNcbiTaxonId(),
json.dumps(reference.getSpecies()),
# We store the list of sourceAccessions as a JSON string. Perhaps
# this should be another table?
json.dumps(reference.getSourceAccessions()),
Expand All @@ -756,7 +756,7 @@ def _createReferenceSetTable(self, cursor):
assemblyId TEXT,
isDerived INTEGER,
md5checksum TEXT,
ncbiTaxonId INTEGER,
species TEXT,
sourceAccessions TEXT,
sourceUri TEXT,
dataUrl TEXT NOT NULL,
Expand All @@ -772,7 +772,7 @@ def insertReferenceSet(self, referenceSet):
sql = """
INSERT INTO ReferenceSet (
id, name, description, assemblyId, isDerived, md5checksum,
ncbiTaxonId, sourceAccessions, sourceUri, dataUrl)
species, sourceAccessions, sourceUri, dataUrl)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
cursor = self._dbConnection.cursor()
Expand All @@ -781,7 +781,7 @@ def insertReferenceSet(self, referenceSet):
referenceSet.getId(), referenceSet.getLocalId(),
referenceSet.getDescription(), referenceSet.getAssemblyId(),
referenceSet.getIsDerived(), referenceSet.getMd5Checksum(),
referenceSet.getNcbiTaxonId(),
json.dumps(referenceSet.getSpecies(), protocol.OntologyTerm),
# We store the list of sourceAccessions as a JSON string.
# Perhaps this should be another table?
json.dumps(referenceSet.getSourceAccessions()),
Expand Down
8 changes: 6 additions & 2 deletions scripts/download_example_data.py
Expand Up @@ -346,9 +346,13 @@ def createRepo(self):
referenceSet = references.HtslibReferenceSet("GRCh37-subset")
referenceSet.populateFromFile(self.fastaFilePath)
referenceSet.setDescription("Subset of GRCh37 used for demonstration")
referenceSet.setNcbiTaxonId(9606)
referenceSet.setSpeciesFromJson(
'{"id": "9606",'
+ '"term": "Homo sapiens", "source_name": "NCBI"}')
for reference in referenceSet.getReferences():
reference.setNcbiTaxonId(9606)
reference.setSpeciesFromJson(
'{"id": "9606",'
+ '"term": "Homo sapiens", "source_name": "NCBI"}')
reference.setSourceAccessions(
self.accessions[reference.getName()] + ".subset")
repo.insertReferenceSet(referenceSet)
Expand Down

0 comments on commit f7a9990

Please sign in to comment.