# Towards a DOS Serializer

This notebook builds a schema and instance of a serializer for generating DataBundles suitable for loading into a Data Object Service.

First we take a version of the Data Object schema from the DOS API here: https://github.com/ga4gh/data-object-service-schemas/blob/master/openapi/data_object_service.swagger.yaml#L594

With some small modifications: https://github.com/ga4gh/data-object-service-schemas/issues/82

In [2]:
data_object_schema = {
    'type': 'object',
    'properties': {
      'id': {
        'type': 'string',
        'description': 'REQUIRED\nAn identifier unique to this Data Object.'
      },
      'name': {
        'type': 'string',
        'description': 'OPTIONAL\nA string that can be optionally used to name a Data Object.'
      },
      'size': {
        'type': 'string',
        'format': 'int64',
        'description': 'REQUIRED\nThe computed size in bytes.'
      },
      'created': {
        'type': 'string',
        'format': 'date-time',
        'description': 'REQUIRED\nTimestamp of object creation in RFC3339.'
      },
      'updated': {
        'type': 'string',
        'format': 'date-time',
        'description': 'OPTIONAL\nTimestamp of update in RFC3339, identical to create timestamp in systems\nthat do not support updates.'
      },
      'version': {
        'type': 'string',
        'description': 'OPTIONAL\nA string representing a version.'
      },
      'mime_type': {
        'type': 'string',
        'description': 'OPTIONAL\nA string providing the mime-type of the Data Object.\nFor example, \'application/json\'.'
      },
      'checksums': {
        'type': 'array',
        'items': {
              "type": "object",
              "properties": {
                "checksum": {
                  "type": "string",
                  "description": "REQUIRED\nThe hex-string encoded checksum for the Data."
                },
                "type": {
                  "type": "string",
                  "description": "OPTIONAL\nThe digest method used to create the checksum. If left unspecified md5\nwill be assumed.\n\npossible values:\nmd5                # most blob stores provide a checksum using this\nmultipart-md5      # multipart uploads provide a specialized tag in S3\nsha256\nsha512"
                }
              }
            },

        'description': 'REQUIRED\nThe checksum of the Data Object. At least one checksum must be provided.'
      },
      'urls': {
        'type': 'array',
            'items': {
            "type": "object",
            "properties": {
              "url": {
                "type": "string",
                "description": "REQUIRED\nA URL that can be used to access the file."
              },
              "system_metadata": {
                  "type": "object",
                  "additionalProperties": True,
                  "description": "OPTIONAL\nThese values are reported by the underlying object store.\nA set of key-value pairs that represent system metadata about the object."
              },
              "user_metadata": {
                  "type": "object",
                  "additionalProperties": True,
                  "description": "OPTIONAL\nA set of key-value pairs that represent metadata provided by the uploader."
                }
            }
        },
        'description': 'OPTIONAL\nThe list of URLs that can be used to access the Data Object.'
      },
      'description': {
        'type': 'string',
        'description': 'OPTIONAL\nA human readable description of the contents of the Data Object.'
      },
      'aliases': {
        'type': 'array',
        'items': {
          'type': 'string'
        },
        'description': "OPTIONAL\nA list of strings that can be used to find this Data Object.\nThese aliases can be used to represent the Data Object's location in\na directory (e.g. \'bucket/folder/file.name\') to make Data Objects\nmore discoverable."
      }
    }
  }




Now the same thing for a Data Bundle.

In [96]:
data_bundle = {
      "type": "object",
      "properties": {
        "id": {
          "type": "string",
          "description": "REQUIRED\nAn identifier, unique to this Data Bundle"
        },
        "data_object_ids": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "REQUIRED\nThe list of Data Objects that this Data Bundle contains."
        },
        "created": {
          "type": "string",
          "format": "date-time",
          "description": "REQUIRED\nTimestamp of object creation in RFC3339."
        },
        "updated": {
          "type": "string",
          "format": "date-time",
          "description": "REQUIRED\nTimestamp of update in RFC3339, identical to create timestamp in systems\nthat do not support updates."
        },
        "version": {
          "type": "string",
          "description": "REQUIRED\nA string representing a version, some systems may use checksum, a RFC3339\ntimestamp, or incrementing version number. For systems that do not support\nversioning please use your update timestamp as your version."
        },
        'checksums': {
            'type': 'array',
            'items': {
                  "type": "object",
                  "properties": {
                    "checksum": {
                      "type": "string",
                      "description": "REQUIRED\nThe hex-string encoded checksum for the Data."
                    },
                    "type": {
                      "type": "string",
                      "description": "OPTIONAL\nThe digest method used to create the checksum. If left unspecified md5\nwill be assumed.\n\npossible values:\nmd5                # most blob stores provide a checksum using this\nmultipart-md5      # multipart uploads provide a specialized tag in S3\nsha256\nsha512"
                    }
                  }
                },
        "description": "REQUIRED\nAt least one checksum must be provided.\nThe data bundle checksum is computed over all the checksums of the\nData Objects that bundle contains."
      },
        "description": {
          "type": "string",
          "description": "OPTIONAL\nA human readable description."
        },
        "aliases": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "OPTIONAL\nA list of strings that can be used to identify this Data Bundle."
        },
        "system_metadata": {
          "type": "object",
          "additionalProperties": True,
          "description": "OPTIONAL\nThese values are reported by the underlying object store.\nA set of key-value pairs that represent system metadata about the object."
        },
        "user_metadata": {
              "type": "object",
              "additionalProperties": True,
              "description": "OPTIONAL\nA set of key-value pairs that represent metadata provided by the uploader."
          }
        }
    
}

## Putting them together

We can now make a simple schema that combines them. It assumes that all Data Objects defined in Data Bundles are defined in the Data Objects map.

In [97]:
schema = {
    'type': 'object',
    'properties': {
        'data_objects': {
            'type': 'object',
             'additionalProperties': {'type': data_object_schema},
             'description': 'A map of Data Object Identifiers to Data Objects'
        },
        'data_bundles': {
            'type': 'object',
              "additionalProperties": data_bundle,
             'description': 'A map of Data Bundle Identifiers to Data Bundle'
        }
    },
}

## Making an instance

Now we can make an instance of our schema with the help of the DOS client.

In [98]:
from ga4gh.dos.client import Client

In [99]:
c = Client("https://dos-gdc.ucsc-cgp-dev.org")

In [100]:
client = c.client
models = c.models

In [101]:
DataBundle = models.get_model('DataBundle')
DataObject = models.get_model('DataObject')

In [102]:
my_data_bundle = DataBundle(id='test-id')
my_data_bundle.version = 'foo' # FIXME shouldn't validate
print(my_data_bundle.marshal())

{u'version': 'foo', u'id': 'test-id'}


In [103]:
instance = {
    'data_bundles': {
        'test': DataBundle(id='test-bundle', data_object_ids=['test-object']).marshal()
    },
    'data_objects': {
        'test-object': DataObject(id='test-object').marshal()
    }
}

In [104]:
import json
print(json.dumps(instance, indent=4))

{
    "data_bundles": {
        "test": {
            "id": "test-bundle", 
            "data_object_ids": [
                "test-object"
            ]
        }
    }, 
    "data_objects": {
        "test-object": {
            "id": "test-object"
        }
    }
}


## JSON Schemas validation

Let's try to do some simple validation.

In [105]:
import jsonschema

In [106]:
schema_validator = jsonschema.Draft3Validator

In [108]:
schema_validator.check_schema(schema)

SchemaError: {'type': {'type': 'object', 'properties': {'updated': {'type': 'string', 'description': 'OPTIONAL\nTimestamp of update in RFC3339, identical to create timestamp in systems\nthat do not support updates.', 'format': 'date-time'}, 'description': {'type': 'string', 'description': 'OPTIONAL\nA human readable description of the contents of the Data Object.'}, 'checksums': {'items': {'type': 'object', 'properties': {'checksum': {'type': 'string', 'description': 'REQUIRED\nThe hex-string encoded checksum for the Data.'}, 'type': {'type': 'string', 'description': 'OPTIONAL\nThe digest method used to create the checksum. If left unspecified md5\nwill be assumed.\n\npossible values:\nmd5                # most blob stores provide a checksum using this\nmultipart-md5      # multipart uploads provide a specialized tag in S3\nsha256\nsha512'}}}, 'type': 'array', 'description': 'REQUIRED\nThe checksum of the Data Object. At least one checksum must be provided.'}, 'id': {'type': 'string', 'description': 'REQUIRED\nAn identifier unique to this Data Object.'}, 'aliases': {'items': {'type': 'string'}, 'type': 'array', 'description': "OPTIONAL\nA list of strings that can be used to find this Data Object.\nThese aliases can be used to represent the Data Object's location in\na directory (e.g. 'bucket/folder/file.name') to make Data Objects\nmore discoverable."}, 'name': {'type': 'string', 'description': 'OPTIONAL\nA string that can be optionally used to name a Data Object.'}, 'created': {'type': 'string', 'description': 'REQUIRED\nTimestamp of object creation in RFC3339.', 'format': 'date-time'}, 'version': {'type': 'string', 'description': 'OPTIONAL\nA string representing a version.'}, 'urls': {'items': {'type': 'object', 'properties': {'url': {'type': 'string', 'description': 'REQUIRED\nA URL that can be used to access the file.'}, 'system_metadata': {'additionalProperties': True, 'type': 'object', 'description': 'OPTIONAL\nThese values are reported by the underlying object store.\nA set of key-value pairs that represent system metadata about the object.'}, 'user_metadata': {'additionalProperties': True, 'type': 'object', 'description': 'OPTIONAL\nA set of key-value pairs that represent metadata provided by the uploader.'}}}, 'type': 'array', 'description': 'OPTIONAL\nThe list of URLs that can be used to access the Data Object.'}, 'size': {'type': 'string', 'description': 'REQUIRED\nThe computed size in bytes.', 'format': 'int64'}, 'mime_type': {'type': 'string', 'description': "OPTIONAL\nA string providing the mime-type of the Data Object.\nFor example, 'application/json'."}}}} is not of type {u'$ref': u'#'}, u'boolean'

Failed validating u'type' in schema[u'properties'][u'properties'][u'additionalProperties'][u'properties'][u'additionalProperties']:
    {u'default': {}, u'type': [{u'$ref': u'#'}, u'boolean']}

On instance[u'properties']['data_objects'][u'additionalProperties']:
    {'type': {'properties': {'aliases': {'description': "OPTIONAL\nA list of strings that can be used to find this Data Object.\nThese aliases can be used to represent the Data Object's location in\na directory (e.g. 'bucket/folder/file.name') to make Data Objects\nmore discoverable.",
                                         'items': {'type': 'string'},
                                         'type': 'array'},
                             'checksums': {'description': 'REQUIRED\nThe checksum of the Data Object. At least one checksum must be provided.',
                                           'items': {'properties': {'checksum': {'description': 'REQUIRED\nThe hex-string encoded checksum for the Data.',
                                                                                 'type': 'string'},
                                                                    'type': {'description': 'OPTIONAL\nThe digest method used to create the checksum. If left unspecified md5\nwill be assumed.\n\npossible values:\nmd5                # most blob stores provide a checksum using this\nmultipart-md5      # multipart uploads provide a specialized tag in S3\nsha256\nsha512',
                                                                             'type': 'string'}},
                                                     'type': 'object'},
                                           'type': 'array'},
                             'created': {'description': 'REQUIRED\nTimestamp of object creation in RFC3339.',
                                         'format': 'date-time',
                                         'type': 'string'},
                             'description': {'description': 'OPTIONAL\nA human readable description of the contents of the Data Object.',
                                             'type': 'string'},
                             'id': {'description': 'REQUIRED\nAn identifier unique to this Data Object.',
                                    'type': 'string'},
                             'mime_type': {'description': "OPTIONAL\nA string providing the mime-type of the Data Object.\nFor example, 'application/json'.",
                                           'type': 'string'},
                             'name': {'description': 'OPTIONAL\nA string that can be optionally used to name a Data Object.',
                                      'type': 'string'},
                             'size': {'description': 'REQUIRED\nThe computed size in bytes.',
                                      'format': 'int64',
                                      'type': 'string'},
                             'updated': {'description': 'OPTIONAL\nTimestamp of update in RFC3339, identical to create timestamp in systems\nthat do not support updates.',
                                         'format': 'date-time',
                                         'type': 'string'},
                             'urls': {'description': 'OPTIONAL\nThe list of URLs that can be used to access the Data Object.',
                                      'items': {'properties': {'system_metadata': {'additionalProperties': True,
                                                                                   'description': 'OPTIONAL\nThese values are reported by the underlying object store.\nA set of key-value pairs that represent system metadata about the object.',
                                                                                   'type': 'object'},
                                                               'url': {'description': 'REQUIRED\nA URL that can be used to access the file.',
                                                                       'type': 'string'},
                                                               'user_metadata': {'additionalProperties': True,
                                                                                 'description': 'OPTIONAL\nA set of key-value pairs that represent metadata provided by the uploader.',
                                                                                 'type': 'object'}},
                                                'type': 'object'},
                                      'type': 'array'},
                             'version': {'description': 'OPTIONAL\nA string representing a version.',
                                         'type': 'string'}},
              'type': 'object'}}

In [None]:
instance_validator = jsonschema.Draft3Validator(schema)

In [81]:
instance_validator.validate(instance)

NameError: name 'instance_validator' is not defined